ml-ass2
In [14]:
import scipy.io as sio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [15]:
arr = sio.loadmat(‘DataD.mat’)
fea = arr[‘fea’]
gnd = arr[‘gnd’]
gnd = gnd[:,0]
1)¶
In [16]:
from sklearn import preprocessing
zscoreScaler = preprocessing.StandardScaler()
normalizedFea = pd.DataFrame(zscoreScaler.fit_transform(fea))
In [17]:
halfNum = normalizedFea.shape[0]//2
trainFea = normalizedFea[:halfNum]
testFea = normalizedFea[halfNum+1:]
trainGnd = gnd[:halfNum]
testGnd = gnd[halfNum+1:]
2)¶
In [18]:
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
nc = KNeighborsClassifier()
In [19]:
ks = np.arange(1, 32, 2)
accuracies = []
for k in ks:
nc.n_neighbors = k
accu = cross_val_score(nc, trainFea, trainGnd, cv=5, n_jobs=1, scoring = ‘accuracy’)
accuracies.append(np.mean(accu))
In [20]:
plt.plot(ks, accuracies)
plt.xlabel(‘k’)
plt.ylabel(‘5 fold mean accuracy’)
plt.title(‘K vs Accuracy’)
Out[20]:
In [21]:
ind = np.argmax(accuracies)
print(ks[ind])
print(accuracies[ind])
11
0.712784392615
In [22]:
nc.n_neighbors = ks[ind]
nc.fit(trainFea, trainGnd)
predict = nc.predict(testFea)
accu = np.sum(predict == testGnd)*1.0 / testGnd.shape[0]
print(accu)
0.712465878071
Answer¶
The relationship between the accuracy and the parameter k is shown in above figure.
The best K found is 11 with 5-fold mean accuracy 0.712784392615. The accuracy on testing data is 0.712465878071.
3)¶
In [13]:
from sklearn.svm import SVC
cs = [0.1, 0.5, 1, 2, 5, 10, 20, 50]
sigmas = [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]
accuracies = []
for c in cs:
for sigma in sigmas:
# gamma=-1/(2*sigma*sigma)
accu = cross_val_score(SVC(C=c, gamma=1.0/(2*sigma*sigma)), trainFea, trainGnd, cv=5,
n_jobs=1, scoring = ‘accuracy’)
accuracies.append(np.mean(accu))
In [11]:
paras = []
for c in cs:
for sigma in sigmas:
paras.append((c, sigma))
ind = np.argmax(accuracies)
print(paras[ind])
print(accuracies[ind])
(2, 5)
0.890944534917
In [12]:
from sklearn.metrics import roc_curve, auc
from scipy import interp
def cross_validation_AUC(trainX, trainY, numFold, classifier, name):
plt.figure()
# plt.clf()
kfold = KFold(trainY.shape[0], n_folds=numFold)
true_positive_rate_mean = 0.0
false_positive_rate_mean = np.linspace(0, 1, 100)
for i, (train_part, validate_part) in enumerate(kfold):
# print(train_part)
predictProbability = classifier.fit(trainX[train_part, :], trainY[train_part]).predict_proba(trainX[validate_part,:])
false_positive_rate, true_positive_rate, thresholds = roc_curve(trainY[validate_part], predictProbability[:, 1])
true_positive_rate_mean += interp(false_positive_rate_mean, false_positive_rate, true_positive_rate)
true_positive_rate_mean[0] = 0.0
area_under_curve = auc(false_positive_rate, true_positive_rate)
plt.plot(false_positive_rate, true_positive_rate, lw=1, label=’Fold %d ROC (AUC = %0.3f)’ % (i, area_under_curve))
plt.plot([0, 1], [0, 1], ‘–‘, color=(0.3, 0.3, 0.3))
true_positive_rate_mean /= len(kfold)
true_positive_rate_mean[-1] = 1.0
mean_auc = auc(false_positive_rate_mean, true_positive_rate_mean)
plt.plot(false_positive_rate_mean, true_positive_rate_mean, ‘k–‘,
label=’Mean ROC (AUC = %0.3f)’ % mean_auc, lw=2)
extra = 0.03
plt.xlim([-extra, 1 + extra])
plt.ylim([-extra, 1 + extra])
plt.xlabel(‘False Positive Rate’)
plt.ylabel(‘True Positive Rate’)
plt.title(‘Receiver operating characteristic for ‘ + name)
plt.legend(loc=”lower right”)
plt.show()
In [13]:
cross_validation_AUC(trainFea.values, trainGnd, 5, SVC(C=paras[ind][0], gamma=1.0/(2*paras[ind][1]*paras[ind][1]), probability=True), ‘SVM’)
Answer¶
The best parameter found is C=2 and sigma = 5 which as the highest average 5-fold accuracy 0.890944534917.
The ROC curves are shown in above figure.
4)¶
a)¶
In [14]:
nc = KNeighborsClassifier()
nc.n_neighbors = 11
nc.fit(trainFea, trainGnd)
predict = nc.predict(testFea)
accu = np.sum(predict == testGnd)*1.0 / testGnd.shape[0]
print(accu)
0.712465878071
In [15]:
svm = SVC(C=2, gamma=1.0/(2*5*5))
svm.fit(trainFea, trainGnd)
predict = svm.predict(testFea)
accu = np.sum(predict == testGnd)*1.0 / testGnd.shape[0]
print(accu)
0.902638762511
In [16]:
from sklearn import tree
decisionTree = tree.DecisionTreeClassifier()
decisionTree.fit(trainFea, trainGnd)
predict = decisionTree.predict(testFea)
accu = np.sum(predict == testGnd)*1.0 / testGnd.shape[0]
print(accu)
0.914467697907
In [17]:
from sklearn import ensemble
rndForest = ensemble.RandomForestClassifier()
rndForest.fit(trainFea, trainGnd)
predict = rndForest.predict(testFea)
accu = np.sum(predict == testGnd)*1.0 / testGnd.shape[0]
print(accu)
0.943585077343
In [18]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier(hidden_layer_sizes=(10))
nn.fit(trainFea, trainGnd)
predict = nn.predict(testFea)
accu = np.sum(predict == testGnd)*1.0 / testGnd.shape[0]
print(accu)
0.884440400364
/Users/vagrant/anaconda42/anaconda/lib/python2.7/site-packages/sklearn/neural_network/multilayer_perceptron.py:563: ConvergenceWarning: Stochastic Optimizer: Maximum iterations reached and the optimization hasn’t converged yet.
% (), ConvergenceWarning)
b)¶
In [19]:
rndf6 = ensemble.RandomForestClassifier(n_estimators=50)
rndf6.fit(trainFea, trainGnd)
predict = rndf6.predict(testFea)
accu = np.sum(predict == testGnd)*1.0 / testGnd.shape[0]
print(accu)
0.968152866242
C)¶
In [20]:
import time
import random
def experimentOne(classifier):
t = list(range(normalizedFea.shape[0]))
random.shuffle(t)
shuffledFea = normalizedFea.values[t, :]
shuffledGnd = gnd[t]
trainFea = shuffledFea[:halfNum]
testFea = shuffledFea[halfNum+1:]
trainGnd = shuffledGnd[:halfNum]
testGnd = shuffledGnd[halfNum+1:]
trainStart = time.time()
classifier.fit(trainFea, trainGnd)
trainTime = time.time() – trainStart
predictStart = time.time()
predict = classifier.predict(testFea)
classificationTime = time.time() – predictStart
accu = np.sum(predict == testGnd)*1.0 / testGnd.shape[0]
tp = np.sum(np.logical_and(predict == 1, testGnd == 1))
fn = np.sum(np.logical_and(predict == -1, testGnd == 1))
fp = np.sum(np.logical_and(predict == 1, testGnd == -1))
tn = np.sum(np.logical_and(predict == -1, testGnd == -1))
precision = tp*1.0 / (tp + fp)
recall = tp*1.0 / (tp + fn)
f_measure = 2*precision*recall / (precision + recall)
return np.array([accu, precision, recall, f_measure, trainTime,
classificationTime])
def experimentAll(classifier):
res = np.zeros((20, 6))
for i in range(20):
res[i, :] = experimentOne(classifier)
m = np.mean(res, axis = 0)
std = np.std(res, axis = 0)
return pd.DataFrame([m, std],
columns = [‘accuracy’, ‘precision’, ‘recall’, ‘F-Measure’, ‘training time’, ‘classification time’], index = [‘mean’, ‘std’])
In [21]:
experimentAll(KNeighborsClassifier(n_neighbors = 11))
Out[21]:
accuracy precision recall F-Measure training time classification time
mean 0.739536 0.936996 0.535390 0.679936 0.001589 0.089260
std 0.019686 0.018847 0.043172 0.032322 0.000174 0.000873
In [22]:
experimentAll(SVC(C=2, gamma=1.0/(2*5*5)))
Out[22]:
accuracy precision recall F-Measure training time classification time
mean 0.895496 0.908392 0.887719 0.897805 0.077989 0.045281
std 0.008474 0.014998 0.012223 0.008271 0.001504 0.002458
In [23]:
experimentAll(decisionTree)
Out[23]:
accuracy precision recall F-Measure training time classification time
mean 0.919654 0.929093 0.915528 0.922197 0.005601 0.000217
std 0.010510 0.013250 0.012537 0.010369 0.000626 0.000047
In [24]:
experimentAll(rndForest)
Out[24]:
accuracy precision recall F-Measure training time classification time
mean 0.920746 0.955454 0.888135 0.920390 0.038334 0.006132
std 0.011214 0.014407 0.019153 0.011621 0.001282 0.000292
In [25]:
experimentAll(nn)
Out[25]:
accuracy precision recall F-Measure training time classification time
mean 0.868881 0.888232 0.853271 0.870251 0.395548 0.000305
std 0.010322 0.015082 0.015134 0.009927 0.045165 0.000044
In [26]:
experimentAll(rndf6)
Out[26]:
accuracy precision recall F-Measure training time classification time
mean 0.957871 0.977050 0.940780 0.958518 0.196943 0.030652
std 0.006357 0.007779 0.010934 0.006292 0.009711 0.001895
5)¶
From the results of 4), we can see that the sixth classifier using random forest with n_estimators=50 has the highest accuracy, precision, recall and F-Measure.
The benefit of KNN is that it is simple, weaknesses of it is that it has relative longer testing time, has poor performance for this data set and also has large standard deviation for accuracy.
The benefit of SVM is that it has fast testing time, weaknesses of it is that it has relative longer training time and its performance is not very good for this data set.
The benefit of decison tree is that it has a relative good performance for this data set and also has fast training and testing time. Weaknesses of it is that it doesn’t have the best performance.
The benefit of neural network is that it can train a very complex model and has amazing learning ability. The weakness is that it takes much longer time to train and need much larger data to train. It is also triky to adjust the parameters.
The benefit of random forest is that it is good at reducing overfitting and has a very good performance. The weakness of it is that it may take a little longer to train.
From above analysis, the best method is to use random forest classifier for a dataset of this type in the future.
In [ ]: