程序代写代做代考 matlab In [2]:

In [2]:

import scipy.io as sio
import pandas as pd
import numpy as np

In [3]:

arr = sio.loadmat(‘/Users/vagrant/tasks-2017/wangwang-pingdan_21/matlab-1600/a
ssignment1/asg1-2017/DataA.mat’)

In [4]:

fea = arr[‘fea’]

In [5]:

fea.shape

In [6]:

fea

In [7]:

df = pd.DataFrame(fea);

In [8]:

df.to_csv(‘fea.csv’)

In [9]:

df[df.isnull().any(axis=1)].shape

Out[5]:

(19000, 81)

Out[6]:

array([[-153., 414., 939., …, -29., 36., 24.],
[-150., 420., 939., …, -31., 47., 3.],
[-160., 432., 941., …, -38., 20., 0.],
…,
[ nan, nan, nan, …, nan, nan, nan],
[ nan, nan, nan, …, nan, nan, nan],
[ nan, nan, nan, …, nan, nan, nan]])

Out[9]:

(19000, 81)

In [29]:

df[df.isnull().all(axis=1)].shape

I. I. Data Cleaning and Preprocessing (for dataset A)

1
The dataset has 19000 rows and 81 columns. All rows have at least 1 missing value. There are 773 rows
whose 81 values are all missing.

2
I remove the 773 rows whose 81 values are all missing and fill other missing value with the mean value of
that column.

In [28]:

dfAfterRemoveEmptyRows = df.ix[~(df.isnull().all(axis=1))]
processed = dfAfterRemoveEmptyRows.fillna(df.mean())
processed

Out[29]:

(773, 81)

Out[28]:

0 1 2 3 4 5

0 -153.000000 414.000000 939.000000 -161.000000 1007.000000 99.000000

1 -150.000000 420.000000 939.000000 -177.000000 1008.000000 103.000000

2 -160.000000 432.000000 941.000000 -162.000000 982.000000 98.000000

3 -171.000000 432.000000 911.000000 -174.000000 999.000000 115.000000

4 -171.000000 698.264485 929.000000 -189.000000 1004.000000 104.000000

5 -171.000000 432.000000 924.000000 -179.000000 1011.000000 85.000000

6 -169.000000 429.000000 949.000000 -175.000000 1007.000000 102.000000

7 -160.000000 423.000000 927.000000 -195.000000 996.000000 123.000000

8 -163.000000 432.000000 929.000000 -178.000000 994.000000 101.000000

9 -156.000000 415.000000 936.000000 -186.000000 1014.000000 111.000000

10 -153.000000 413.000000 923.000000 -187.000000 993.000000 91.000000

11 -168.000000 412.000000 904.000000 -194.000000 989.000000 115.000000

12 -166.000000 442.000000 926.000000 -191.000000 1001.000000 114.000000

13 -162.000000 447.000000 920.000000 -218.000000 1000.000000 110.000000

14 -184.000000 442.000000 941.000000 -237.000000 992.000000 144.000000

15 -157.000000 427.000000 925.000000 -245.000000 986.000000 127.000000

16 -158.000000 427.000000 905.000000 -218.000000 990.000000 111.000000

17 -153.000000 451.000000 889.000000 -260.000000 967.000000 112.000000

18 -150.000000 443.000000 928.000000 -243.000000 968.000000 130.000000

19 -151.000000 442.000000 930.000000 -260.000000 991.000000 92.000000

20 -153.000000 462.000000 940.000000 -253.000000 977.000000 135.000000

21 -170.000000 459.000000 927.000000 -247.000000 959.000000 133.000000

22 -148.000000 453.000000 923.000000 -274.000000 992.000000 142.000000

23 -154.000000 449.000000 928.000000 -274.000000 948.000000 115.000000

24 -165.000000 462.000000 918.000000 -302.000000 974.000000 129.000000

25 -187.000000 479.000000 926.000000 -304.000000 966.000000 122.000000

26 -169.000000 473.000000 947.000000 -305.000000 965.000000 123.000000

27 -172.000000 482.000000 941.000000 -314.000000 946.000000 163.000000

28 -203.000000 486.000000 917.000000 -314.000000 972.000000 145.000000

29 -200.000000 491.000000 898.000000 -312.000000 978.000000 123.000000

… … … … … … …

18197 -132.812384 698.264485 597.541402 -102.000000 967.000000 73.000000

18198 -132.812384 698.264485 597.541402 -106.000000 1130.000000 85.000000

18199 -132.812384 698.264485 597.541402 0.000000 0.000000 0.000000

18200 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18201 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18202 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18203 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18204 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18205 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18206 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18207 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18208 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18209 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18210 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18211 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18212 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18213 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

In [48]:

from sklearn import preprocessing
minMaxScaler = preprocessing.MinMaxScaler()
minMaxScaledDf = pd.DataFrame(minMaxScaler.fit_transform(processed))

zscoreScaler = preprocessing.StandardScaler()
zscoreDf = pd.DataFrame(zscoreScaler.fit_transform(processed))

zscoreDf = pd.DataFrame(preprocessing.scale(processed))

minMaxScaledDf

18214 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18215 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18216 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18217 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18218 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18219 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18220 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18221 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18222 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18223 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18224 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18225 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18226 -132.812384 698.264485 597.541402 -307.128462 909.548077 -32.760824

18227 rows × 81 columns

In [51]:

import matplotlib.pyplot as plt

for featureInd in (9,24):
# featureInd = 9
processed.ix[:, featureInd – 1].hist()
plt.title(‘Feature %d histogram before normalization’ % featureInd)
plt.show()

plt.figure()
minMaxScaledDf.ix[:, featureInd – 1].hist()
plt.title(‘Feature %d histogram after min-max normalization’ % featureInd)
plt.show()

plt.figure()
zscoreDf.ix[:, featureInd – 1].hist()
plt.title(‘Feature %d histogram after z-score normalization’ % featureInd)
plt.show()

In [61]:

from pandas.tools.plotting import autocorrelation_plot
for featureInd in (9,24):
autocorrelation_plot(processed.ix[:, featureInd-1])
plt.title(‘Feature %d auto-correlation plot before normalization’ % featur
eInd)
plt.show()

autocorrelation_plot( minMaxScaledDf.ix[:, featureInd – 1])
plt.title(‘Feature %d auto-correlation plot after min-max normalization’ %
featureInd)
plt.show()

autocorrelation_plot( zscoreDf.ix[:, featureInd – 1])
plt.title(‘Feature %d auto-correlation plot after z-score normalization’ %
featureInd)
plt.show()

II. Feature Extraction (for dataset B)

1. Use PCA as a dimensionality reduction technique to the
data, compute the eigenvectors and eigenvalues.

In [65]:

arr = sio.loadmat(‘/Users/vagrant/tasks-2017/wangwang-pingdan_21/matlab-1600/a
ssignment1/asg1-2017/DataB.mat’)
arr

In [66]:

feaDf = pd.DataFrame(arr[‘fea’]);
gndDf = pd.DataFrame(arr[‘gnd’]);

In [88]:

from sklearn.decomposition import PCA
pca = PCA()
trans = pca.fit(feaDf).transform(feaDf)
trans = pd.DataFrame(trans)
trans

Out[65]:

{‘__globals__’: [],
‘__header__’: ‘MATLAB 5.0 MAT-file, Platform: PCWIN64, Created on
: Wed Sep 24 09:53:02 2014’,
‘__version__’: ‘1.0’,
‘fea’: array([[4, 4, 3, …, 1, 4, 5],
[5, 1, 4, …, 3, 5, 4],
[1, 3, 0, …, 1, 2, 4],
…,
[2, 3, 2, …, 1, 1, 3],
[5, 2, 4, …, 4, 3, 4],
[3, 3, 1, …, 1, 3, 1]], dtype=uint8),
‘gnd’: array([[0],
[0],
[0],
…,
[4],
[4],
[4]], dtype=uint8)}

Out[88]:

0 1 2 3 4 5

0 -1069.166304 -513.973184 -139.243261 878.387704 387.873484 -335.304982

1 -1099.176077 -570.842223 -67.311779 839.381070 345.573249 -530.737220

2 -673.201385 -167.377150 480.988638 83.823068 1036.833666 76.531663

3 -1010.903339 -187.044145 506.352247 426.446929 901.897549 73.661148

4 -1692.970822 -633.369398 -521.943052 367.356716 -6.919257 -601.851221

5 -1341.694310 -536.770010 -578.489504 246.425866 -577.933576 140.191582

6 -1217.832623 -521.312900 -116.355397 240.552050 680.648563 -501.999433

7 -226.761138 -457.324861 -284.321899 128.615747 -1017.001492 -87.023369

8 -1219.222302 -479.274443 8.018715 -521.169429 722.007449 195.983302

9 -900.753262 -548.126694 30.258637 46.963719 761.675828 -435.396163

10 -1119.444673 -430.540486 -4.091402 150.580695 1045.936651 -171.263259

11 -1132.061141 -688.199951 -534.771525 -8.874299 -914.504627 85.645282

12 -1283.397863 -396.151025 187.789686 40.171936 787.148009 63.436551

13 -431.797862 -454.763166 -364.204768 166.199142 -1016.920930 280.651043

14 -1368.131762 -641.515614 -732.070197 619.218875 -244.318899 -49.976603

15 -784.250675 -419.946331 -589.587363 470.526795 -85.742394 -123.237859

16 -1455.821769 -519.039116 109.383460 961.047935 647.104030 -564.693429

17 -1406.742684 -1010.360551 -236.296620 302.702769 -1052.810187 99.689728

18 -809.569532 -355.676642 -553.061747 739.156447 -342.576671 -298.188245

19 -1009.720480 -500.758620 148.698733 -206.740212 942.652635 179.051437

20 -1256.737747 -136.282075 4.767105 401.171633 111.170531 20.126463

21 -1097.738331 -791.366616 -662.612802 416.147079 -787.374203 -393.630293

22 -1237.245760 -1011.960330 -384.654272 69.297448 -1141.412452 115.369106

23 -337.006350 -166.084273 -519.665873 130.820076 -403.671160 -213.609312

24 -622.921863 -177.985457 -414.160463 -401.750020 2.516656 192.630114

25 -1058.479796 -679.243567 -567.416761 49.160913 -884.302279 17.978966

26 -170.420127 205.716202 -134.305883 -28.439419 90.064382 81.864711

27 -937.063555 -120.700099 -159.267905 29.237288 -114.534026 -366.615037

28 -232.650631 -283.345693 -543.440591 257.837187 -835.285216 -363.820625

29 -899.559652 -780.081817 -483.711259 169.414276 -1077.021039 151.471500

… … … … … … …

2036 224.500021 717.688537 -195.952977 -499.323792 -267.286036 -368.337387

2037 -435.315514 1060.086518 -252.240653 -784.207808 17.898867 -426.553471

2038 238.402523 821.431429 -296.668932 72.066998 302.033654 -169.441235

2039 479.799398 304.242620 -530.251819 -263.518464 -18.481148 -269.480921

2040 -85.398938 899.484460 -359.165698 -478.939958 30.973590 -421.339300

2041 -975.896906 891.969722 -658.015458 -273.613739 -529.811171 -94.804905

2042 -626.495241 838.788408 -639.081008 212.654848 -340.914705 80.270632

2043 258.028617 579.227378 -335.781308 -127.985693 149.696587 -147.257143

2044 252.991333 361.106673 -45.994668 -275.875656 123.818071 46.475332

2045 -43.933473 596.114295 -360.973687 -150.061260 152.710758 44.882070

In [78]:

feaDf[gndDf.ix[:,0] == 1]
gndDf.max()

2046 8.716199 710.432025 -23.709776 -105.468206 -499.729993 -119.021552

2047 432.475888 548.321160 -174.616090 99.150459 -3.700452 -295.330003

2048 -164.177255 959.226629 -239.753319 -546.282908 -434.398609 -269.636934

2049 19.806668 944.896452 -408.627564 -141.635179 282.084948 -27.051497

2050 260.650271 726.328779 -374.089981 326.390047 -251.397107 -172.193151

2051 337.640613 568.111744 -358.456523 -207.139564 -262.826142 -492.078627

2052 111.635685 782.005670 -337.485075 -404.216222 158.940143 -295.307423

2053 575.789733 321.964107 193.327055 -4.403837 546.809983 104.206169

2054 -56.731706 983.452324 -101.364217 -575.686798 191.904316 -464.280977

2055 391.142965 529.109716 -360.665581 34.201602 227.983142 -82.175911

2056 222.286099 790.011945 -279.594895 -583.068983 -311.732657 -212.255003

2057 432.180822 776.933460 49.309403 -306.113951 153.785454 -115.321890

2058 -141.001824 566.194886 -347.955180 69.169126 -373.710178 241.262334

2059 -467.623465 718.782394 -152.832548 -135.848772 344.854140 76.905198

2060 -514.275826 1049.482202 -256.796813 -906.542691 -121.841814 -276.421567

2061 24.355662 742.490057 -467.186538 -255.047804 -157.302478 -239.561076

2062 -48.768593 734.458335 -334.353122 -496.721083 117.983630 -467.477372

2063 -131.021601 866.607035 -397.861565 -248.089962 45.492451 -93.904547

2064 262.141229 652.777351 -347.602739 72.427962 -80.070774 -164.182531

2065 480.891094 432.743142 18.124027 -364.053056 502.566777 428.276701

2066 rows × 784 columns

Out[78]:

0 4
dtype: uint8

In [89]:

colors = [‘navy’, ‘turquoise’, ‘darkorange’, ‘red’, ‘green’]
classes = [0, 1, 2, 3, 4]
classesStr = [‘class ‘ + str(i) for i in classes]
for color, i, class_name in zip(colors, classes, classesStr):
t = trans[gndDf.ix[:,0] == i]
plt.scatter(t.ix[:,0], t.ix[:,1], color=color, alpha=.8,
label=class_name)

plt.legend(loc=’best’, shadow=False, scatterpoints=1)
plt.title(‘2 dimensional representation with first and second principal compon
ents’)
plt.show()

In [90]:

for color, i, class_name in zip(colors, classes, classesStr):
t = trans[gndDf.ix[:,0] == i]
plt.scatter(t.ix[:,4], t.ix[:,5], color=color, alpha=.8,
label=class_name)

plt.legend(loc=’best’, shadow=False, scatterpoints=1)
plt.title(‘2 dimensional representation with first and second principal compon
ents’)
plt.show()

In [101]:

from sklearn.naive_bayes import GaussianNB

class_errors = []
retainedVars = []
for n in [2, 4, 10, 30, 60, 200, 500,784]:
gnb = GaussianNB()
y_pred = gnb.fit(trans.ix[:,:n], gndDf.ix[:,0]).predict(trans.ix[:,:n])
class_error = sum(y_pred != gndDf.ix[:,0])*1.0 / y_pred.shape[0]
class_errors.append(class_error)
retainedVars.append(sum(pca.explained_variance_[:n]) / sum(pca.explained_v
ariance_))

plt.plot(retainedVars, class_errors)
plt.xlabel(‘retained variance’)
plt.ylabel(‘classication error’)
plt.title(‘Naive Bayes Classification Error’)
plt.show()

In [102]:

In [103]:

In [ ]:

Out[102]:

[0.17328170377541141,
0.10067763794772508,
0.0701839303000968,
0.059535333978702809,
0.047918683446272994,
0.059535333978702809,
0.19070667957405615,
0.23233301064859632]

Out[103]:

[0.2208652726402634,
0.34489263092230382,
0.54222178827782175,
0.76145556146398985,
0.87275854960645693,
0.97511626228772696,
0.99975130751258978,
1.0]