In [1]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from sklearn import preprocessing
In [2]:
### Read ParkinsonsDiseaseBio_DATA_2016-01-21_2132_MoCA_HDRS_UPDRS.csv
df = pd.read_csv(“ParkinsonsDiseaseBio_DATA_2016-01-21_2132_MoCA_HDRS_UPDRS.csv”)
### precess empty value, fill empty with mean
df = df.fillna(df.mean())
df
Out[2]:
study_id
redcap_event_name
moca_able
moca_visuospatial_exec
moca_naming
moca_list_of_digits
moca_list_of_letters
moca_serial_7_subtraction
moca_repeat
moca_fluency
…
hdrs_genital_sx
hdrs_hypochondriasis
hdrs_loss_of_weight
hdrs_insight
hdrs_total_score_calc
hamilton_depression_rating_scale_complete
upsit_able
upsit_score
upsit_percentile
university_of_pennsylvania_smell_identification_test_complete
0
1
baseline_visit_arm_1
0.958537
5.000000
3.000000
2.000000
1.000000
3.000000
2.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
1.000000
2
0.910256
33.000000
34.000000
2.00
1
1
6_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
2
0.910256
24.673428
28.981744
1.74
2
1
12_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
0.00
3
2
baseline_visit_arm_1
0.958537
5.000000
3.000000
2.000000
1.000000
3.000000
1.000000
1.000000
…
1.000000
0.000000
0.000000
0.000000
1.000000
2
0.910256
37.000000
70.000000
2.00
4
2
6_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
2
0.910256
24.673428
28.981744
1.74
5
2
12_month_followup_arm_1
0.958537
5.000000
3.000000
2.000000
1.000000
3.000000
1.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
0.000000
2
0.910256
35.000000
42.000000
2.00
6
2
18_month_followup_arm_1
0.958537
5.000000
3.000000
2.000000
1.000000
3.000000
2.000000
1.000000
…
1.000000
0.000000
0.000000
0.000000
2.000000
2
0.910256
24.673428
28.981744
1.74
7
2
24_month_followup_arm_1
0.958537
4.000000
3.000000
1.000000
1.000000
3.000000
1.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
0.000000
2
0.910256
30.000000
27.000000
2.00
8
2
30_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.000000
0.000000
0.000000
0.000000
0.000000
2
0.910256
24.673428
28.981744
1.74
9
2
36_month_followup_arm_1
0.958537
5.000000
3.000000
2.000000
1.000000
3.000000
1.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
0.000000
2
0.910256
32.000000
23.000000
2.00
10
3
baseline_visit_arm_1
0.958537
5.000000
3.000000
2.000000
1.000000
3.000000
2.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
0.000000
2
0.910256
34.000000
23.000000
2.00
11
3
6_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
2
0.910256
24.673428
28.981744
1.74
12
3
12_month_followup_arm_1
0.958537
4.000000
3.000000
2.000000
1.000000
3.000000
2.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
2.000000
2
0.910256
37.000000
51.000000
2.00
13
3
18_month_followup_arm_1
0.958537
4.000000
3.000000
2.000000
1.000000
2.000000
2.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
0.000000
2
0.910256
24.673428
28.981744
1.74
14
3
24_month_followup_arm_1
0.958537
4.000000
3.000000
2.000000
1.000000
3.000000
2.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
0.000000
2
0.910256
33.000000
18.000000
2.00
15
3
30_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.000000
0.000000
0.000000
0.000000
0.000000
2
0.910256
24.673428
28.981744
1.74
16
3
36_month_followup_arm_1
0.958537
5.000000
2.000000
2.000000
1.000000
3.000000
2.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
0.000000
2
0.910256
33.000000
21.000000
2.00
17
4
baseline_visit_arm_1
0.000000
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
2
0.000000
24.673428
28.981744
2.00
18
4
6_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
1.74
19
5
baseline_visit_arm_1
1.000000
3.000000
3.000000
2.000000
1.000000
3.000000
1.000000
1.000000
…
0.000000
3.000000
0.000000
0.000000
11.000000
2
1.000000
10.000000
0.000000
2.00
20
5
6_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
2
0.910256
24.673428
28.981744
1.74
21
5
12_month_followup_arm_1
1.000000
0.000000
2.000000
1.000000
0.000000
1.000000
1.000000
0.000000
…
0.000000
1.000000
0.000000
0.000000
9.000000
2
1.000000
16.000000
10.000000
2.00
22
5
18_month_followup_arm_1
1.000000
1.000000
3.000000
1.000000
0.000000
2.000000
2.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
4.000000
2
0.910256
24.673428
28.981744
1.74
23
5
24_month_followup_arm_1
1.000000
1.000000
2.000000
2.000000
1.000000
3.000000
1.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
8.000000
2
1.000000
15.000000
8.000000
2.00
24
5
30_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.000000
0.000000
1.000000
0.000000
11.000000
2
0.910256
24.673428
28.981744
1.74
25
5
36_month_followup_arm_1
1.000000
2.000000
3.000000
2.000000
1.000000
0.000000
0.000000
1.000000
…
0.000000
0.000000
1.000000
0.000000
9.000000
2
1.000000
12.000000
6.000000
2.00
26
6
baseline_visit_arm_1
1.000000
3.000000
3.000000
2.000000
1.000000
3.000000
1.000000
1.000000
…
1.000000
0.000000
2.000000
0.000000
10.000000
2
1.000000
14.000000
0.000000
2.00
27
6
6_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
1.74
28
6
12_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
0.00
29
6
18_month_followup_arm_1
1.000000
1.000000
3.000000
2.000000
1.000000
2.000000
2.000000
1.000000
…
2.000000
0.000000
1.000000
0.000000
7.000000
2
0.910256
24.673428
28.981744
1.74
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
1082
246
24_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
0.00
1083
246
30_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
1.74
1084
247
baseline_visit_arm_1
0.958537
5.000000
2.000000
2.000000
1.000000
3.000000
1.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
1.000000
2
0.910256
21.000000
40.000000
2.00
1085
248
baseline_visit_arm_1
1.000000
1.000000
3.000000
2.000000
1.000000
3.000000
2.000000
0.000000
…
0.000000
0.000000
0.000000
0.000000
9.000000
2
1.000000
13.000000
5.000000
2.00
1086
248
6_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
1.74
1087
248
12_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
0.00
1088
248
24_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
0.00
1089
248
30_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
1.74
1090
249
baseline_visit_arm_1
0.958537
2.000000
2.000000
2.000000
0.000000
2.000000
1.000000
0.000000
…
0.000000
0.000000
0.000000
0.000000
3.000000
2
0.910256
29.000000
38.000000
2.00
1091
250
baseline_visit_arm_1
0.958537
5.000000
3.000000
2.000000
1.000000
3.000000
2.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
0.000000
2
0.910256
29.000000
34.000000
2.00
1092
251
baseline_visit_arm_1
0.958537
4.000000
3.000000
2.000000
1.000000
3.000000
0.000000
1.000000
…
0.000000
0.000000
1.000000
0.000000
7.000000
2
0.910256
30.000000
69.000000
2.00
1093
253
baseline_visit_arm_1
1.000000
2.000000
3.000000
2.000000
1.000000
3.000000
2.000000
1.000000
…
2.000000
2.000000
0.000000
0.000000
18.000000
2
1.000000
36.000000
68.000000
2.00
1094
253
6_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
1.74
1095
253
12_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
0.00
1096
253
24_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
0.00
1097
253
30_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
1.74
1098
254
baseline_visit_arm_1
1.000000
4.000000
1.000000
1.000000
0.000000
1.000000
0.000000
0.000000
…
2.000000
1.000000
1.000000
0.000000
10.000000
2
1.000000
9.000000
0.000000
2.00
1099
254
6_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
1.74
1100
254
12_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
0.00
1101
254
24_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
0.00
1102
254
30_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
1.74
1103
255
baseline_visit_arm_1
0.958537
3.000000
2.000000
2.000000
1.000000
3.000000
2.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
2.000000
2
0.910256
13.000000
8.000000
2.00
1104
256
baseline_visit_arm_1
0.000000
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
2.000000
1.000000
0.000000
0.000000
12.000000
2
1.000000
37.000000
66.000000
2.00
1105
256
6_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
1.74
1106
256
12_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
0.00
1107
256
24_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
0.00
1108
256
30_month_followup_arm_1
0.958537
3.901345
2.843049
1.865672
0.943284
2.732836
1.559701
0.727952
…
0.205215
0.348811
0.147225
0.044268
4.378256
0
0.910256
24.673428
28.981744
1.74
1109
257
baseline_visit_arm_1
0.958537
4.000000
3.000000
2.000000
1.000000
3.000000
2.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
1.000000
2
0.910256
31.000000
35.000000
2.00
1110
259
baseline_visit_arm_1
0.958537
3.000000
2.000000
2.000000
1.000000
2.000000
2.000000
1.000000
…
0.000000
0.000000
0.000000
0.000000
4.000000
2
0.910256
30.000000
65.000000
2.00
1111
260
baseline_visit_arm_1
1.000000
2.000000
3.000000
2.000000
1.000000
3.000000
2.000000
0.000000
…
0.000000
0.000000
0.000000
0.000000
6.000000
2
1.000000
8.000000
0.000000
2.00
1112 rows × 135 columns
In [3]:
### Read bloodwork。csv
xl_file = pd.ExcelFile(“bloodwork.xlsx”)
dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}
blookwork = dfs[‘bloodwork_result’]
### precess empty value, fill empty with mean
blookwork = blookwork.fillna(df.mean())
blookwork = blookwork.dropna()
blookwork
Out[3]:
study_id
primary_dx_pdbp
triglycerides
cholesterol
hdl
cholesterol_hdl_ratio
ldl
uric_acid
iron
total_ibc
iron_saturation
transferrin
ceruloplasmin
age_calc_yrs
months_since_base_calc
sexc
agebl
visit
0
1
2
252.0
254.0
42.0
6.0
162.0
5.2
126.0
303.0
42.0
254.0
26.0
60.7
0.0
2
60.7
1
1
2
2
80.0
188.0
52.0
4.0
120.0
5.7
90.0
353.0
25.0
263.0
34.0
54.8
0.0
2
54.8
1
2
2
2
89.0
200.0
49.0
4.0
133.0
5.6
73.0
330.0
22.0
244.0
25.0
56.3
17.9
2
54.8
2
3
2
2
70.0
244.0
62.0
4.0
168.0
6.9
82.0
363.0
23.0
265.0
25.0
57.8
35.6
2
54.8
3
4
3
2
116.0
185.0
56.0
3.0
106.0
7.1
81.0
359.0
23.0
321.0
34.0
58.6
0.0
1
58.6
1
5
3
2
76.0
173.0
56.0
3.0
102.0
5.5
138.0
389.0
35.0
317.0
31.0
59.1
5.7
1
58.6
2
6
3
2
103.0
192.0
44.0
4.0
127.0
5.4
112.0
368.0
30.0
285.0
30.0
60.6
23.8
1
58.6
3
7
4
1
59.0
127.0
49.0
3.0
66.0
3.6
18.0
236.0
8.0
132.0
33.0
68.7
0.0
1
68.7
1
8
5
1
104.0
191.0
40.0
5.0
130.0
4.2
51.0
409.0
12.0
299.0
26.0
69.4
0.0
2
69.4
1
9
5
1
108.0
199.0
34.0
6.0
143.0
4.5
69.0
392.0
18.0
252.0
31.0
70.7
15.3
2
69.4
2
10
5
1
157.0
187.0
38.0
5.0
118.0
5.3
97.0
423.0
23.0
311.0
21.0
72.2
33.5
2
69.4
3
11
6
1
54.0
151.0
45.0
3.0
95.0
5.7
63.0
341.0
18.0
230.0
35.0
68.0
0.0
2
68.0
1
12
6
1
64.0
158.0
39.0
4.0
106.0
6.6
67.0
295.0
23.0
195.0
28.0
69.5
18.7
2
68.0
2
14
7
2
75.0
190.0
86.0
2.0
89.0
5.1
56.0
378.0
15.0
314.0
44.0
53.4
0.0
1
53.4
1
15
7
2
33.0
174.0
82.0
2.0
85.0
5.4
35.0
350.0
10.0
251.0
28.0
54.9
17.9
1
53.4
2
16
7
2
35.0
200.0
101.0
2.0
92.0
5.4
63.0
339.0
19.0
264.0
27.0
56.4
36.0
1
53.4
3
17
8
2
117.0
218.0
70.0
3.0
125.0
4.3
62.0
405.0
15.0
374.0
35.0
63.0
0.0
1
63.0
1
18
9
1
103.0
114.0
32.0
4.0
61.0
8.1
113.0
307.0
37.0
234.0
25.0
77.8
0.0
2
77.8
1
19
9
1
171.0
159.0
28.0
6.0
97.0
9.2
149.0
313.0
48.0
223.0
27.0
79.4
18.2
2
77.8
2
20
10
2
183.0
189.0
47.0
4.0
105.0
4.8
85.0
364.0
23.0
314.0
29.0
77.2
0.0
1
77.2
1
21
10
2
132.0
205.0
47.0
4.0
132.0
6.0
95.0
389.0
24.0
306.0
29.0
78.7
18.2
1
77.2
2
22
11
3
126.0
223.0
40.0
6.0
158.0
5.3
134.0
256.0
52.0
192.0
27.0
90.9
0.0
2
90.9
1
23
12
2
69.0
194.0
62.0
3.0
118.0
5.3
62.0
289.0
21.0
252.0
25.0
52.2
0.0
1
52.2
1
24
12
2
84.0
204.0
56.0
4.0
131.0
6.2
69.0
330.0
21.0
246.0
27.0
53.7
17.9
1
52.2
2
25
12
2
110.0
218.0
57.0
4.0
139.0
6.5
91.0
334.0
27.0
274.0
26.0
55.2
36.0
1
52.2
3
26
13
1
50.0
172.0
52.0
3.0
110.0
6.5
134.0
297.0
45.0
233.0
24.0
73.6
0.0
2
73.6
1
27
13
1
81.0
152.0
42.0
4.0
94.0
5.9
123.0
299.0
41.0
213.0
26.0
75.1
17.7
2
73.6
2
28
13
1
149.0
170.0
41.0
4.0
99.0
6.7
146.0
301.0
49.0
216.0
23.0
76.6
36.1
2
73.6
3
29
14
2
139.0
173.0
67.0
3.0
78.0
6.0
119.0
349.0
34.0
300.0
27.0
69.6
0.0
1
69.6
1
30
14
2
98.0
157.0
78.0
2.0
59.0
6.6
144.0
334.0
43.0
263.0
24.0
71.1
17.7
1
69.6
2
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
…
460
238
2
57.0
136.0
53.0
3.0
72.0
4.8
44.0
306.0
14.0
243.0
19.0
80.5
0.0
2
80.5
1
461
239
2
98.0
226.0
106.0
2.0
100.0
3.6
78.0
316.0
25.0
294.0
89.0
77.3
0.0
1
77.3
1
462
240
2
145.0
191.0
39.0
5.0
123.0
5.3
108.0
263.0
41.0
185.0
28.0
85.2
0.0
1
85.2
1
463
241
5
312.0
215.0
36.0
6.0
117.0
5.8
79.0
378.0
21.0
299.0
21.0
83.8
0.0
1
83.8
1
464
242
2
72.0
182.0
62.0
3.0
106.0
3.7
85.0
300.0
28.0
222.0
28.0
80.0
0.0
1
80.0
1
465
243
4
120.0
184.0
47.0
4.0
113.0
3.8
43.0
429.0
10.0
334.0
84.0
53.0
0.0
1
53.0
1
466
244
2
221.0
137.0
40.0
3.0
53.0
6.5
93.0
254.0
37.0
248.0
21.0
86.2
0.0
2
86.2
1
467
245
2
73.0
221.0
63.0
4.0
143.0
5.1
103.0
3.9
33.0
241.0
29.0
75.0
0.0
1
75.0
1
468
246
3
122.0
304.0
72.0
4.0
208.0
3.3
79.0
325.0
24.0
272.0
39.0
72.0
0.0
1
72.0
1
469
247
2
77.0
182.0
54.0
3.0
113.0
6.6
89.0
359.0
25.0
261.0
29.0
0.0
0.0
2
0.0
1
470
248
4
127.0
172.0
31.0
6.0
116.0
8.4
85.0
300.0
28.0
238.0
29.0
74.9
0.0
2
74.9
1
471
249
2
217.0
198.0
34.0
6.0
121.0
5.9
63.0
383.0
16.0
297.0
26.0
77.4
0.0
1
77.4
1
472
250
2
165.0
147.0
44.0
3.0
70.0
6.8
237.0
203.0
117.0
171.0
25.0
77.9
0.0
2
77.9
1
473
251
2
265.0
296.0
38.0
8.0
205.0
4.1
77.0
304.0
25.0
232.0
30.0
81.6
0.0
1
81.6
1
474
252
4
71.0
173.0
43.0
4.0
116.0
4.1
103.0
331.0
31.0
228.0
19.0
63.1
0.0
2
63.1
1
475
254
5
77.0
160.0
47.0
3.0
98.0
5.1
63.0
306.0
21.0
223.0
22.0
84.3
0.0
2
84.3
1
476
255
2
85.0
198.0
53.0
4.0
128.0
3.7
71.0
256.0
28.0
185.0
21.0
77.5
0.0
1
77.5
1
477
256
4
58.0
182.0
65.0
3.0
105.0
6.3
57.0
362.0
16.0
284.0
23.0
64.9
0.0
1
64.9
1
478
257
2
68.0
183.0
51.0
4.0
118.0
4.6
120.0
263.0
46.0
191.0
25.0
79.3
0.0
2
79.3
1
479
258
4
87.0
130.0
45.0
3.0
68.0
7.7
63.0
281.0
22.0
210.0
26.0
80.1
0.0
2
80.1
1
480
259
2
65.0
128.0
56.0
2.0
59.0
5.4
111.0
259.0
43.0
221.0
24.0
84.4
0.0
1
84.4
1
481
260
4
67.0
227.0
74.0
3.0
140.0
6.6
198.0
286.0
69.0
235.0
23.0
74.6
0.0
2
74.6
1
482
261
4
108.0
128.0
55.0
2.0
51.0
4.3
86.0
298.0
29.0
263.0
35.0
71.0
0.0
1
71.0
1
483
262
5
93.0
153.0
90.0
2.0
44.0
4.3
96.0
366.0
26.0
327.0
20.0
77.3
0.0
1
77.3
1
484
263
4
86.0
144.0
76.0
2.0
51.0
3.7
83.0
244.0
34.0
213.0
21.0
84.1
0.0
1
84.1
1
485
264
3
88.0
250.0
97.0
3.0
135.0
4.1
105.0
309.0
34.0
245.0
24.0
77.3
0.0
1
77.3
1
486
265
4
242.0
279.0
45.0
6.0
186.0
5.2
101.0
318.0
32.0
258.0
30.0
54.3
0.0
1
54.3
1
487
267
3
178.0
187.0
42.0
4.0
109.0
6.4
95.0
283.0
34.0
222.0
21.0
81.0
0.0
2
81.0
1
488
268
2
96.0
221.0
72.0
3.0
130.0
4.2
79.0
383.0
21.0
311.0
27.0
77.6
0.0
1
77.6
1
489
269
5
131.0
191.0
51.0
4.0
114.0
4.1
117.0
281.0
42.0
252.0
25.0
60.8
0.0
1
60.8
1
465 rows × 18 columns
In [4]:
### combine data from the two files
ds = []
for i in range(1, 270):
x = df.loc[df[‘study_id’] == i]
y = blookwork.loc[blookwork[‘study_id’] == i]
if x.shape[0] == 0 and y.shape[0] == 0:
continue
if x.shape[0] == 0:
x = df.mean()
else:
x = x.mean()
x[“study_id”] = i
if y.shape[0] == 0:
y = blookwork.mean()
else:
y = y.mean()
y[“study_id”] = i
del x[“study_id”]
ds.append((y, x))
mdata = np.vstack([np.hstack(ds[i]) for i in range(len(ds))])
In [5]:
fm = mdata[:, 1:]
### normalize the data
fmNorm = preprocessing.normalize(fm, axis=0)
from sklearn.decomposition import PCA
### PCA reduction to reduce the dimension of feature
pca = PCA(n_components=20)
reductedFm = pca.fit_transform(fmNorm)
## kmeans to cluster
## reference: http://scikit-learn.org/stable/modules/clustering.html#k-means
kmeans = KMeans(n_clusters=4)
kmeans.fit(reductedFm)
## print the cluster result
print(kmeans.labels_)
[2 2 2 0 0 3 2 2 3 2 0 2 3 2 2 3 3 0 0 0 3 3 0 3 2 3 3 2 0 2 2 2 2 3 2 3 2
1 2 3 0 2 3 2 3 3 2 3 2 3 3 3 0 3 2 0 3 0 3 0 3 2 2 3 2 2 2 3 2 3 2 3 2 3
3 2 3 3 3 0 3 2 2 2 3 2 0 2 2 3 0 2 3 0 2 0 2 3 3 0 3 2 3 0 0 3 2 3 3 2 2
3 1 0 2 3 0 2 3 2 2 0 3 2 3 2 2 2 2 1 3 0 2 2 0 2 0 2 2 2 2 3 3 0 2 0 2 3
2 3 0 0 2 2 2 3 0 2 2 3 2 2 2 0 1 2 3 3 0 3 3 2 3 3 3 3 2 3 3 3 2 2 2 2 3
2 2 1 0 3 3 0 2 0 3 0 0 0 1 3 2 2 2 2 3 3 3 2 3 3 3 2 3 3 2 2 0 2 3 3 3 3
2 2 3 2 2 3 2 3 2 2 3 2 3 2 2 2 3 3 3 2 3 2 3 2 3 3 3 3 3 3 3 3 3]
In [6]:
### save the cluster result to csv file
outFrame = pd.DataFrame({ “cluster”: kmeans.labels_, “id”: mdata[:, 0].astype(‘int’)})
outFrame.to_csv(“cluster_result.csv”)
outFrame
Out[6]:
cluster
id
0
2
1
1
2
2
2
2
3
3
0
4
4
0
5
5
3
6
6
2
7
7
2
8
8
3
9
9
2
10
10
0
11
11
2
12
12
3
13
13
2
14
14
2
15
15
3
16
16
3
17
17
0
18
18
0
19
19
0
20
20
3
21
21
3
22
22
0
23
23
3
24
24
2
25
25
3
26
26
3
27
27
2
28
28
0
29
29
2
30
…
…
…
225
2
239
226
2
240
227
3
241
228
2
242
229
3
243
230
2
244
231
2
245
232
3
246
233
2
247
234
3
248
235
2
249
236
2
250
237
2
251
238
3
252
239
3
253
240
3
254
241
2
255
242
3
256
243
2
257
244
3
258
245
2
259
246
3
260
247
3
261
248
3
262
249
3
263
250
3
264
251
3
265
252
3
267
253
3
268
254
3
269
255 rows × 2 columns
In [7]:
kmeans.cluster_centers_
Out[7]:
array([[ 3.89007849e-01, -6.60988492e-02, -6.88950056e-03,
4.69517131e-02, 1.93323509e-02, -3.58584881e-03,
5.96430270e-03, -8.96664143e-03, -1.74510429e-02,
-7.72496572e-03, 2.15901116e-02, -1.02757570e-02,
-8.19200865e-03, -4.00997047e-03, -6.63137215e-03,
-4.46568672e-03, -1.51850714e-02, 4.53734813e-03,
-1.28506507e-02, 9.68827234e-04],
[ 1.12929143e+00, 5.30429311e-02, -4.29740638e-02,
-5.78790888e-02, -2.31979358e-01, 8.92055908e-02,
2.85961522e-03, 8.33495980e-02, 4.62772867e-02,
8.84690623e-03, -3.10114633e-02, 6.38152158e-02,
3.53624668e-02, -1.46669212e-02, 4.53822874e-02,
-1.57675457e-02, 2.02681670e-02, 1.24644891e-02,
-1.71070823e-02, -3.83840768e-02],
[ -2.47642059e-01, -3.39854761e-03, -2.08627391e-02,
1.32195904e-02, -3.17455907e-02, 1.40101721e-02,
1.16900025e-02, 7.50627769e-03, 2.13617800e-03,
5.50384941e-03, 2.97987556e-03, 3.40722247e-04,
1.19256831e-03, -4.67233224e-03, 6.98129571e-03,
2.26311143e-03, -2.69882657e-03, -1.21013327e-04,
-4.69643155e-03, -2.36487464e-03],
[ 3.18188918e-02, 2.66858518e-02, 2.65136069e-02,
-2.87942009e-02, 3.81799688e-02, -1.80512798e-02,
-1.44577123e-02, -8.93810143e-03, 2.07311017e-03,
-3.05108768e-03, -9.82536631e-03, 2.56204428e-05,
-1.47778510e-05, 7.21364274e-03, -7.12081084e-03,
3.89045918e-04, 7.61510404e-03, -2.40885251e-03,
1.08994610e-02, 4.26111050e-03]])
In [9]:
kmeans.inertia_
Out[9]:
26.90743998005269
In [ ]: