ml
In [1]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
from sklearn import preprocessing
In [2]:
### Read ParkinsonsDiseaseBio_DATA_2016-01-21_2132_MoCA_HDRS_UPDRS.csv
df = pd.read_csv(“ParkinsonsDiseaseBio_DATA_2016-01-21_2132_MoCA_HDRS_UPDRS.csv”)
### precess empty value, fill empty with mean
df = df.fillna(df.mean())
df
Out[2]:
study_id redcap_event_name moca_able moca_visuospatial_exec moca_naming moca_list_of_digits moca_list_of_letters moca_serial_7_subtraction moca_repeat moca_fluency … hdrs_genital_sx hdrs_hypochondriasis hdrs_loss_of_weight hdrs_insight hdrs_total_score_calc hamilton_depression_rating_scale_complete upsit_able upsit_score upsit_percentile university_of_pennsylvania_smell_identification_test_complete
0 1 baseline_visit_arm_1 0.958537 5.000000 3.000000 2.000000 1.000000 3.000000 2.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 1.000000 2 0.910256 33.000000 34.000000 2.00
1 1 6_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 2 0.910256 24.673428 28.981744 1.74
2 1 12_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 0.00
3 2 baseline_visit_arm_1 0.958537 5.000000 3.000000 2.000000 1.000000 3.000000 1.000000 1.000000 … 1.000000 0.000000 0.000000 0.000000 1.000000 2 0.910256 37.000000 70.000000 2.00
4 2 6_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 2 0.910256 24.673428 28.981744 1.74
5 2 12_month_followup_arm_1 0.958537 5.000000 3.000000 2.000000 1.000000 3.000000 1.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 0.000000 2 0.910256 35.000000 42.000000 2.00
6 2 18_month_followup_arm_1 0.958537 5.000000 3.000000 2.000000 1.000000 3.000000 2.000000 1.000000 … 1.000000 0.000000 0.000000 0.000000 2.000000 2 0.910256 24.673428 28.981744 1.74
7 2 24_month_followup_arm_1 0.958537 4.000000 3.000000 1.000000 1.000000 3.000000 1.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 0.000000 2 0.910256 30.000000 27.000000 2.00
8 2 30_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.000000 0.000000 0.000000 0.000000 0.000000 2 0.910256 24.673428 28.981744 1.74
9 2 36_month_followup_arm_1 0.958537 5.000000 3.000000 2.000000 1.000000 3.000000 1.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 0.000000 2 0.910256 32.000000 23.000000 2.00
10 3 baseline_visit_arm_1 0.958537 5.000000 3.000000 2.000000 1.000000 3.000000 2.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 0.000000 2 0.910256 34.000000 23.000000 2.00
11 3 6_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 2 0.910256 24.673428 28.981744 1.74
12 3 12_month_followup_arm_1 0.958537 4.000000 3.000000 2.000000 1.000000 3.000000 2.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 2.000000 2 0.910256 37.000000 51.000000 2.00
13 3 18_month_followup_arm_1 0.958537 4.000000 3.000000 2.000000 1.000000 2.000000 2.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 0.000000 2 0.910256 24.673428 28.981744 1.74
14 3 24_month_followup_arm_1 0.958537 4.000000 3.000000 2.000000 1.000000 3.000000 2.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 0.000000 2 0.910256 33.000000 18.000000 2.00
15 3 30_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.000000 0.000000 0.000000 0.000000 0.000000 2 0.910256 24.673428 28.981744 1.74
16 3 36_month_followup_arm_1 0.958537 5.000000 2.000000 2.000000 1.000000 3.000000 2.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 0.000000 2 0.910256 33.000000 21.000000 2.00
17 4 baseline_visit_arm_1 0.000000 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 2 0.000000 24.673428 28.981744 2.00
18 4 6_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 1.74
19 5 baseline_visit_arm_1 1.000000 3.000000 3.000000 2.000000 1.000000 3.000000 1.000000 1.000000 … 0.000000 3.000000 0.000000 0.000000 11.000000 2 1.000000 10.000000 0.000000 2.00
20 5 6_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 2 0.910256 24.673428 28.981744 1.74
21 5 12_month_followup_arm_1 1.000000 0.000000 2.000000 1.000000 0.000000 1.000000 1.000000 0.000000 … 0.000000 1.000000 0.000000 0.000000 9.000000 2 1.000000 16.000000 10.000000 2.00
22 5 18_month_followup_arm_1 1.000000 1.000000 3.000000 1.000000 0.000000 2.000000 2.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 4.000000 2 0.910256 24.673428 28.981744 1.74
23 5 24_month_followup_arm_1 1.000000 1.000000 2.000000 2.000000 1.000000 3.000000 1.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 8.000000 2 1.000000 15.000000 8.000000 2.00
24 5 30_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.000000 0.000000 1.000000 0.000000 11.000000 2 0.910256 24.673428 28.981744 1.74
25 5 36_month_followup_arm_1 1.000000 2.000000 3.000000 2.000000 1.000000 0.000000 0.000000 1.000000 … 0.000000 0.000000 1.000000 0.000000 9.000000 2 1.000000 12.000000 6.000000 2.00
26 6 baseline_visit_arm_1 1.000000 3.000000 3.000000 2.000000 1.000000 3.000000 1.000000 1.000000 … 1.000000 0.000000 2.000000 0.000000 10.000000 2 1.000000 14.000000 0.000000 2.00
27 6 6_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 1.74
28 6 12_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 0.00
29 6 18_month_followup_arm_1 1.000000 1.000000 3.000000 2.000000 1.000000 2.000000 2.000000 1.000000 … 2.000000 0.000000 1.000000 0.000000 7.000000 2 0.910256 24.673428 28.981744 1.74
… … … … … … … … … … … … … … … … … … … … … …
1082 246 24_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 0.00
1083 246 30_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 1.74
1084 247 baseline_visit_arm_1 0.958537 5.000000 2.000000 2.000000 1.000000 3.000000 1.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 1.000000 2 0.910256 21.000000 40.000000 2.00
1085 248 baseline_visit_arm_1 1.000000 1.000000 3.000000 2.000000 1.000000 3.000000 2.000000 0.000000 … 0.000000 0.000000 0.000000 0.000000 9.000000 2 1.000000 13.000000 5.000000 2.00
1086 248 6_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 1.74
1087 248 12_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 0.00
1088 248 24_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 0.00
1089 248 30_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 1.74
1090 249 baseline_visit_arm_1 0.958537 2.000000 2.000000 2.000000 0.000000 2.000000 1.000000 0.000000 … 0.000000 0.000000 0.000000 0.000000 3.000000 2 0.910256 29.000000 38.000000 2.00
1091 250 baseline_visit_arm_1 0.958537 5.000000 3.000000 2.000000 1.000000 3.000000 2.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 0.000000 2 0.910256 29.000000 34.000000 2.00
1092 251 baseline_visit_arm_1 0.958537 4.000000 3.000000 2.000000 1.000000 3.000000 0.000000 1.000000 … 0.000000 0.000000 1.000000 0.000000 7.000000 2 0.910256 30.000000 69.000000 2.00
1093 253 baseline_visit_arm_1 1.000000 2.000000 3.000000 2.000000 1.000000 3.000000 2.000000 1.000000 … 2.000000 2.000000 0.000000 0.000000 18.000000 2 1.000000 36.000000 68.000000 2.00
1094 253 6_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 1.74
1095 253 12_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 0.00
1096 253 24_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 0.00
1097 253 30_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 1.74
1098 254 baseline_visit_arm_1 1.000000 4.000000 1.000000 1.000000 0.000000 1.000000 0.000000 0.000000 … 2.000000 1.000000 1.000000 0.000000 10.000000 2 1.000000 9.000000 0.000000 2.00
1099 254 6_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 1.74
1100 254 12_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 0.00
1101 254 24_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 0.00
1102 254 30_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 1.74
1103 255 baseline_visit_arm_1 0.958537 3.000000 2.000000 2.000000 1.000000 3.000000 2.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 2.000000 2 0.910256 13.000000 8.000000 2.00
1104 256 baseline_visit_arm_1 0.000000 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 2.000000 1.000000 0.000000 0.000000 12.000000 2 1.000000 37.000000 66.000000 2.00
1105 256 6_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 1.74
1106 256 12_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 0.00
1107 256 24_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 0.00
1108 256 30_month_followup_arm_1 0.958537 3.901345 2.843049 1.865672 0.943284 2.732836 1.559701 0.727952 … 0.205215 0.348811 0.147225 0.044268 4.378256 0 0.910256 24.673428 28.981744 1.74
1109 257 baseline_visit_arm_1 0.958537 4.000000 3.000000 2.000000 1.000000 3.000000 2.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 1.000000 2 0.910256 31.000000 35.000000 2.00
1110 259 baseline_visit_arm_1 0.958537 3.000000 2.000000 2.000000 1.000000 2.000000 2.000000 1.000000 … 0.000000 0.000000 0.000000 0.000000 4.000000 2 0.910256 30.000000 65.000000 2.00
1111 260 baseline_visit_arm_1 1.000000 2.000000 3.000000 2.000000 1.000000 3.000000 2.000000 0.000000 … 0.000000 0.000000 0.000000 0.000000 6.000000 2 1.000000 8.000000 0.000000 2.00
1112 rows × 135 columns
In [3]:
### Read bloodwork。csv
xl_file = pd.ExcelFile(“bloodwork.xlsx”)
dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}
blookwork = dfs[‘bloodwork_result’]
### precess empty value, fill empty with mean
blookwork = blookwork.fillna(df.mean())
blookwork = blookwork.dropna()
blookwork
Out[3]:
study_id primary_dx_pdbp triglycerides cholesterol hdl cholesterol_hdl_ratio ldl uric_acid iron total_ibc iron_saturation transferrin ceruloplasmin age_calc_yrs months_since_base_calc sexc agebl visit
0 1 2 252.0 254.0 42.0 6.0 162.0 5.2 126.0 303.0 42.0 254.0 26.0 60.7 0.0 2 60.7 1
1 2 2 80.0 188.0 52.0 4.0 120.0 5.7 90.0 353.0 25.0 263.0 34.0 54.8 0.0 2 54.8 1
2 2 2 89.0 200.0 49.0 4.0 133.0 5.6 73.0 330.0 22.0 244.0 25.0 56.3 17.9 2 54.8 2
3 2 2 70.0 244.0 62.0 4.0 168.0 6.9 82.0 363.0 23.0 265.0 25.0 57.8 35.6 2 54.8 3
4 3 2 116.0 185.0 56.0 3.0 106.0 7.1 81.0 359.0 23.0 321.0 34.0 58.6 0.0 1 58.6 1
5 3 2 76.0 173.0 56.0 3.0 102.0 5.5 138.0 389.0 35.0 317.0 31.0 59.1 5.7 1 58.6 2
6 3 2 103.0 192.0 44.0 4.0 127.0 5.4 112.0 368.0 30.0 285.0 30.0 60.6 23.8 1 58.6 3
7 4 1 59.0 127.0 49.0 3.0 66.0 3.6 18.0 236.0 8.0 132.0 33.0 68.7 0.0 1 68.7 1
8 5 1 104.0 191.0 40.0 5.0 130.0 4.2 51.0 409.0 12.0 299.0 26.0 69.4 0.0 2 69.4 1
9 5 1 108.0 199.0 34.0 6.0 143.0 4.5 69.0 392.0 18.0 252.0 31.0 70.7 15.3 2 69.4 2
10 5 1 157.0 187.0 38.0 5.0 118.0 5.3 97.0 423.0 23.0 311.0 21.0 72.2 33.5 2 69.4 3
11 6 1 54.0 151.0 45.0 3.0 95.0 5.7 63.0 341.0 18.0 230.0 35.0 68.0 0.0 2 68.0 1
12 6 1 64.0 158.0 39.0 4.0 106.0 6.6 67.0 295.0 23.0 195.0 28.0 69.5 18.7 2 68.0 2
14 7 2 75.0 190.0 86.0 2.0 89.0 5.1 56.0 378.0 15.0 314.0 44.0 53.4 0.0 1 53.4 1
15 7 2 33.0 174.0 82.0 2.0 85.0 5.4 35.0 350.0 10.0 251.0 28.0 54.9 17.9 1 53.4 2
16 7 2 35.0 200.0 101.0 2.0 92.0 5.4 63.0 339.0 19.0 264.0 27.0 56.4 36.0 1 53.4 3
17 8 2 117.0 218.0 70.0 3.0 125.0 4.3 62.0 405.0 15.0 374.0 35.0 63.0 0.0 1 63.0 1
18 9 1 103.0 114.0 32.0 4.0 61.0 8.1 113.0 307.0 37.0 234.0 25.0 77.8 0.0 2 77.8 1
19 9 1 171.0 159.0 28.0 6.0 97.0 9.2 149.0 313.0 48.0 223.0 27.0 79.4 18.2 2 77.8 2
20 10 2 183.0 189.0 47.0 4.0 105.0 4.8 85.0 364.0 23.0 314.0 29.0 77.2 0.0 1 77.2 1
21 10 2 132.0 205.0 47.0 4.0 132.0 6.0 95.0 389.0 24.0 306.0 29.0 78.7 18.2 1 77.2 2
22 11 3 126.0 223.0 40.0 6.0 158.0 5.3 134.0 256.0 52.0 192.0 27.0 90.9 0.0 2 90.9 1
23 12 2 69.0 194.0 62.0 3.0 118.0 5.3 62.0 289.0 21.0 252.0 25.0 52.2 0.0 1 52.2 1
24 12 2 84.0 204.0 56.0 4.0 131.0 6.2 69.0 330.0 21.0 246.0 27.0 53.7 17.9 1 52.2 2
25 12 2 110.0 218.0 57.0 4.0 139.0 6.5 91.0 334.0 27.0 274.0 26.0 55.2 36.0 1 52.2 3
26 13 1 50.0 172.0 52.0 3.0 110.0 6.5 134.0 297.0 45.0 233.0 24.0 73.6 0.0 2 73.6 1
27 13 1 81.0 152.0 42.0 4.0 94.0 5.9 123.0 299.0 41.0 213.0 26.0 75.1 17.7 2 73.6 2
28 13 1 149.0 170.0 41.0 4.0 99.0 6.7 146.0 301.0 49.0 216.0 23.0 76.6 36.1 2 73.6 3
29 14 2 139.0 173.0 67.0 3.0 78.0 6.0 119.0 349.0 34.0 300.0 27.0 69.6 0.0 1 69.6 1
30 14 2 98.0 157.0 78.0 2.0 59.0 6.6 144.0 334.0 43.0 263.0 24.0 71.1 17.7 1 69.6 2
… … … … … … … … … … … … … … … … … … …
460 238 2 57.0 136.0 53.0 3.0 72.0 4.8 44.0 306.0 14.0 243.0 19.0 80.5 0.0 2 80.5 1
461 239 2 98.0 226.0 106.0 2.0 100.0 3.6 78.0 316.0 25.0 294.0 89.0 77.3 0.0 1 77.3 1
462 240 2 145.0 191.0 39.0 5.0 123.0 5.3 108.0 263.0 41.0 185.0 28.0 85.2 0.0 1 85.2 1
463 241 5 312.0 215.0 36.0 6.0 117.0 5.8 79.0 378.0 21.0 299.0 21.0 83.8 0.0 1 83.8 1
464 242 2 72.0 182.0 62.0 3.0 106.0 3.7 85.0 300.0 28.0 222.0 28.0 80.0 0.0 1 80.0 1
465 243 4 120.0 184.0 47.0 4.0 113.0 3.8 43.0 429.0 10.0 334.0 84.0 53.0 0.0 1 53.0 1
466 244 2 221.0 137.0 40.0 3.0 53.0 6.5 93.0 254.0 37.0 248.0 21.0 86.2 0.0 2 86.2 1
467 245 2 73.0 221.0 63.0 4.0 143.0 5.1 103.0 3.9 33.0 241.0 29.0 75.0 0.0 1 75.0 1
468 246 3 122.0 304.0 72.0 4.0 208.0 3.3 79.0 325.0 24.0 272.0 39.0 72.0 0.0 1 72.0 1
469 247 2 77.0 182.0 54.0 3.0 113.0 6.6 89.0 359.0 25.0 261.0 29.0 0.0 0.0 2 0.0 1
470 248 4 127.0 172.0 31.0 6.0 116.0 8.4 85.0 300.0 28.0 238.0 29.0 74.9 0.0 2 74.9 1
471 249 2 217.0 198.0 34.0 6.0 121.0 5.9 63.0 383.0 16.0 297.0 26.0 77.4 0.0 1 77.4 1
472 250 2 165.0 147.0 44.0 3.0 70.0 6.8 237.0 203.0 117.0 171.0 25.0 77.9 0.0 2 77.9 1
473 251 2 265.0 296.0 38.0 8.0 205.0 4.1 77.0 304.0 25.0 232.0 30.0 81.6 0.0 1 81.6 1
474 252 4 71.0 173.0 43.0 4.0 116.0 4.1 103.0 331.0 31.0 228.0 19.0 63.1 0.0 2 63.1 1
475 254 5 77.0 160.0 47.0 3.0 98.0 5.1 63.0 306.0 21.0 223.0 22.0 84.3 0.0 2 84.3 1
476 255 2 85.0 198.0 53.0 4.0 128.0 3.7 71.0 256.0 28.0 185.0 21.0 77.5 0.0 1 77.5 1
477 256 4 58.0 182.0 65.0 3.0 105.0 6.3 57.0 362.0 16.0 284.0 23.0 64.9 0.0 1 64.9 1
478 257 2 68.0 183.0 51.0 4.0 118.0 4.6 120.0 263.0 46.0 191.0 25.0 79.3 0.0 2 79.3 1
479 258 4 87.0 130.0 45.0 3.0 68.0 7.7 63.0 281.0 22.0 210.0 26.0 80.1 0.0 2 80.1 1
480 259 2 65.0 128.0 56.0 2.0 59.0 5.4 111.0 259.0 43.0 221.0 24.0 84.4 0.0 1 84.4 1
481 260 4 67.0 227.0 74.0 3.0 140.0 6.6 198.0 286.0 69.0 235.0 23.0 74.6 0.0 2 74.6 1
482 261 4 108.0 128.0 55.0 2.0 51.0 4.3 86.0 298.0 29.0 263.0 35.0 71.0 0.0 1 71.0 1
483 262 5 93.0 153.0 90.0 2.0 44.0 4.3 96.0 366.0 26.0 327.0 20.0 77.3 0.0 1 77.3 1
484 263 4 86.0 144.0 76.0 2.0 51.0 3.7 83.0 244.0 34.0 213.0 21.0 84.1 0.0 1 84.1 1
485 264 3 88.0 250.0 97.0 3.0 135.0 4.1 105.0 309.0 34.0 245.0 24.0 77.3 0.0 1 77.3 1
486 265 4 242.0 279.0 45.0 6.0 186.0 5.2 101.0 318.0 32.0 258.0 30.0 54.3 0.0 1 54.3 1
487 267 3 178.0 187.0 42.0 4.0 109.0 6.4 95.0 283.0 34.0 222.0 21.0 81.0 0.0 2 81.0 1
488 268 2 96.0 221.0 72.0 3.0 130.0 4.2 79.0 383.0 21.0 311.0 27.0 77.6 0.0 1 77.6 1
489 269 5 131.0 191.0 51.0 4.0 114.0 4.1 117.0 281.0 42.0 252.0 25.0 60.8 0.0 1 60.8 1
465 rows × 18 columns
In [4]:
### combine data from the two files
ds = []
for i in range(1, 270):
x = df.loc[df[‘study_id’] == i]
y = blookwork.loc[blookwork[‘study_id’] == i]
if x.shape[0] == 0 and y.shape[0] == 0:
continue
if x.shape[0] == 0:
x = df.mean()
else:
x = x.mean()
x[“study_id”] = i
if y.shape[0] == 0:
y = blookwork.mean()
else:
y = y.mean()
y[“study_id”] = i
del x[“study_id”]
ds.append((y, x))
mdata = np.vstack([np.hstack(ds[i]) for i in range(len(ds))])
In [5]:
fm = mdata[:, 1:]
### normalize the data
fmNorm = preprocessing.normalize(fm, axis=0)
from sklearn.decomposition import PCA
### PCA reduction to reduce the dimension of feature
pca = PCA(n_components=20)
reductedFm = pca.fit_transform(fmNorm)
## kmeans to cluster
## reference: http://scikit-learn.org/stable/modules/clustering.html#k-means
kmeans = KMeans(n_clusters=4)
kmeans.fit(reductedFm)
## print the cluster result
print(kmeans.labels_)
[2 2 2 0 0 3 2 2 3 2 0 2 3 2 2 3 3 0 0 0 3 3 0 3 2 3 3 2 0 2 2 2 2 3 2 3 2
1 2 3 0 2 3 2 3 3 2 3 2 3 3 3 0 3 2 0 3 0 3 0 3 2 2 3 2 2 2 3 2 3 2 3 2 3
3 2 3 3 3 0 3 2 2 2 3 2 0 2 2 3 0 2 3 0 2 0 2 3 3 0 3 2 3 0 0 3 2 3 3 2 2
3 1 0 2 3 0 2 3 2 2 0 3 2 3 2 2 2 2 1 3 0 2 2 0 2 0 2 2 2 2 3 3 0 2 0 2 3
2 3 0 0 2 2 2 3 0 2 2 3 2 2 2 0 1 2 3 3 0 3 3 2 3 3 3 3 2 3 3 3 2 2 2 2 3
2 2 1 0 3 3 0 2 0 3 0 0 0 1 3 2 2 2 2 3 3 3 2 3 3 3 2 3 3 2 2 0 2 3 3 3 3
2 2 3 2 2 3 2 3 2 2 3 2 3 2 2 2 3 3 3 2 3 2 3 2 3 3 3 3 3 3 3 3 3]
In [6]:
### save the cluster result to csv file
outFrame = pd.DataFrame({ “cluster”: kmeans.labels_, “id”: mdata[:, 0].astype(‘int’)})
outFrame.to_csv(“cluster_result.csv”)
outFrame
Out[6]:
cluster id
0 2 1
1 2 2
2 2 3
3 0 4
4 0 5
5 3 6
6 2 7
7 2 8
8 3 9
9 2 10
10 0 11
11 2 12
12 3 13
13 2 14
14 2 15
15 3 16
16 3 17
17 0 18
18 0 19
19 0 20
20 3 21
21 3 22
22 0 23
23 3 24
24 2 25
25 3 26
26 3 27
27 2 28
28 0 29
29 2 30
… … …
225 2 239
226 2 240
227 3 241
228 2 242
229 3 243
230 2 244
231 2 245
232 3 246
233 2 247
234 3 248
235 2 249
236 2 250
237 2 251
238 3 252
239 3 253
240 3 254
241 2 255
242 3 256
243 2 257
244 3 258
245 2 259
246 3 260
247 3 261
248 3 262
249 3 263
250 3 264
251 3 265
252 3 267
253 3 268
254 3 269
255 rows × 2 columns
In [7]:
kmeans.cluster_centers_
Out[7]:
array([[ 3.89007849e-01, -6.60988492e-02, -6.88950056e-03,
4.69517131e-02, 1.93323509e-02, -3.58584881e-03,
5.96430270e-03, -8.96664143e-03, -1.74510429e-02,
-7.72496572e-03, 2.15901116e-02, -1.02757570e-02,
-8.19200865e-03, -4.00997047e-03, -6.63137215e-03,
-4.46568672e-03, -1.51850714e-02, 4.53734813e-03,
-1.28506507e-02, 9.68827234e-04],
[ 1.12929143e+00, 5.30429311e-02, -4.29740638e-02,
-5.78790888e-02, -2.31979358e-01, 8.92055908e-02,
2.85961522e-03, 8.33495980e-02, 4.62772867e-02,
8.84690623e-03, -3.10114633e-02, 6.38152158e-02,
3.53624668e-02, -1.46669212e-02, 4.53822874e-02,
-1.57675457e-02, 2.02681670e-02, 1.24644891e-02,
-1.71070823e-02, -3.83840768e-02],
[ -2.47642059e-01, -3.39854761e-03, -2.08627391e-02,
1.32195904e-02, -3.17455907e-02, 1.40101721e-02,
1.16900025e-02, 7.50627769e-03, 2.13617800e-03,
5.50384941e-03, 2.97987556e-03, 3.40722247e-04,
1.19256831e-03, -4.67233224e-03, 6.98129571e-03,
2.26311143e-03, -2.69882657e-03, -1.21013327e-04,
-4.69643155e-03, -2.36487464e-03],
[ 3.18188918e-02, 2.66858518e-02, 2.65136069e-02,
-2.87942009e-02, 3.81799688e-02, -1.80512798e-02,
-1.44577123e-02, -8.93810143e-03, 2.07311017e-03,
-3.05108768e-03, -9.82536631e-03, 2.56204428e-05,
-1.47778510e-05, 7.21364274e-03, -7.12081084e-03,
3.89045918e-04, 7.61510404e-03, -2.40885251e-03,
1.08994610e-02, 4.26111050e-03]])
In [9]:
kmeans.inertia_
Out[9]:
26.90743998005269
In [ ]: