In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import neighbors, datasets, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
In [2]:
training = pd.read_csv("training.csv")
In [3]:
training.head()
Out[3]:
ID CNNs CNNs.1 CNNs.2 CNNs.3 CNNs.4 CNNs.5 CNNs.6 CNNs.7 CNNs.8 … GIST.503 GIST.504 GIST.505 GIST.506 GIST.507 GIST.508 GIST.509 GIST.510 GIST.511 prediction
0 1 0.44918 0.00000 0.000000 0.00000 0.0000 0.11052 1.317000 0.0000 1.43430 … 0.005435 0.002507 0.037413 0.047131 0.004175 0.003744 0.037672 0.071448 0.013265 1
1 2 0.00000 0.00000 0.000000 0.00000 0.0000 0.63184 0.000000 1.8388 0.49590 … 0.033147 0.008112 0.004126 0.014677 0.048980 0.011394 0.012629 0.033668 0.048248 1
2 3 0.00000 0.00000 0.000000 0.28339 0.0000 0.74949 0.062996 1.6002 0.74090 … 0.023006 0.025229 0.043951 0.032144 0.018992 0.033419 0.030518 0.022346 0.014875 1
3 4 0.31198 0.24452 0.212100 0.97855 0.0000 1.31980 0.000000 0.0000 0.00000 … 0.033687 0.066910 0.036916 0.029357 0.017351 0.020543 0.015300 0.016477 0.019715 0
4 5 0.00000 0.28560 0.067485 0.00000 0.4414 0.00000 0.539600 0.0000 0.09162 … 0.014306 0.023978 0.019834 0.029528 0.029826 0.027222 0.032496 0.026370 0.028569 0
5 rows × 4610 columns
In [4]:
additional_training = pd.read_csv("additional_training.csv")
In [5]:
additional_training.head()
Out[5]:
ID CNNs CNNs.1 CNNs.2 CNNs.3 CNNs.4 CNNs.5 CNNs.6 CNNs.7 CNNs.8 … GIST.503 GIST.504 GIST.505 GIST.506 GIST.507 GIST.508 GIST.509 GIST.510 GIST.511 prediction
0 457 NaN 1.72630 NaN 0.00000 0.00000 0.79900 0.00000 0.00000 0.00000 … NaN 0.022648 0.043312 0.025739 0.028180 0.032245 0.036754 0.029104 NaN 1
1 458 0.83494 0.00000 0.87363 0.18423 NaN 0.35443 1.25750 0.84533 0.84820 … 0.005451 NaN 0.016365 0.014265 NaN 0.026452 NaN 0.053096 0.057367 0
2 459 NaN NaN 0.00000 0.00000 NaN 0.00000 NaN 0.71016 0.54720 … 0.008683 NaN 0.015751 0.005463 0.005901 0.018561 0.025340 0.005856 0.003299 1
3 460 NaN 0.58965 NaN 0.00000 0.25197 0.00000 0.80329 0.00000 0.23879 … 0.028226 0.004367 0.013654 0.036700 0.029672 0.009894 0.005218 0.037673 0.031507 1
4 461 0.63241 1.09670 0.00000 0.00000 0.00000 NaN 0.12193 0.00000 0.47860 … 0.009443 0.017366 0.037620 NaN 0.007043 0.033402 NaN 0.046515 0.011995 1
5 rows × 4610 columns
In [6]:
annotation_confidence = pd.read_csv("annotation_confidence.csv")
In [7]:
annotation_confidence.head()
Out[7]:
ID confidence
0 1 1.00
1 2 0.66
2 3 1.00
3 4 0.66
4 5 1.00
In [8]:
annotation_confidence.shape
Out[8]:
(4560, 2)
In [9]:
confidence = annotation_confidence.drop("ID", axis=1)["confidence"]
In [10]:
testing = pd.read_csv("testing.csv")
In [11]:
testing.head()
Out[11]:
ID CNNs CNNs.1 CNNs.2 CNNs.3 CNNs.4 CNNs.5 CNNs.6 CNNs.7 CNNs.8 … GIST.502 GIST.503 GIST.504 GIST.505 GIST.506 GIST.507 GIST.508 GIST.509 GIST.510 GIST.511
0 1 0.39450 0.00000 0.0 0.0 0.71368 0.000000 0.0 0.088419 0.00000 … 0.046335 0.059068 0.015266 0.026326 0.069981 0.082463 0.016850 0.026402 0.057077 0.018346
1 2 0.00000 1.51140 0.0 0.0 0.00000 0.000000 0.0 0.000000 0.27181 … 0.039631 0.021535 0.001565 0.018167 0.035155 0.010637 0.009889 0.015632 0.056447 0.021068
2 3 0.00000 0.00000 0.0 0.0 0.00000 0.000000 0.0 0.405780 0.00000 … 0.047777 0.015708 0.005371 0.005871 0.026658 0.018925 0.013183 0.021818 0.039099 0.029165
3 4 0.00000 0.00000 0.0 0.0 0.00000 0.621900 0.0 0.730720 0.00000 … 0.025663 0.019080 0.045263 0.029552 0.032089 0.027852 0.033438 0.019483 0.025319 0.047285
4 5 0.30823 0.27191 0.0 0.0 0.42885 0.096512 0.0 0.000000 0.00000 … 0.088133 0.026220 0.041945 0.035237 0.017721 0.020710 0.011591 0.028428 0.011770 0.020870
5 rows × 4609 columns
In [12]:
all_train = pd.concat([training, additional_training])
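Note that pd.concat keeps both frames' original row labels, so all_train carries duplicated index values. Everything below goes through NumPy arrays, so this is harmless here, but passing ignore_index=True would give a clean 0..4559 index; a minimal alternative:

all_train = pd.concat([training, additional_training], ignore_index=True)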
In [13]:
import numpy as np
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)  # column-wise (axis=0) mean imputation of NaNs
t = all_train.drop(["ID", "prediction"], axis=1)
all_train_rm = imp.fit(t).transform(t)
In [14]:
all_train_rm.shape
Out[14]:
(4560, 4608)
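(Aside: Imputer was deprecated in scikit-learn 0.20 and removed in 0.22; on a newer install the equivalent, as a sketch, is SimpleImputer:)

from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')  # same column-wise mean fill
all_train_rm = imp.fit_transform(t)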
In [15]:
all_train.head()
Out[15]:
ID CNNs CNNs.1 CNNs.2 CNNs.3 CNNs.4 CNNs.5 CNNs.6 CNNs.7 CNNs.8 … GIST.503 GIST.504 GIST.505 GIST.506 GIST.507 GIST.508 GIST.509 GIST.510 GIST.511 prediction
0 1 0.44918 0.00000 0.000000 0.00000 0.0000 0.11052 1.317000 0.0000 1.43430 … 0.005435 0.002507 0.037413 0.047131 0.004175 0.003744 0.037672 0.071448 0.013265 1
1 2 0.00000 0.00000 0.000000 0.00000 0.0000 0.63184 0.000000 1.8388 0.49590 … 0.033147 0.008112 0.004126 0.014677 0.048980 0.011394 0.012629 0.033668 0.048248 1
2 3 0.00000 0.00000 0.000000 0.28339 0.0000 0.74949 0.062996 1.6002 0.74090 … 0.023006 0.025229 0.043951 0.032144 0.018992 0.033419 0.030518 0.022346 0.014875 1
3 4 0.31198 0.24452 0.212100 0.97855 0.0000 1.31980 0.000000 0.0000 0.00000 … 0.033687 0.066910 0.036916 0.029357 0.017351 0.020543 0.015300 0.016477 0.019715 0
4 5 0.00000 0.28560 0.067485 0.00000 0.4414 0.00000 0.539600 0.0000 0.09162 … 0.014306 0.023978 0.019834 0.029528 0.029826 0.027222 0.032496 0.026370 0.028569 0
5 rows × 4610 columns
In [16]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
In [ ]:
In [17]:
model = linear_model.LogisticRegression(C=0.001)
model.fit(all_train_rm, all_train["prediction"])
testing = pd.read_csv("testing.csv")
test_d = testing.drop("ID", axis=1)
test_pred = model.predict(test_d)
output = testing[["ID"]].copy()
output["prediction"] = test_pred
output.to_csv("submission.csv", index=False)
In [18]:
model = linear_model.LogisticRegression(C=0.001)
model.fit(all_train_rm, all_train["prediction"], sample_weight=confidence)
testing = pd.read_csv("testing.csv")
test_d = testing.drop("ID", axis=1)
test_pred = model.predict(test_d)
output = testing[["ID"]].copy()
output["prediction"] = test_pred
output.to_csv("submission2.csv", index=False)
In [19]:
scaler = preprocessing.StandardScaler()
lr = linear_model.LogisticRegression(C=0.001, max_iter=1000, class_weight='balanced', solver='sag')
pca = PCA(n_components=150)
scaler = preprocessing.StandardScaler()
m = Pipeline([('scale', scaler),
              ('pca', pca),
              ('lr', lr)])
m.fit(all_train_rm, all_train["prediction"], lr__sample_weight=confidence)
testing = pd.read_csv("testing.csv")
test_d = testing.drop("ID", axis=1)
test_pred = m.predict(test_d)
output = testing[["ID"]].copy()
output["prediction"] = test_pred
output.to_csv("submission3.csv", index=False)
In [20]:
scaler = preprocessing.StandardScaler()
lr = linear_model.LogisticRegression(C=0.001, max_iter=1000, class_weight='balanced', solver='sag')
pca = PCA(n_components=150)
scaler = preprocessing.StandardScaler()
m = Pipeline([('scale', scaler),
              ('pca', pca),
              ('lr', lr)])
m.fit(all_train_rm, all_train["prediction"], lr__sample_weight=nconf)
testing = pd.read_csv("testing.csv")
test_d = testing.drop("ID", axis=1)
test_pred = m.predict(test_d)
output = testing[["ID"]].copy()
output["prediction"] = test_pred
output.to_csv("submission4.csv", index=False)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
      7               ('lr', lr)])
      8 
----> 9 m.fit(all_train_rm, all_train["prediction"], lr__sample_weight=nconf)
     10 testing = pd.read_csv("testing.csv")
     11 test_d = testing.drop("ID", axis=1)

NameError: name 'nconf' is not defined
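nconf is only defined in a later cell (a reweighted copy of confidence with the first 456 rows doubled); running that definition first resolves this NameError:

nconf = confidence.copy()
nconf[:456] = confidence[:456] * 2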
In [21]:
scaler = preprocessing.StandardScaler()
lr = linear_model.LogisticRegression(C=0.001, max_iter=1000, class_weight='balanced', solver='sag')
pca = PCA(n_components=150)
scaler = preprocessing.StandardScaler()
bagging = BaggingClassifier(lr, n_estimators=20)
m = Pipeline([('scale', scaler),
              ('pca', pca),
              ('bag', bagging)])
m.fit(all_train_rm, all_train["prediction"], bag__sample_weight=confidence)
testing = pd.read_csv("testing.csv")
test_d = testing.drop("ID", axis=1)
test_pred = m.predict(test_d)
output = testing[["ID"]].copy()
output["prediction"] = test_pred
output.to_csv("submission5.csv", index=False)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
      3 pca = PCA(n_components=150)
      4 scaler = preprocessing.StandardScaler()
----> 5 bagging = BaggingClassifier(lr, n_estimators=20)
      6 
      7 m = Pipeline([('scale', scaler),

NameError: name 'BaggingClassifier' is not defined
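BaggingClassifier is imported only in a later cell; the missing import (which also explains the identical failure in the next cell) is:

from sklearn.ensemble import BaggingClassifier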
In [22]:
scaler = preprocessing.StandardScaler()
lr = linear_model.LogisticRegression(C=0.001, max_iter=1000, class_weight='balanced', solver='sag')
pca = PCA(n_components=150)
scaler = preprocessing.StandardScaler()
bagging = BaggingClassifier(lr, n_estimators=20)
m = Pipeline([('scale', scaler),
              ('pca', pca),
              ('bag', bagging)])
m.fit(all_train_rm, all_train["prediction"], bag__sample_weight=nconf)
testing = pd.read_csv("testing.csv")
test_d = testing.drop("ID", axis=1)
test_pred = m.predict(test_d)
output = testing[["ID"]].copy()
output["prediction"] = test_pred
output.to_csv("submission6.csv", index=False)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
      3 pca = PCA(n_components=150)
      4 scaler = preprocessing.StandardScaler()
----> 5 bagging = BaggingClassifier(lr, n_estimators=20)
      6 
      7 m = Pipeline([('scale', scaler),

NameError: name 'BaggingClassifier' is not defined
In [23]:
X = all_train_rm
y = all_train["prediction"].as_matrix()
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=4, random_state=None, shuffle=False)
scores = []
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    m = linear_model.LogisticRegression(C=0.001)
    m.fit(X_train, y_train)
    score = m.score(X_test, y_test)
    scores.append(score)
print(scores)
print(np.mean(scores))
print(np.std(scores))
TRAIN: [1115 1118 1120 …, 4557 4558 4559] TEST: [ 0 1 2 …, 1155 1157 1158]
TRAIN: [ 0 1 2 …, 4557 4558 4559] TEST: [1115 1118 1120 …, 2286 2287 2290]
TRAIN: [ 0 1 2 …, 4557 4558 4559] TEST: [2266 2267 2268 …, 3431 3433 3436]
TRAIN: [ 0 1 2 …, 3431 3433 3436] TEST: [3408 3409 3410 …, 4557 4558 4559]
[0.81403508771929822, 0.82631578947368423, 0.82280701754385965, 0.82192982456140351]
0.821271929825
0.00448892313387
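As an aside, the same 4-fold estimate can be written more compactly with cross_val_score (a sketch, equivalent under the same splitter and the default accuracy scoring):

from sklearn.model_selection import cross_val_score
scores = cross_val_score(linear_model.LogisticRegression(C=0.001), X, y,
                         cv=StratifiedKFold(n_splits=4, shuffle=False))
print(scores.mean(), scores.std())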
In [ ]:
In [ ]:
from sklearn.model_selection import GridSearchCV
parameters = {'C': [0.1, 1, 10, 100, 1000]}
lr = linear_model.LogisticRegression()
gcv = GridSearchCV(lr, parameters)
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_
In [ ]:
In [ ]:
parameters = {'C': [0.01, 0.05, 0.1]}
lr = linear_model.LogisticRegression()
gcv = GridSearchCV(lr, parameters)
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_
In [ ]:
parameters = {'C': [0.0001, 0.001]}
lr = linear_model.LogisticRegression()
gcv = GridSearchCV(lr, parameters)
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_
In [ ]:
parameters = {'C': [0.0005, 0.001, 0.005]}
lr = linear_model.LogisticRegression()
gcv = GridSearchCV(lr, parameters)
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_
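cv_results_ is a verbose dict; after fitting, the winning setting can be read off directly (with the default refit=True):

gcv.best_params_   # e.g. {'C': 0.001}
gcv.best_score_    # mean cross-validated accuracy of that setting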
In [ ]:
annotation_confidence.head()
In [ ]:
confidence[[1,2]]
In [ ]:
# parameters = {'C': [0.001]}
# lr = linear_model.LogisticRegression()
# gcv = GridSearchCV(lr, parameters)
# gcv.fit(all_train_rm, all_train["prediction"])
# gcv.cv_results_
X = all_train_rm
y = all_train["prediction"].as_matrix()
S = annotation_confidence.drop("ID", axis=1)["confidence"]
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=4, random_state=None, shuffle=False)
scores = []
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    m = linear_model.LogisticRegression(C=0.001)
    m.fit(X_train, y_train, sample_weight=S[train_index])
    score = m.score(X_test, y_test, sample_weight=S[test_index])
    scores.append(score)
print(scores)
print(np.mean(scores))
print(np.std(scores))
In [ ]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.pipeline import Pipeline
scaler = preprocessing.StandardScaler()
lr = linear_model.LogisticRegression()
from sklearn.decomposition import PCA
pca = PCA()
pipeline = Pipeline([('scale', scaler),
                     ('pca', pca),
                     ('lr', lr)])
#all_train_rm_scale = scaler.transform(all_train_rm)
# parameters = {'lr__C': [0.0005, 0.001, 0.005]}
parameters = {'lr__C': [0.0001, 0.001, 0.01], 'pca__n_components': [10, 100, 200, 300]}
gcv = GridSearchCV(pipeline, parameters)
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_
In [ ]:
pipeline.get_params().keys()
In [24]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
scaler = preprocessing.StandardScaler()
#lr = linear_model.LogisticRegression()
svc = SVC()
from sklearn.decomposition import PCA
pca = PCA(n_components=200)
pipeline = Pipeline([('scale', scaler),
#                    ('pca', pca),
                     ('svc', svc)])
#all_train_rm_scale = scaler.transform(all_train_rm)
# parameters = {'lr__C': [0.0005, 0.001, 0.005]}
parameters = {'svc__C': [0.1]}
gcv = GridSearchCV(pipeline, parameters)
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
     19 
     20 gcv = GridSearchCV(pipeline, parameters)
---> 21 gcv.fit(all_train_rm, all_train["prediction"])
     22 gcv.cv_results_

//anaconda/lib/python3.5/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
    640 
    641         # if one choose to see train score, "out" will contain train score info

//anaconda/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    777                 # was dispatched. In particular this covers the edge
    778                 # case of Parallel used with an exhausted iterator.
--> 779                 while self.dispatch_one_batch(iterator):
    780                     self._iterating = True
    781             else:

//anaconda/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    623             return False
    624         else:
--> 625             self._dispatch(tasks)
    626             return True
    627 

//anaconda/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590 

//anaconda/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

//anaconda/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):

//anaconda/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

//anaconda/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

//anaconda/lib/python3.5/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    456             estimator.fit(X_train, **fit_params)
    457         else:
--> 458             estimator.fit(X_train, y_train, **fit_params)
    459 
    460     except Exception as e:

//anaconda/lib/python3.5/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    248         Xt, fit_params = self._fit(X, y, **fit_params)
    249         if self._final_estimator is not None:
--> 250             self._final_estimator.fit(Xt, y, **fit_params)
    251         return self
    252 

//anaconda/lib/python3.5/site-packages/sklearn/svm/base.py in fit(self, X, y, sample_weight)
    185 
    186         seed = rnd.randint(np.iinfo('i').max)
--> 187         fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
    188         # see comment on the other call to np.iinfo in this file
    189 

//anaconda/lib/python3.5/site-packages/sklearn/svm/base.py in _dense_fit(self, X, y, sample_weight, solver_type, kernel, random_seed)
    252                 cache_size=self.cache_size, coef0=self.coef0,
    253                 gamma=self._gamma, epsilon=self.epsilon,
--> 254                 max_iter=self.max_iter, random_seed=random_seed)
    255 
    256         self._warn_from_fit_status()

KeyboardInterrupt:
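The RBF-kernel SVC scales roughly quadratically in the 4560 samples and had to be interrupted. A sketch of a tractable substitute using LinearSVC (a different model: linear kernel rather than the RBF default above):

from sklearn.svm import LinearSVC
pipeline = Pipeline([('scale', scaler),
                     ('svc', LinearSVC(C=0.1))])
gcv = GridSearchCV(pipeline, {})
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_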
In [ ]:
from sklearn.ensemble import RandomForestClassifier
pca = PCA(n_components=30)
scaler = preprocessing.StandardScaler()
rf = RandomForestClassifier(n_estimators=10, min_samples_leaf=20)
# pipeline = Pipeline([('scale', scaler),
#                      ('pca', pca),
#                      ('rf', rf)])
pipeline = Pipeline([
    ('pca', pca),
    ('rf', rf)])
parameters = {}
gcv = GridSearchCV(pipeline, parameters)
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_
In [ ]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
scaler = preprocessing.StandardScaler()
lr = linear_model.LogisticRegression()
pca = PCA()
pipeline = Pipeline([('scale', scaler),
                     ('pca', pca),
                     ('lr', lr)])
#all_train_rm_scale = scaler.transform(all_train_rm)
# parameters = {'lr__C': [0.0005, 0.001, 0.005]}
parameters = {'lr__C': [0.001], 'pca__n_components': [150, 200, 250]}
gcv = GridSearchCV(pipeline, parameters)
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_
In [ ]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
scaler = preprocessing.StandardScaler()
lr = linear_model.LogisticRegression()
pca = PCA()
pipeline = Pipeline([('scale', scaler),
                     ('pca', pca),
                     ('lr', lr)])
#all_train_rm_scale = scaler.transform(all_train_rm)
# parameters = {'lr__C': [0.0005, 0.001, 0.005]}
parameters = {'lr__C': [0.001], 'pca__n_components': [150], 'lr__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
gcv = GridSearchCV(pipeline, parameters)
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_
In [ ]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
scaler = preprocessing.StandardScaler()
lr = linear_model.LogisticRegression(max_iter=1000)
pca = PCA()
pipeline = Pipeline([('scale', scaler),
                     ('pca', pca),
                     ('lr', lr)])
#all_train_rm_scale = scaler.transform(all_train_rm)
# parameters = {'lr__C': [0.0005, 0.001, 0.005]}
parameters = {'lr__C': [0.0001, 0.005, 0.001], 'pca__n_components': [150], 'lr__solver': ['sag']}
gcv = GridSearchCV(pipeline, parameters)
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_
In [ ]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
scaler = preprocessing.StandardScaler()
lr = linear_model.LogisticRegression(max_iter=1000, class_weight='balanced')
pca = PCA()
pipeline = Pipeline([('scale', scaler),
                     ('pca', pca),
                     ('lr', lr)])
#all_train_rm_scale = scaler.transform(all_train_rm)
# parameters = {'lr__C': [0.0005, 0.001, 0.005]}
parameters = {'lr__C': [0.001], 'pca__n_components': [150], 'lr__solver': ['sag']}
gcv = GridSearchCV(pipeline, parameters)
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_
In [ ]:
def train_and_predict(random_state):
    # fits the scale/PCA/logistic-regression pipeline for a given seed;
    # relies on X_train, y_train, train_index, confidence from the surrounding CV scope
    scaler = preprocessing.StandardScaler()
    lr = linear_model.LogisticRegression(C=0.001, max_iter=1000, class_weight='balanced',
                                         solver='sag', random_state=random_state)
    pca = PCA(n_components=150)
    m = Pipeline([('scale', scaler),
                  ('pca', pca),
                  ('lr', lr)])
    m.fit(X_train, y_train, lr__sample_weight=confidence[train_index])
    return m
In [ ]:
nconf = confidence.copy()
nconf[:456] = confidence[:456] * 2  # double the weight of the first 456 rows (the original training.csv examples)
nconf
In [ ]:
X = all_train_rm
y = all_train["prediction"].as_matrix()
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=4, random_state=None, shuffle=False)
scores = []
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # m = linear_model.LogisticRegression(C=0.001, class_weight='balanced', solver='sag')
    scaler = preprocessing.StandardScaler()
    lr = linear_model.LogisticRegression(C=0.001, max_iter=1000, class_weight='balanced', solver='sag', random_state=0)
    pca = PCA(n_components=150)
    bagging = BaggingClassifier(lr, n_estimators=20, random_state=0)
    m = Pipeline([('scale', scaler),
                  ('pca', pca),
                  ('bag', bagging)])
    # m = Pipeline([('scale', scaler),
    #               ('pca', pca),
    #               ('lr', lr)])
    m.fit(X_train, y_train, bag__sample_weight=confidence[train_index])
    score = m.score(X_test, y_test)
    scores.append(score)
print(scores)
print(np.mean(scores))
print(np.std(scores))
In [ ]:
X = all_train_rm
y = all_train["prediction"].as_matrix()
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=4, random_state=None, shuffle=False)
scores = []
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # m = linear_model.LogisticRegression(C=0.001, class_weight='balanced', solver='sag')
    scaler = preprocessing.StandardScaler()
    lr = linear_model.LogisticRegression(C=0.001, max_iter=1000, class_weight='balanced', solver='sag')
    pca = PCA(n_components=150)
    bagging = BaggingClassifier(lr, n_estimators=20, random_state=9999)
    m = Pipeline([('scale', scaler),
                  ('pca', pca),
                  ('bag', bagging)])
    # m = Pipeline([('scale', scaler),
    #               ('pca', pca),
    #               ('lr', lr)])
    m.fit(X_train, y_train, bag__sample_weight=confidence[train_index])
    score = m.score(X_test, y_test)
    scores.append(score)
print(scores)
print(np.mean(scores))
print(np.std(scores))
In [ ]:
X = all_train_rm
y = all_train["prediction"].as_matrix()
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=4, random_state=None, shuffle=False)
scores = []
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # m = linear_model.LogisticRegression(C=0.001, class_weight='balanced', solver='sag')
    scaler = preprocessing.StandardScaler()
    lr = linear_model.LogisticRegression(C=0.001, max_iter=1000, class_weight='balanced', solver='sag')
    pca = PCA(n_components=150)
    bagging = BaggingClassifier(lr, n_estimators=20)
    m = Pipeline([('scale', scaler),
                  ('pca', pca),
                  ('bag', bagging)])
    # m = Pipeline([('scale', scaler),
    #               ('pca', pca),
    #               ('lr', lr)])
    m.fit(X_train, y_train, bag__sample_weight=nconf[train_index])
    score = m.score(X_test, y_test)
    scores.append(score)
print(scores)
print(np.mean(scores))
print(np.std(scores))
In [ ]:
X = all_train_rm
y = all_train["prediction"].as_matrix()
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=4, random_state=None, shuffle=False)
scores = []
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # m = linear_model.LogisticRegression(C=0.001, class_weight='balanced', solver='sag')
    scaler = preprocessing.StandardScaler()
    lr = linear_model.LogisticRegression(C=0.001, max_iter=1000, class_weight='balanced', solver='sag')
    pca = PCA(n_components=150)
    m = Pipeline([('scale', scaler),
                  ('pca', pca),
                  ('lr', lr)])
    m.fit(X_train, y_train, lr__sample_weight=confidence[train_index])
    score = m.score(X_test, y_test)
    scores.append(score)
print(scores)
print(np.mean(scores))
print(np.std(scores))
In [ ]:
X = all_train_rm
y = all_train["prediction"].as_matrix()
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=4, random_state=None, shuffle=False)
scores = []
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # m = linear_model.LogisticRegression(C=0.001, class_weight='balanced', solver='sag')
    scaler = preprocessing.StandardScaler()
    lr = linear_model.LogisticRegression(max_iter=1000, class_weight='balanced', solver='sag')
    pca = PCA(n_components=150)
    m = Pipeline([('scale', scaler),
                  ('pca', pca),
                  ('lr', lr)])
    m.fit(X_train, y_train, lr__sample_weight=confidence[train_index])
    score = m.score(X_test, y_test)
    scores.append(score)
print(scores)
print(np.mean(scores))
print(np.std(scores))
In [ ]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import Pipeline
scaler = preprocessing.StandardScaler()
from sklearn.decomposition import PCA
pca = PCA()
# pipeline = Pipeline([('scale', scaler),
#                      ('pca', pca),
#                      ('lr', lr)])
lr = linear_model.LogisticRegression(C=0.001)
bagging = BaggingClassifier(lr,
                            max_samples=0.7, max_features=0.7)
pipeline = Pipeline([('scale', scaler),
                     ('pca', pca),
                     ('bag', bagging)])
#all_train_rm_scale = scaler.transform(all_train_rm)
# parameters = {'lr__C': [0.0005, 0.001, 0.005]}
#parameters = {'lr__C': [0.001], 'pca__n_components': [200]}
parameters = {}
gcv = GridSearchCV(pipeline, parameters)
gcv.fit(all_train_rm, all_train["prediction"])
gcv.cv_results_
In [ ]:
# parameters = {'C': [0.001]}
# lr = linear_model.LogisticRegression()
# gcv = GridSearchCV(lr, parameters)
# gcv.fit(all_train_rm, all_train["prediction"])
# gcv.cv_results_
from sklearn.model_selection import StratifiedKFold
X = all_train_rm
y = all_train["prediction"].as_matrix()
S = annotation_confidence.drop("ID", axis=1)["confidence"]
skf = StratifiedKFold(n_splits=4, random_state=None, shuffle=False)
scores = []
scaler = preprocessing.StandardScaler()
lr = linear_model.LogisticRegression()
pca = PCA(n_components=200)
pipeline = Pipeline([('scale', scaler),
                     ('pca', pca),
                     ('lr', lr)])
#all_train_rm_scale = scaler.transform(all_train_rm)
# parameters = {'lr__C': [0.0005, 0.001, 0.005]}
parameters = {'lr__C': [0.001], 'pca__n_components': [200]}
# note: the pipeline/parameters above are leftovers and unused; the loop below fits a plain LogisticRegression
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    m = linear_model.LogisticRegression(C=0.001)
    m.fit(X_train, y_train, sample_weight=S[train_index])
    score = m.score(X_test, y_test)
    scores.append(score)
print(scores)
print(np.mean(scores))
print(np.std(scores))
In [ ]: