stock-pred
In [1]:
import pandas as pd
import numpy as np
from scipy import interp
import matplotlib.pyplot as plt
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn import linear_model
from sklearn import svm
from sklearn.metrics import roc_curve, auc
from sklearn import preprocessing
import datetime
In [2]:
# Load the labelled training file and the file that will be scored.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk; it is the remedy suggested by the DtypeWarning
# this cell emitted for its mixed-type columns.
train = pd.read_csv('TrainingData.csv', low_memory=False)
test = pd.read_csv('ResultData.csv', low_memory=False)
/Users/vagrant/anaconda42/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (1,2,3,4) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
In [3]:
train
Out[3]:
Timestamp Variable142OPEN Variable142HIGH Variable142LOW Variable142LAST Variable143OPEN Variable143HIGH Variable143LOW Variable143LAST Variable144OPEN … Variable137LOW Variable137LAST_PRICE Variable139OPEN Variable139HIGH Variable139LOW Variable139LAST_PRICE Variable141OPEN Variable141HIGH Variable141LOW Variable141LAST_PRICE
0 40182.395833 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.061874 21.206487 41.041731 42.338085 41.041731 42.108253 2.133044 2.174362 2.117550 2.117550
1 40182.399306 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.206487 21.258134 42.108253 42.108253 41.881004 42.033364 2.117550 2.122715 2.117550 2.117550
2 40182.402778 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.268464 21.268464 42.033364 42.260614 41.803533 41.881004 2.122715 2.122715 2.112385 2.112385
3 40182.406250 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.268464 21.309782 42.033364 42.185725 42.033364 42.108253 2.122715 2.122715 2.099473 2.099473
4 40182.409722 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.278794 21.309782 42.108253 42.185725 42.033364 42.108253 2.099473 2.099473 2.094308 2.099473
5 40182.413194 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.309782 21.309782 42.033364 42.033364 41.881004 41.881004 2.099473 2.099473 2.089144 2.099473
6 40182.416667 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.278794 21.278794 41.803533 41.881004 41.728644 41.728644 2.089144 2.089144 2.083979 2.083979
7 40182.420139 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.309782 21.351100 41.728644 41.728644 41.651172 41.728644 2.083979 2.089144 2.083979 2.083979
8 40182.423611 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.351100 21.392418 41.728644 41.728644 41.651172 41.651172 2.083979 2.104638 2.083979 2.099473
9 40182.427083 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.330441 21.340771 41.651172 41.728644 41.498812 41.576283 2.099473 2.104638 2.099473 2.104638
10 40182.430556 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.340771 21.351100 41.576283 41.576283 41.423923 41.498812 2.104638 2.104638 2.099473 2.099473
11 40182.434028 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.351100 21.382089 41.423923 41.498812 41.423923 41.498812 2.099473 2.104638 2.099473 2.104638
12 40182.437500 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.382089 21.382089 41.423923 41.498812 41.423923 41.498812 2.107220 2.117550 2.107220 2.112385
13 40182.440972 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.382089 21.382089 41.498812 41.576283 41.498812 41.498812 2.112385 2.122715 2.112385 2.117550
14 40182.444444 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.371759 21.382089 41.498812 41.576283 41.498812 41.576283 2.117550 2.138209 2.117550 2.138209
15 40182.447917 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.371759 21.371759 41.576283 41.576283 41.498812 41.576283 2.143374 2.143374 2.133044 2.133044
16 40182.451389 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.361430 21.361430 41.576283 41.576283 41.576283 41.576283 2.133044 2.148538 2.133044 2.148538
17 40182.454861 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.361430 21.371759 41.576283 41.576283 41.498812 41.498812 2.148538 2.164033 2.143374 2.148538
18 40182.458333 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.340771 21.340771 41.498812 41.576283 41.498812 41.576283 2.148538 2.148538 2.148538 2.148538
19 40182.461806 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.320112 21.320112 41.576283 41.576283 41.576283 41.576283 2.148538 2.164033 2.148538 2.164033
20 40182.465278 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.299453 21.299453 41.576283 41.576283 41.498812 41.576283 2.164033 2.169197 2.164033 2.169197
21 40182.468750 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.268464 21.268464 41.498812 41.576283 41.498812 41.576283 2.169197 2.169197 2.169197 2.169197
22 40182.472222 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.268464 21.268464 41.423923 41.423923 41.346452 41.423923 2.169197 2.169197 2.164033 2.169197
23 40182.475694 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.185828 21.196157 41.423923 41.423923 41.346452 41.423923 2.169197 2.169197 2.169197 2.169197
24 40182.479167 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.185828 21.196157 41.423923 41.423923 41.271563 41.271563 2.169197 2.169197 2.169197 2.169197
25 40182.482639 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.165169 21.165169 41.271563 41.271563 41.194092 41.194092 2.169197 2.169197 2.164033 2.164033
26 40182.486111 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.165169 21.175498 41.194092 41.271563 41.194092 41.271563 2.164033 2.169197 2.164033 2.169197
27 40182.489583 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.154839 21.154839 41.271563 41.271563 41.271563 41.271563 2.169197 2.169197 2.169197 2.169197
28 40182.493056 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.154839 21.154839 41.271563 41.271563 41.194092 41.271563 2.169197 2.169197 2.169197 2.169197
29 40182.496528 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 21.134180 21.134180 41.271563 41.271563 41.194092 41.271563 2.169197 2.174362 2.169197 2.174362
… … … … … … … … … … … … … … … … … … … … … …
5892 40289.555556 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.662328 23.662328 47.363392 47.363392 47.285921 47.363392 3.767689 3.778019 3.762525 3.762525
5893 40289.559028 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.662328 23.693317 47.363392 47.363392 47.285921 47.363392 3.762525 3.772854 3.762525 3.772854
5894 40289.562500 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.713976 23.765623 47.363392 47.363392 47.285921 47.363392 3.772854 3.783184 3.772854 3.783184
5895 40289.565972 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.734635 23.734635 47.363392 47.515753 47.363392 47.515753 3.783184 3.783184 3.772854 3.778019
5896 40289.569444 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.734635 23.755294 47.515753 47.515753 47.438281 47.515753 3.778019 3.783184 3.772854 3.778019
5897 40289.572917 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.755294 23.817271 47.515753 47.668113 47.363392 47.668113 3.778019 3.793513 3.778019 3.783184
5898 40289.576389 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.817271 23.848259 47.668113 47.668113 47.590641 47.668113 3.783184 3.788348 3.783184 3.788348
5899 40289.579861 0 0 0 0 0.0 0.0 0.0 0.0 0.0 … 23.837930 23.848259 47.668113 47.743002 47.590641 47.743002 3.788348 3.793513 3.788348 3.793513
5900 40289.583333 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.817271 23.817271 47.743002 47.743002 47.668113 47.668113 3.793513 3.793513 3.783184 3.783184
5901 40289.586806 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.817271 23.817271 47.668113 47.743002 47.668113 47.668113 3.783184 3.783184 3.772854 3.772854
5902 40289.590278 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.796612 23.848259 47.668113 47.743002 47.668113 47.743002 3.772854 3.778019 3.767689 3.767689
5903 40289.593750 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.786282 23.806941 47.743002 47.743002 47.590641 47.590641 3.767689 3.772854 3.747030 3.747030
5904 40289.597222 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.744964 23.755294 47.668113 47.668113 47.438281 47.438281 3.747030 3.747030 3.736701 3.736701
5905 40289.600694 NaN NaN NaN NaN 0.0 0.0 0.0 0.0 NaN … 23.682987 23.682987 47.438281 47.438281 47.363392 47.363392 3.736701 3.736701 3.726371 3.731536
5906 40289.604167 0 0 0 0 0.0 0.0 0.0 0.0 NaN … 23.651999 23.662328 47.363392 47.363392 47.211032 47.285921 3.731536 3.741866 3.726371 3.741866
5907 40289.607639 NaN NaN NaN NaN NaN NaN NaN NaN 0.0 … 23.610681 23.641669 47.211032 47.285921 47.211032 47.211032 3.731536 3.731536 3.721206 3.726371
5908 40289.611111 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.641669 23.672658 47.211032 47.211032 47.133561 47.133561 3.726371 3.741866 3.726371 3.736701
5909 40289.614583 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.631340 23.713976 47.058672 47.133561 46.981200 46.981200 3.736701 3.741866 3.731536 3.741866
5910 40289.618056 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.713976 23.713976 46.981200 46.981200 46.906311 46.906311 3.741866 3.762525 3.741866 3.762525
5911 40289.621528 0 0 0 0 0.0 0.0 0.0 0.0 0.0 … 23.713976 23.786282 46.906311 46.906311 46.906311 46.906311 3.762525 3.767689 3.757360 3.767689
5912 40289.625000 NaN NaN NaN NaN 0.0 0.0 0.0 0.0 NaN … 23.755294 23.755294 47.133561 47.438281 47.133561 47.285921 3.767689 3.778019 3.757360 3.757360
5913 40289.628472 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.744964 23.765623 47.285921 47.438281 47.285921 47.438281 3.762525 3.767689 3.762525 3.767689
5914 40289.631944 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.662328 23.682987 47.438281 47.438281 47.363392 47.363392 3.767689 3.767689 3.762525 3.767689
5915 40289.635417 0 0 0 0 NaN NaN NaN NaN NaN … 23.682987 23.703646 47.438281 47.438281 47.363392 47.438281 3.767689 3.788348 3.767689 3.783184
5916 40289.638889 0 0 0 0 NaN NaN NaN NaN 0.0 … 23.693317 23.703646 47.438281 47.438281 47.363392 47.363392 3.783184 3.783184 3.762525 3.767689
5917 40289.642361 NaN NaN NaN NaN NaN NaN NaN NaN 0.0 … 23.682987 23.693317 47.363392 47.438281 47.285921 47.363392 3.767689 3.778019 3.767689 3.778019
5918 40289.645833 0 0 0 0 0.0 0.0 0.0 0.0 0.0 … 23.693317 23.744964 47.363392 47.515753 47.363392 47.438281 3.778019 3.783184 3.772854 3.783184
5919 40289.649306 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.755294 23.796612 47.515753 47.590641 47.363392 47.363392 3.788348 3.788348 3.783184 3.788348
5920 40289.652778 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.775953 23.920566 47.363392 47.438281 47.363392 47.438281 3.788348 3.793513 3.788348 3.793513
5921 40289.656250 NaN NaN NaN NaN NaN NaN NaN NaN NaN … 23.941225 23.941225 47.438281 47.438281 47.363392 47.363392 3.793513 3.793513 3.783184 3.783184
5922 rows × 610 columns
In [4]:
# Per-column flags: True where the column contains no missing value at all.
allExist = train.notnull().all()
testAllExist = test.notnull().all()

# Keep, in the test file's column order, only the columns that are fully
# populated in both files.
# NOTE(review): 'Timestamp' passes this filter (it is the first entry of the
# `cols` listing), so it is later fed to the models as a feature -- confirm
# that this is intended.
cols = [colname for colname in test.columns
        if allExist[colname] and testAllExist[colname]]
In [25]:
allExist
Out[25]:
Timestamp True
Variable142OPEN False
Variable142HIGH False
Variable142LOW False
Variable142LAST False
Variable143OPEN False
Variable143HIGH False
Variable143LOW False
Variable143LAST False
Variable144OPEN False
Variable144HIGH False
Variable144LOW False
Variable144LAST False
Variable145OPEN False
Variable145HIGH False
Variable145LOW False
Variable145LAST False
Variable146OPEN False
Variable146HIGH False
Variable146LOW False
Variable146LAST False
Variable147OPEN False
Variable147HIGH False
Variable147LOW False
Variable147LAST False
Variable148OPEN False
Variable148HIGH False
Variable148LOW False
Variable148LAST False
Variable149OPEN False
…
Variable127LOW True
Variable127LAST_PRICE True
Variable129OPEN True
Variable129HIGH True
Variable129LOW True
Variable129LAST_PRICE True
Variable130OPEN True
Variable130HIGH True
Variable130LOW True
Variable130LAST_PRICE True
Variable133OPEN True
Variable133HIGH True
Variable133LOW True
Variable133LAST_PRICE True
Variable136OPEN True
Variable136HIGH True
Variable136LOW True
Variable136LAST_PRICE True
Variable137OPEN True
Variable137HIGH True
Variable137LOW True
Variable137LAST_PRICE True
Variable139OPEN True
Variable139HIGH True
Variable139LOW True
Variable139LAST_PRICE True
Variable141OPEN True
Variable141HIGH True
Variable141LOW True
Variable141LAST_PRICE True
dtype: bool
In [26]:
testAllExist
Out[26]:
Timestamp True
Variable142OPEN False
Variable142HIGH False
Variable142LOW False
Variable142LAST False
Variable143OPEN False
Variable143HIGH False
Variable143LOW False
Variable143LAST False
Variable144OPEN False
Variable144HIGH False
Variable144LOW False
Variable144LAST False
Variable145OPEN False
Variable145HIGH False
Variable145LOW False
Variable145LAST False
Variable146OPEN False
Variable146HIGH False
Variable146LOW False
Variable146LAST False
Variable147OPEN False
Variable147HIGH False
Variable147LOW False
Variable147LAST False
Variable148OPEN False
Variable148HIGH False
Variable148LOW False
Variable148LAST False
Variable149OPEN False
…
Variable127LOW True
Variable127LAST_PRICE True
Variable129OPEN True
Variable129HIGH True
Variable129LOW True
Variable129LAST_PRICE True
Variable130OPEN True
Variable130HIGH True
Variable130LOW True
Variable130LAST_PRICE True
Variable133OPEN True
Variable133HIGH True
Variable133LOW True
Variable133LAST_PRICE True
Variable136OPEN True
Variable136HIGH True
Variable136LOW True
Variable136LAST_PRICE True
Variable137OPEN True
Variable137HIGH True
Variable137LOW True
Variable137LAST_PRICE True
Variable139OPEN True
Variable139HIGH True
Variable139LOW True
Variable139LAST_PRICE True
Variable141OPEN True
Variable141HIGH True
Variable141LOW True
Variable141LAST_PRICE True
dtype: bool
In [5]:
cols
Out[5]:
[‘Timestamp’,
‘Variable159OPEN’,
‘Variable159HIGH’,
‘Variable159LOW’,
‘Variable159LAST’,
‘Variable164OPEN’,
‘Variable164HIGH’,
‘Variable164LOW’,
‘Variable164LAST’,
‘Variable8OPEN’,
‘Variable8HIGH’,
‘Variable8LOW’,
‘Variable8LAST_PRICE’,
‘Variable9OPEN’,
‘Variable9HIGH’,
‘Variable9LOW’,
‘Variable9LAST_PRICE’,
‘Variable10OPEN’,
‘Variable10HIGH’,
‘Variable10LOW’,
‘Variable10LAST_PRICE’,
‘Variable11OPEN’,
‘Variable11HIGH’,
‘Variable11LOW’,
‘Variable11LAST_PRICE’,
‘Variable12OPEN’,
‘Variable12HIGH’,
‘Variable12LOW’,
‘Variable12LAST_PRICE’,
‘Variable13OPEN’,
‘Variable13HIGH’,
‘Variable13LOW’,
‘Variable13LAST_PRICE’,
‘Variable14OPEN’,
‘Variable14HIGH’,
‘Variable14LOW’,
‘Variable14LAST_PRICE’,
‘Variable15OPEN’,
‘Variable15HIGH’,
‘Variable15LOW’,
‘Variable15LAST_PRICE’,
‘Variable17OPEN’,
‘Variable17HIGH’,
‘Variable17LOW’,
‘Variable17LAST_PRICE’,
‘Variable18OPEN’,
‘Variable18HIGH’,
‘Variable18LOW’,
‘Variable18LAST_PRICE’,
‘Variable19OPEN’,
‘Variable19HIGH’,
‘Variable19LOW’,
‘Variable19LAST_PRICE’,
‘Variable20OPEN’,
‘Variable20HIGH’,
‘Variable20LOW’,
‘Variable20LAST_PRICE’,
‘Variable21OPEN’,
‘Variable21HIGH’,
‘Variable21LOW’,
‘Variable21LAST_PRICE’,
‘Variable22OPEN’,
‘Variable22HIGH’,
‘Variable22LOW’,
‘Variable22LAST_PRICE’,
‘Variable23OPEN’,
‘Variable23HIGH’,
‘Variable23LOW’,
‘Variable23LAST_PRICE’,
‘Variable24OPEN’,
‘Variable24HIGH’,
‘Variable24LOW’,
‘Variable24LAST_PRICE’,
‘Variable25OPEN’,
‘Variable25HIGH’,
‘Variable25LOW’,
‘Variable25LAST_PRICE’,
‘Variable26OPEN’,
‘Variable26HIGH’,
‘Variable26LOW’,
‘Variable26LAST_PRICE’,
‘Variable27OPEN’,
‘Variable27HIGH’,
‘Variable27LOW’,
‘Variable27LAST_PRICE’,
‘Variable28OPEN’,
‘Variable28HIGH’,
‘Variable28LOW’,
‘Variable28LAST_PRICE’,
‘Variable29OPEN’,
‘Variable29HIGH’,
‘Variable29LOW’,
‘Variable29LAST_PRICE’,
‘Variable30OPEN’,
‘Variable30HIGH’,
‘Variable30LOW’,
‘Variable30LAST_PRICE’,
‘Variable31OPEN’,
‘Variable31HIGH’,
‘Variable31LOW’,
‘Variable31LAST_PRICE’,
‘Variable32OPEN’,
‘Variable32HIGH’,
‘Variable32LOW’,
‘Variable32LAST_PRICE’,
‘Variable33OPEN’,
‘Variable33HIGH’,
‘Variable33LOW’,
‘Variable33LAST_PRICE’,
‘Variable34OPEN’,
‘Variable34HIGH’,
‘Variable34LOW’,
‘Variable34LAST_PRICE’,
‘Variable35OPEN’,
‘Variable35HIGH’,
‘Variable35LOW’,
‘Variable35LAST_PRICE’,
‘Variable36OPEN’,
‘Variable36HIGH’,
‘Variable36LOW’,
‘Variable36LAST_PRICE’,
‘Variable37OPEN’,
‘Variable37HIGH’,
‘Variable37LOW’,
‘Variable37LAST_PRICE’,
‘Variable38OPEN’,
‘Variable38HIGH’,
‘Variable38LOW’,
‘Variable38LAST_PRICE’,
‘Variable40OPEN’,
‘Variable40HIGH’,
‘Variable40LOW’,
‘Variable40LAST_PRICE’,
‘Variable41OPEN’,
‘Variable41HIGH’,
‘Variable41LOW’,
‘Variable41LAST_PRICE’,
‘Variable42OPEN’,
‘Variable42HIGH’,
‘Variable42LOW’,
‘Variable42LAST_PRICE’,
‘Variable43OPEN’,
‘Variable43HIGH’,
‘Variable43LOW’,
‘Variable43LAST_PRICE’,
‘Variable44OPEN’,
‘Variable44HIGH’,
‘Variable44LOW’,
‘Variable44LAST_PRICE’,
‘Variable45OPEN’,
‘Variable45HIGH’,
‘Variable45LOW’,
‘Variable45LAST_PRICE’,
‘Variable46OPEN’,
‘Variable46HIGH’,
‘Variable46LOW’,
‘Variable46LAST_PRICE’,
‘Variable47OPEN’,
‘Variable47HIGH’,
‘Variable47LOW’,
‘Variable47LAST_PRICE’,
‘Variable48OPEN’,
‘Variable48HIGH’,
‘Variable48LOW’,
‘Variable48LAST_PRICE’,
‘Variable49OPEN’,
‘Variable49HIGH’,
‘Variable49LOW’,
‘Variable49LAST_PRICE’,
‘Variable50OPEN’,
‘Variable50HIGH’,
‘Variable50LOW’,
‘Variable50LAST_PRICE’,
‘Variable51OPEN’,
‘Variable51HIGH’,
‘Variable51LOW’,
‘Variable51LAST_PRICE’,
‘Variable52OPEN’,
‘Variable52HIGH’,
‘Variable52LOW’,
‘Variable52LAST_PRICE’,
‘Variable53OPEN’,
‘Variable53HIGH’,
‘Variable53LOW’,
‘Variable53LAST_PRICE’,
‘Variable54OPEN’,
‘Variable54HIGH’,
‘Variable54LOW’,
‘Variable54LAST_PRICE’,
‘Variable55OPEN’,
‘Variable55HIGH’,
‘Variable55LOW’,
‘Variable55LAST_PRICE’,
‘Variable56OPEN’,
‘Variable56HIGH’,
‘Variable56LOW’,
‘Variable56LAST_PRICE’,
‘Variable57OPEN’,
‘Variable57HIGH’,
‘Variable57LOW’,
‘Variable57LAST_PRICE’,
‘Variable58OPEN’,
‘Variable58HIGH’,
‘Variable58LOW’,
‘Variable58LAST_PRICE’,
‘Variable59OPEN’,
‘Variable59HIGH’,
‘Variable59LOW’,
‘Variable59LAST_PRICE’,
‘Variable60OPEN’,
‘Variable60HIGH’,
‘Variable60LOW’,
‘Variable60LAST_PRICE’,
‘Variable61OPEN’,
‘Variable61HIGH’,
‘Variable61LOW’,
‘Variable61LAST_PRICE’,
‘Variable62OPEN’,
‘Variable62HIGH’,
‘Variable62LOW’,
‘Variable62LAST_PRICE’,
‘Variable63OPEN’,
‘Variable63HIGH’,
‘Variable63LOW’,
‘Variable63LAST_PRICE’,
‘Variable64OPEN’,
‘Variable64HIGH’,
‘Variable64LOW’,
‘Variable64LAST_PRICE’,
‘Variable65OPEN’,
‘Variable65HIGH’,
‘Variable65LOW’,
‘Variable65LAST_PRICE’,
‘Variable68OPEN’,
‘Variable68HIGH’,
‘Variable68LOW’,
‘Variable68LAST_PRICE’,
‘Variable69OPEN’,
‘Variable69HIGH’,
‘Variable69LOW’,
‘Variable69LAST_PRICE’,
‘Variable70OPEN’,
‘Variable70HIGH’,
‘Variable70LOW’,
‘Variable70LAST_PRICE’,
‘Variable71OPEN’,
‘Variable71HIGH’,
‘Variable71LOW’,
‘Variable71LAST_PRICE’,
‘Variable72OPEN’,
‘Variable72HIGH’,
‘Variable72LOW’,
‘Variable72LAST_PRICE’,
‘Variable73OPEN’,
‘Variable73HIGH’,
‘Variable73LOW’,
‘Variable73LAST_PRICE’,
‘Variable74OPEN’,
‘Variable74HIGH’,
‘Variable74LOW’,
‘Variable74LAST_PRICE’,
‘Variable76OPEN’,
‘Variable76HIGH’,
‘Variable76LOW’,
‘Variable76LAST_PRICE’,
‘Variable77OPEN’,
‘Variable77HIGH’,
‘Variable77LOW’,
‘Variable77LAST_PRICE’,
‘Variable78OPEN’,
‘Variable78HIGH’,
‘Variable78LOW’,
‘Variable78LAST_PRICE’,
‘Variable79OPEN’,
‘Variable79HIGH’,
‘Variable79LOW’,
‘Variable79LAST_PRICE’,
‘Variable80OPEN’,
‘Variable80HIGH’,
‘Variable80LOW’,
‘Variable80LAST_PRICE’,
‘Variable81OPEN’,
‘Variable81HIGH’,
‘Variable81LOW’,
‘Variable81LAST_PRICE’,
‘Variable82OPEN’,
‘Variable82HIGH’,
‘Variable82LOW’,
‘Variable82LAST_PRICE’,
‘Variable83OPEN’,
‘Variable83HIGH’,
‘Variable83LOW’,
‘Variable83LAST_PRICE’,
‘Variable85OPEN’,
‘Variable85HIGH’,
‘Variable85LOW’,
‘Variable85LAST_PRICE’,
‘Variable86OPEN’,
‘Variable86HIGH’,
‘Variable86LOW’,
‘Variable86LAST_PRICE’,
‘Variable87OPEN’,
‘Variable87HIGH’,
‘Variable87LOW’,
‘Variable87LAST_PRICE’,
‘Variable88OPEN’,
‘Variable88HIGH’,
‘Variable88LOW’,
‘Variable88LAST_PRICE’,
‘Variable89OPEN’,
‘Variable89HIGH’,
‘Variable89LOW’,
‘Variable89LAST_PRICE’,
‘Variable90OPEN’,
‘Variable90HIGH’,
‘Variable90LOW’,
‘Variable90LAST_PRICE’,
‘Variable91OPEN’,
‘Variable91HIGH’,
‘Variable91LOW’,
‘Variable91LAST_PRICE’,
‘Variable92OPEN’,
‘Variable92HIGH’,
‘Variable92LOW’,
‘Variable92LAST_PRICE’,
‘Variable93OPEN’,
‘Variable93HIGH’,
‘Variable93LOW’,
‘Variable93LAST_PRICE’,
‘Variable94OPEN’,
‘Variable94HIGH’,
‘Variable94LOW’,
‘Variable94LAST_PRICE’,
‘Variable95OPEN’,
‘Variable95HIGH’,
‘Variable95LOW’,
‘Variable95LAST_PRICE’,
‘Variable97OPEN’,
‘Variable97HIGH’,
‘Variable97LOW’,
‘Variable97LAST_PRICE’,
‘Variable98OPEN’,
‘Variable98HIGH’,
‘Variable98LOW’,
‘Variable98LAST_PRICE’,
‘Variable99OPEN’,
‘Variable99HIGH’,
‘Variable99LOW’,
‘Variable99LAST_PRICE’,
‘Variable100OPEN’,
‘Variable100HIGH’,
‘Variable100LOW’,
‘Variable100LAST_PRICE’,
‘Variable101OPEN’,
‘Variable101HIGH’,
‘Variable101LOW’,
‘Variable101LAST_PRICE’,
‘Variable102OPEN’,
‘Variable102HIGH’,
‘Variable102LOW’,
‘Variable102LAST_PRICE’,
‘Variable103OPEN’,
‘Variable103HIGH’,
‘Variable103LOW’,
‘Variable103LAST_PRICE’,
‘Variable105OPEN’,
‘Variable105HIGH’,
‘Variable105LOW’,
‘Variable105LAST_PRICE’,
‘Variable107OPEN’,
‘Variable107HIGH’,
‘Variable107LOW’,
‘Variable107LAST_PRICE’,
‘Variable108OPEN’,
‘Variable108HIGH’,
‘Variable108LOW’,
‘Variable108LAST_PRICE’,
‘Variable109OPEN’,
‘Variable109HIGH’,
‘Variable109LOW’,
‘Variable109LAST_PRICE’,
‘Variable111OPEN’,
‘Variable111HIGH’,
‘Variable111LOW’,
‘Variable111LAST_PRICE’,
‘Variable112OPEN’,
‘Variable112HIGH’,
‘Variable112LOW’,
‘Variable112LAST_PRICE’,
‘Variable113OPEN’,
‘Variable113HIGH’,
‘Variable113LOW’,
‘Variable113LAST_PRICE’,
‘Variable114OPEN’,
‘Variable114HIGH’,
‘Variable114LOW’,
‘Variable114LAST_PRICE’,
‘Variable115OPEN’,
‘Variable115HIGH’,
‘Variable115LOW’,
‘Variable115LAST_PRICE’,
‘Variable116OPEN’,
‘Variable116HIGH’,
‘Variable116LOW’,
‘Variable116LAST_PRICE’,
‘Variable117OPEN’,
‘Variable117HIGH’,
‘Variable117LOW’,
‘Variable117LAST_PRICE’,
‘Variable120OPEN’,
‘Variable120HIGH’,
‘Variable120LOW’,
‘Variable120LAST_PRICE’,
‘Variable121OPEN’,
‘Variable121HIGH’,
‘Variable121LOW’,
‘Variable121LAST_PRICE’,
‘Variable123OPEN’,
‘Variable123HIGH’,
‘Variable123LOW’,
‘Variable123LAST_PRICE’,
‘Variable124OPEN’,
‘Variable124HIGH’,
‘Variable124LOW’,
‘Variable124LAST_PRICE’,
‘Variable125OPEN’,
‘Variable125HIGH’,
‘Variable125LOW’,
‘Variable125LAST_PRICE’,
‘Variable126OPEN’,
‘Variable126HIGH’,
‘Variable126LOW’,
‘Variable126LAST_PRICE’,
‘Variable127OPEN’,
‘Variable127HIGH’,
‘Variable127LOW’,
‘Variable127LAST_PRICE’,
‘Variable129OPEN’,
‘Variable129HIGH’,
‘Variable129LOW’,
‘Variable129LAST_PRICE’,
‘Variable130OPEN’,
‘Variable130HIGH’,
‘Variable130LOW’,
‘Variable130LAST_PRICE’,
‘Variable133OPEN’,
‘Variable133HIGH’,
‘Variable133LOW’,
‘Variable133LAST_PRICE’,
‘Variable136OPEN’,
‘Variable136HIGH’,
‘Variable136LOW’,
‘Variable136LAST_PRICE’,
‘Variable137OPEN’,
‘Variable137HIGH’,
‘Variable137LOW’,
‘Variable137LAST_PRICE’,
‘Variable139OPEN’,
‘Variable139HIGH’,
‘Variable139LOW’,
‘Variable139LAST_PRICE’,
‘Variable141OPEN’,
‘Variable141HIGH’,
‘Variable141LOW’,
‘Variable141LAST_PRICE’]
In [6]:
# Feature matrices restricted to the columns selected in `cols`; the label is
# the training file's 'TargetVariable' column.
trainX = train[cols]
trainY = train['TargetVariable']
testX = test[cols]
In [7]:
trainY
Out[7]:
0 1
1 1
2 1
3 1
4 0
5 1
6 0
7 0
8 0
9 0
10 0
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
20 1
21 1
22 1
23 1
24 1
25 1
26 1
27 0
28 1
29 1
..
5892 1
5893 1
5894 1
5895 1
5896 1
5897 1
5898 1
5899 1
5900 1
5901 1
5902 1
5903 1
5904 0
5905 0
5906 0
5907 0
5908 0
5909 0
5910 0
5911 0
5912 1
5913 1
5914 1
5915 1
5916 1
5917 1
5918 1
5919 1
5920 1
5921 1
Name: TargetVariable, dtype: int64
In [8]:
# Seeded generator; it is handed to the SVM when the classifiers are built.
random_state = np.random.RandomState(0)

# Standardise the features. The scaler is fitted on the training features only
# and that same fitted transform is then applied to the test features.
# The original row index and column labels are carried over so the scaled
# frames stay traceable to `cols` instead of being renumbered 0..n-1.
zscoreScaler = preprocessing.StandardScaler()
normTrainX = pd.DataFrame(zscoreScaler.fit_transform(trainX),
                          index=trainX.index, columns=trainX.columns)
normTestX = pd.DataFrame(zscoreScaler.transform(testX),
                         index=testX.index, columns=testX.columns)
In [9]:
def cross_validation_AUC(trainX, trainY, numFold, classifier, name, random_state=None):
    """Plot per-fold and mean ROC curves for `classifier` under shuffled K-fold CV.

    For every fold the classifier is refit on the training part and scored on
    the held-out part with `predict_proba`; column 1 of that output is used as
    the score passed to `roc_curve`. Each fold's ROC is drawn, interpolated
    onto a common 100-point false-positive-rate grid and averaged; the mean
    curve has its end points pinned to (0, 0) and (1, 1) before its AUC is
    computed.

    Parameters
    ----------
    trainX : object exposing `.values` (e.g. pandas.DataFrame)
        Feature matrix; rows are selected positionally from `.values`.
    trainY : object exposing `.shape` and `.values` (e.g. pandas.Series)
        Labels, row-aligned with `trainX`.
    numFold : int
        Number of folds handed to `KFold`.
    classifier : estimator with `fit` and `predict_proba`
        Refit in place on every fold, so it is left fitted on the last fold.
    name : str
        Used in the plot title and as the prefix of the saved PNG.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to `KFold` to make the shuffled split reproducible. The
        default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    None
        The figure is saved to the working directory and shown.
    """
    plt.figure()
    kfold = KFold(trainY.shape[0], n_folds=numFold, shuffle=True,
                  random_state=random_state)

    false_positive_rate_mean = np.linspace(0, 1, 100)
    true_positive_rate_mean = np.zeros_like(false_positive_rate_mean)

    for i, (train_part, validate_part) in enumerate(kfold):
        classifier.fit(trainX.values[train_part], trainY.values[train_part])
        predictProbability = classifier.predict_proba(trainX.values[validate_part])
        false_positive_rate, true_positive_rate, thresholds = roc_curve(
            trainY.values[validate_part], predictProbability[:, 1])

        # np.interp is the function the deprecated `scipy.interp` alias pointed to.
        true_positive_rate_mean += np.interp(false_positive_rate_mean,
                                             false_positive_rate,
                                             true_positive_rate)
        true_positive_rate_mean[0] = 0.0

        area_under_curve = auc(false_positive_rate, true_positive_rate)
        plt.plot(false_positive_rate, true_positive_rate, lw=1,
                 label='Fold %d ROC (AUC = %0.3f)' % (i, area_under_curve))

    # Chance-level diagonal, drawn once as a visual reference.
    plt.plot([0, 1], [0, 1], '--', color=(0.3, 0.3, 0.3))

    true_positive_rate_mean /= len(kfold)
    true_positive_rate_mean[-1] = 1.0
    mean_auc = auc(false_positive_rate_mean, true_positive_rate_mean)
    plt.plot(false_positive_rate_mean, true_positive_rate_mean, 'k--',
             label='Mean ROC (AUC = %0.3f)' % mean_auc, lw=2)

    # Small margin so curves hugging the axes remain visible.
    extra = 0.03
    plt.xlim([-extra, 1 + extra])
    plt.ylim([-extra, 1 + extra])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic for ' + name)
    plt.legend(loc="lower right")

    # str(datetime.now()) contains ':' and a space, which are not valid in file
    # names on every platform; use a compact, sortable stamp instead.
    stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')
    plt.savefig(name + '_' + stamp + ".png")
    plt.show()
In [16]:
def trainAndTest(trainX, trainY, testX, classfier, name, timestamps=None):
    """Fit `classfier` on the training data, score `testX`, and write the results to CSV.

    Two files are written to the working directory, both prefixed with `name`
    and suffixed with the current date-time:
    ``<name>_prob_<stamp>.csv`` holds the column-1 output of `predict_proba`,
    ``<name>_pred_<stamp>.csv`` holds the output of `predict`.

    Parameters
    ----------
    trainX, trainY
        Training features and labels, passed straight to `classfier.fit`.
    testX
        Features to score.
    classfier : estimator with `fit`, `predict_proba` and `predict`
        Fitted in place.
    name : str
        Prefix of the two output files.
    timestamps : sequence or pandas.Series, optional
        One timestamp per row of `testX`. Defaults to the notebook-level
        ``test['Timestamp']``, which is what this function always used before.

    Returns
    -------
    (outProb, outPred) : tuple of pandas.DataFrame
        Timestamp/Score and Timestamp/Prediction frames.
    """
    if timestamps is None:
        # Backward-compatible default: rely on the notebook-level `test` frame.
        timestamps = test['Timestamp']

    classfier.fit(trainX, trainY)
    testYProb = classfier.predict_proba(testX)
    testY = classfier.predict(testX)

    outProb = pd.DataFrame({"Timestamp": timestamps, "Score": testYProb[:, 1]})
    outPred = pd.DataFrame({"Timestamp": timestamps, "Prediction": testY})

    # str(datetime.now()) contains ':' and a space, which are not valid in file
    # names on every platform; use one compact stamp shared by both files.
    stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')

    # Rows are paired positionally via .values, so the output does not depend
    # on the frames having a 0..n-1 index. `with` closes the file even if a
    # write fails.
    # NOTE(review): '%f' keeps six decimals, so scores below 5e-7 are written
    # as 0.000000 -- confirm that precision is acceptable for the consumer.
    with open(name + '_prob_' + stamp + '.csv', 'w') as f:
        f.write('Timestamp,Score\n')
        for ts, score in zip(outProb['Timestamp'].values, outProb['Score'].values):
            f.write('%f,%f\n' % (ts, score))

    # NOTE(review): the header says 'Score' although the column holds the
    # predicted class; left unchanged in case the consumer expects it.
    with open(name + '_pred_' + stamp + '.csv', 'w') as f:
        f.write('Timestamp,Score\n')
        for ts, label in zip(outPred['Timestamp'].values, outPred['Prediction'].values):
            f.write('%f,%d\n' % (ts, label))

    return outProb, outPred
In [11]:
# Candidate models; `names` is index-aligned with `classifiers`.
# Both models use class_weight='balanced'. The SVM is built with
# probability=True so that predict_proba is available, and it draws from the
# notebook-level `random_state`.
classifiers = [
    linear_model.LogisticRegression(class_weight='balanced'),
    svm.SVC(kernel='linear', probability=True,
            random_state=random_state, class_weight='balanced'),
]
names = ['Logistic Regression', 'SVM']
In [12]:
# 3-fold cross-validated ROC curves for the logistic regression model.
# NOTE(review): this call uses the unscaled trainX, whereas the SVM call uses
# normTrainX -- confirm the difference is deliberate.
cross_validation_AUC(trainX, trainY, 3, classifiers[0], names[0])
In [13]:
# 3-fold cross-validated ROC curves for the linear SVM on the standardised features.
cross_validation_AUC(normTrainX, trainY, 3, classifiers[1], names[1])
In [19]:
# Fit logistic regression on all training rows and score the test set; the call
# also writes the probability and prediction CSV files to the working directory.
outProb, outPred = trainAndTest(trainX, trainY, testX, classifiers[0], names[0])
In [20]:
outProb
Out[20]:
Score Timestamp
0 4.991359e-01 40289.659722
1 4.772883e-01 40289.663194
2 5.293711e-01 40289.666667
3 7.458748e-01 40290.395833
4 3.642648e-03 40290.399306
5 8.740669e-03 40290.402778
6 1.482215e-02 40290.406250
7 3.336558e-02 40290.409722
8 1.248093e-02 40290.413194
9 1.267349e-02 40290.416667
10 1.384272e-02 40290.420139
11 1.641626e-02 40290.423611
12 9.363629e-03 40290.427083
13 5.944474e-03 40290.430556
14 6.969733e-03 40290.434028
15 5.694134e-03 40290.437500
16 5.157876e-03 40290.440972
17 3.443934e-03 40290.444444
18 3.706470e-03 40290.447917
19 1.341812e-02 40290.451389
20 2.778245e-02 40290.454861
21 7.050519e-02 40290.458333
22 1.235218e-01 40290.461806
23 1.963404e-01 40290.465278
24 7.292517e-02 40290.468750
25 2.080013e-01 40290.472222
26 2.782143e-01 40290.475694
27 2.498681e-01 40290.479167
28 2.612405e-01 40290.482639
29 2.586164e-01 40290.486111
… … …
2509 1.230855e-09 40336.593750
2510 8.402939e-10 40336.597222
2511 6.962772e-10 40336.600694
2512 7.592883e-10 40336.604167
2513 5.664518e-10 40336.607639
2514 8.199473e-10 40336.611111
2515 1.173050e-09 40336.614583
2516 9.951754e-10 40336.618056
2517 1.593444e-09 40336.621528
2518 1.219115e-09 40336.625000
2519 8.164988e-10 40336.628472
2520 4.373459e-10 40336.631944
2521 4.732771e-10 40336.635417
2522 6.155041e-10 40336.638889
2523 2.345777e-10 40336.642361
2524 5.677176e-10 40336.645833
2525 4.727566e-10 40336.649306
2526 4.432033e-10 40336.652778
2527 3.487585e-10 40336.656250
2528 4.717321e-10 40336.659722
2529 5.292507e-10 40336.663194
2530 5.234629e-10 40336.666667
2531 5.180721e-09 40337.395833
2532 4.110906e-09 40337.399306
2533 1.292165e-09 40337.402778
2534 1.596810e-09 40337.406250
2535 1.935299e-09 40337.409722
2536 5.773091e-10 40337.413194
2537 6.269783e-10 40337.416667
2538 7.307164e-10 40337.420139
2539 rows × 2 columns
In [21]:
outPred
Out[21]:
Prediction Timestamp
0 0 40289.659722
1 0 40289.663194
2 1 40289.666667
3 1 40290.395833
4 0 40290.399306
5 0 40290.402778
6 0 40290.406250
7 0 40290.409722
8 0 40290.413194
9 0 40290.416667
10 0 40290.420139
11 0 40290.423611
12 0 40290.427083
13 0 40290.430556
14 0 40290.434028
15 0 40290.437500
16 0 40290.440972
17 0 40290.444444
18 0 40290.447917
19 0 40290.451389
20 0 40290.454861
21 0 40290.458333
22 0 40290.461806
23 0 40290.465278
24 0 40290.468750
25 0 40290.472222
26 0 40290.475694
27 0 40290.479167
28 0 40290.482639
29 0 40290.486111
… … …
2509 0 40336.593750
2510 0 40336.597222
2511 0 40336.600694
2512 0 40336.604167
2513 0 40336.607639
2514 0 40336.611111
2515 0 40336.614583
2516 0 40336.618056
2517 0 40336.621528
2518 0 40336.625000
2519 0 40336.628472
2520 0 40336.631944
2521 0 40336.635417
2522 0 40336.638889
2523 0 40336.642361
2524 0 40336.645833
2525 0 40336.649306
2526 0 40336.652778
2527 0 40336.656250
2528 0 40336.659722
2529 0 40336.663194
2530 0 40336.666667
2531 0 40337.395833
2532 0 40337.399306
2533 0 40337.402778
2534 0 40337.406250
2535 0 40337.409722
2536 0 40337.413194
2537 0 40337.416667
2538 0 40337.420139
2539 rows × 2 columns
In [22]:
# Fit the linear SVM on the standardised training features and score the
# standardised test features; CSV files are written as a side effect.
# NOTE: this rebinds outProb/outPred, replacing the logistic-regression results.
outProb, outPred = trainAndTest(normTrainX, trainY, normTestX, classifiers[1], names[1])
In [23]:
outProb
Out[23]:
Score Timestamp
0 7.238655e-01 40289.659722
1 7.094830e-01 40289.663194
2 7.580122e-01 40289.666667
3 7.607310e-01 40290.395833
4 4.493909e-02 40290.399306
5 4.237883e-02 40290.402778
6 6.845894e-02 40290.406250
7 1.101531e-01 40290.409722
8 6.601131e-02 40290.413194
9 6.241476e-02 40290.416667
10 5.310275e-02 40290.420139
11 5.538706e-02 40290.423611
12 4.278023e-02 40290.427083
13 3.662490e-02 40290.430556
14 3.976104e-02 40290.434028
15 3.350457e-02 40290.437500
16 3.347852e-02 40290.440972
17 2.289975e-02 40290.444444
18 2.521271e-02 40290.447917
19 6.885694e-02 40290.451389
20 1.309110e-01 40290.454861
21 2.187853e-01 40290.458333
22 3.377530e-01 40290.461806
23 4.839792e-01 40290.465278
24 3.322275e-01 40290.468750
25 5.093680e-01 40290.472222
26 5.918260e-01 40290.475694
27 5.840974e-01 40290.479167
28 5.858289e-01 40290.482639
29 5.715075e-01 40290.486111
… … …
2509 1.000000e-07 40336.593750
2510 1.000000e-07 40336.597222
2511 1.000000e-07 40336.600694
2512 1.000000e-07 40336.604167
2513 1.000000e-07 40336.607639
2514 1.000000e-07 40336.611111
2515 1.000000e-07 40336.614583
2516 1.000000e-07 40336.618056
2517 1.000000e-07 40336.621528
2518 1.000000e-07 40336.625000
2519 1.000000e-07 40336.628472
2520 1.000000e-07 40336.631944
2521 1.000000e-07 40336.635417
2522 1.000000e-07 40336.638889
2523 1.000000e-07 40336.642361
2524 1.000000e-07 40336.645833
2525 1.000000e-07 40336.649306
2526 1.000000e-07 40336.652778
2527 1.000000e-07 40336.656250
2528 1.000000e-07 40336.659722
2529 1.000000e-07 40336.663194
2530 1.000000e-07 40336.666667
2531 1.000000e-07 40337.395833
2532 1.000000e-07 40337.399306
2533 1.000000e-07 40337.402778
2534 1.000000e-07 40337.406250
2535 1.000000e-07 40337.409722
2536 1.000000e-07 40337.413194
2537 1.000000e-07 40337.416667
2538 1.000000e-07 40337.420139
2539 rows × 2 columns
In [24]:
outPred
Out[24]:
Prediction Timestamp
0 1 40289.659722
1 1 40289.663194
2 1 40289.666667
3 1 40290.395833
4 0 40290.399306
5 0 40290.402778
6 0 40290.406250
7 0 40290.409722
8 0 40290.413194
9 0 40290.416667
10 0 40290.420139
11 0 40290.423611
12 0 40290.427083
13 0 40290.430556
14 0 40290.434028
15 0 40290.437500
16 0 40290.440972
17 0 40290.444444
18 0 40290.447917
19 0 40290.451389
20 0 40290.454861
21 0 40290.458333
22 0 40290.461806
23 0 40290.465278
24 0 40290.468750
25 0 40290.472222
26 0 40290.475694
27 0 40290.479167
28 0 40290.482639
29 0 40290.486111
… … …
2509 0 40336.593750
2510 0 40336.597222
2511 0 40336.600694
2512 0 40336.604167
2513 0 40336.607639
2514 0 40336.611111
2515 0 40336.614583
2516 0 40336.618056
2517 0 40336.621528
2518 0 40336.625000
2519 0 40336.628472
2520 0 40336.631944
2521 0 40336.635417
2522 0 40336.638889
2523 0 40336.642361
2524 0 40336.645833
2525 0 40336.649306
2526 0 40336.652778
2527 0 40336.656250
2528 0 40336.659722
2529 0 40336.663194
2530 0 40336.666667
2531 0 40337.395833
2532 0 40337.399306
2533 0 40337.402778
2534 0 40337.406250
2535 0 40337.409722
2536 0 40337.413194
2537 0 40337.416667
2538 0 40337.420139
2539 rows × 2 columns
In [ ]: