
train_tree

In [1]:

%matplotlib inline
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import graphviz
from sklearn import tree

from sklearn.model_selection import cross_val_score

def trainDecision(data0, max_depth=3):
    # Five-fold cross-validation accuracy on the (Entropy, SATD) features.
    clf = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=0)
    scores = cross_val_score(clf, data0[["Entropy", "SATD"]], data0["SplitFlag"], cv=5)
    print(scores)

    # Refit on the full subset and report the in-sample (training) accuracy.
    clf0 = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=0)
    clf0 = clf0.fit(data0[["Entropy", "SATD"]], data0["SplitFlag"])
    pred = clf0.predict(data0[["Entropy", "SATD"]])
    acc = np.sum(pred == data0["SplitFlag"]) * 1.0 / data0.shape[0]
    print(acc)

    return clf0

def renderTree(clf0, name):
    # Export the fitted tree to Graphviz DOT, render it to a file, and return the graph.
    dot_data = tree.export_graphviz(clf0, out_file=None,
                                    filled=True, rounded=True,
                                    feature_names=["Entropy", "SATD"],
                                    special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.render(name)

    return graph
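
trainDecision prints the five fold accuracies from cross_val_score and then the in-sample accuracy of a tree refit on the full subset, which is optimistic. A minimal sketch of how the folds could be summarised instead (summarise_cv is a hypothetical helper, not part of the original notebook):

def summarise_cv(data0, max_depth=3):
    clf = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=0)
    scores = cross_val_score(clf, data0[["Entropy", "SATD"]], data0["SplitFlag"], cv=5)
    # Report mean and spread of the five fold accuracies.
    print("CV accuracy: %.4f +/- %.4f" % (scores.mean(), scores.std()))
    return scores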

In [3]:

data = pd.read_csv('tongji.txt', sep=" ")

//anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  if __name__ == '__main__':
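
As the warning says, it can be silenced by naming the parser engine explicitly; a minimal sketch, assuming the same file and separator:

# Same load, but with the Python engine, which supports regex separators.
data = pd.read_csv('tongji.txt', sep=" ", engine='python')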

In [4]:

data[data["Depth"] == 0][["SATD", "SplitFlag"]].groupby("SplitFlag").boxplot(column="SATD")

Out[4]:

0 Axes(0.1,0.15;0.363636x0.75)
1 Axes(0.536364,0.15;0.363636x0.75)
dtype: object

In [5]:

data[data["Depth"] == 0][["Entropy", "SplitFlag"]].groupby("SplitFlag").boxplot(column="Entropy")

Out[5]:

0 Axes(0.1,0.15;0.363636x0.75)
1 Axes(0.536364,0.15;0.363636x0.75)
dtype: object

In [6]:

data[data["Depth"] == 1][["SATD", "SplitFlag"]].groupby("SplitFlag").boxplot(column="SATD")

Out[6]:

0 Axes(0.1,0.15;0.363636x0.75)
1 Axes(0.536364,0.15;0.363636x0.75)
dtype: object

In [7]:

data[data["Depth"] == 1][["Entropy", "SplitFlag"]].groupby("SplitFlag").boxplot(column="Entropy")

Out[7]:

0 Axes(0.1,0.15;0.363636x0.75)
1 Axes(0.536364,0.15;0.363636x0.75)
dtype: object

In [8]:

data[data["Depth"] == 2][["SATD", "SplitFlag"]].groupby("SplitFlag").boxplot(column="SATD")

Out[8]:

0 Axes(0.1,0.15;0.363636x0.75)
1 Axes(0.536364,0.15;0.363636x0.75)
dtype: object

In [9]:

data[data["Depth"] == 2][["Entropy", "SplitFlag"]].groupby("SplitFlag").boxplot(column="Entropy")

Out[9]:

0 Axes(0.1,0.15;0.363636x0.75)
1 Axes(0.536364,0.15;0.363636x0.75)
dtype: object
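
The six boxplot cells above differ only in the Depth value and the feature plotted, so the same figures could also be produced in one loop; a minimal sketch, assuming the data frame loaded above:

for depth in (0, 1, 2):
    for col in ("SATD", "Entropy"):
        # Boxplot of the feature grouped by SplitFlag for this Depth subset.
        data[data["Depth"] == depth][[col, "SplitFlag"]].groupby("SplitFlag").boxplot(column=col)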

In [11]:

clf0 = trainDecision(data[data["Depth"] == 0], max_depth=3)
renderTree(clf0, "tree0")

[ 0.94280361 0.95958878 0.96472905 0.94952921 0.94261145]
0.951377786318

Out[11]:

(rendered decision tree "tree0" for the Depth == 0 subset, max depth 3: root split SATD ≤ 16127.5, gini = 0.108, samples = 520400; subsequent splits on Entropy and SATD)
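
Besides the rendered graph, the weight the fitted tree assigns to each input feature can be read off numerically; a minimal sketch, assuming clf0 from the cell above:

# Gini-based importance of each feature, in the order they were passed to fit().
for name, importance in zip(["Entropy", "SATD"], clf0.feature_importances_):
    print(name, importance)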


In [12]:

clf1 = trainDecision(data[data["Depth"] == 1], max_depth=3)
renderTree(clf1, "tree1")

[ 0.78521169 0.78829546 0.83894467 0.81328464 0.78288734]
0.816358042922

Out[12]:

(rendered decision tree "tree1" for the Depth == 1 subset, max depth 3: root split SATD ≤ 4894.5, gini = 0.42, samples = 2031600; subsequent splits on SATD and Entropy)

In [13]:

clf2 = trainDecision(data[data["Depth"] == 2], max_depth=4)
renderTree(clf2, "tree2")

[ 0.70355021 0.78771725 0.79588223 0.77850255 0.71624517]
0.77042409388

Out[13]:

(rendered decision tree "tree2" for the Depth == 2 subset, max depth 4: root split SATD ≤ 1965.5, gini = 0.476, samples = 8078400; subsequent splits on SATD and Entropy)
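
The three training cells follow one pattern (filter by Depth, train, render), so they could also be driven by a loop; a minimal sketch using the max_depth values chosen above:

for depth, md in [(0, 3), (1, 3), (2, 4)]:
    clf = trainDecision(data[data["Depth"] == depth], max_depth=md)
    renderTree(clf, "tree%d" % depth)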