Decision Trees and Gradient Boosting In [ ]:

In [ ]:
import matplotlib.pyplot as plt
from itertools import product
import numpy as np
from collections import Counter
from sklearn.base import BaseEstimator, RegressorMixin, ClassifierMixin
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
import graphviz

from IPython.display import Image

%matplotlib inline

Load Data¶
In [ ]:
# Load the 2-D SVM train/test sets. (Straight quotes: the scraped version
# used typographic quotes, which are a SyntaxError in Python.)
data_train = np.loadtxt('svm-train.txt')
data_test = np.loadtxt('svm-test.txt')
# Columns 0-1 are the features; column 2 is the +/-1 target,
# reshaped to an (n, 1) column vector as the tree code expects.
x_train, y_train = data_train[:, 0: 2], data_train[:, 2].reshape(-1, 1)
x_test, y_test = data_test[:, 0: 2], data_test[:, 2].reshape(-1, 1)
In [ ]:
# Change target to 0-1 label.
# Vectorised comparison replaces the per-element map/lambda: y > 0 -> 1,
# y <= 0 -> 0, keeping the (n, 1) column shape the tree code expects.
y_train_label = (y_train > 0).astype(int).reshape(-1, 1)

Decision Tree Class¶
In [ ]:
class Decision_Tree(BaseEstimator):

def __init__(self, split_loss_function, leaf_value_estimator,
depth=0, min_sample=5, max_depth=10):
”’
Initialize the decision tree classifier

:param split_loss_function: method for splitting node
:param leaf_value_estimator: method for estimating leaf value
:param depth: depth indicator, default value is 0, representing root node
:param min_sample: an internal node can be splitted only if it contains points more than min_smaple
:param max_depth: restriction of tree depth.
”’
self.split_loss_function = split_loss_function
self.leaf_value_estimator = leaf_value_estimator
self.depth = depth
self.min_sample = min_sample
self.max_depth = max_depth

def fit(self, X, y=None):
”’
This should fit the tree classifier by setting the values self.is_leaf,
self.split_id (the index of the feature we want ot split on, if we’re splitting),
self.split_value (the corresponding value of that feature where the split is),
and self.value, which is the prediction value if the tree is a leaf node. If we are
splitting the node, we should also init self.left and self.right to be Decision_Tree
objects corresponding to the left and right subtrees. These subtrees should be fit on
the data that fall to the left and right,respectively, of self.split_value.
This is a recurisive tree building procedure.

:param X: a numpy array of training data, shape = (n, m)
:param y: a numpy array of labels, shape = (n, 1)

:return self
”’

# Your code goes here

return self

def predict_instance(self, instance):
”’
Predict label by decision tree

:param instance: a numpy array with new data, shape (1, m)

:return whatever is returned by leaf_value_estimator for leaf containing instance
”’
if self.is_leaf:
return self.value
if instance[self.split_id] <= self.split_value: return self.left.predict_instance(instance) else: return self.right.predict_instance(instance) Decision Tree Classifier¶ In [ ]: def compute_entropy(label_array): ''' Calulate the entropy of given label list :param label_array: a numpy array of labels shape = (n, 1) :return entropy: entropy value ''' # Your code goes here return entropy def compute_gini(label_array): ''' Calulate the gini index of label list :param label_array: a numpy array of labels shape = (n, 1) :return gini: gini index value ''' # Your code goes here return gini In [ ]: def most_common_label(y): ''' Find most common label ''' label_cnt = Counter(y.reshape(len(y))) label = label_cnt.most_common(1)[0][0] return label In [ ]: class Classification_Tree(BaseEstimator, ClassifierMixin): loss_function_dict = { 'entropy': compute_entropy, 'gini': compute_gini } def __init__(self, loss_function='entropy', min_sample=5, max_depth=10): ''' :param loss_function(str): loss function for splitting internal node ''' self.tree = Decision_Tree(self.loss_function_dict[loss_function], most_common_label, 0, min_sample, max_depth) def fit(self, X, y=None): self.tree.fit(X,y) return self def predict_instance(self, instance): value = self.tree.predict_instance(instance) return value Decision Tree Boundary¶ In [ ]: # Training classifiers with different depth clf1 = Classification_Tree(max_depth=1) clf1.fit(x_train, y_train_label) clf2 = Classification_Tree(max_depth=2) clf2.fit(x_train, y_train_label) clf3 = Classification_Tree(max_depth=3) clf3.fit(x_train, y_train_label) clf4 = Classification_Tree(max_depth=4) clf4.fit(x_train, y_train_label) clf5 = Classification_Tree(max_depth=5) clf5.fit(x_train, y_train_label) clf6 = Classification_Tree(max_depth=6) clf6.fit(x_train, y_train_label) # Plotting decision regions x_min, x_max = x_train[:, 0].min() - 1, x_train[:, 0].max() + 1 y_min, y_max = x_train[:, 1].min() - 1, x_train[:, 1].max() + 1 xx, yy = 
np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) f, axarr = plt.subplots(2, 3, sharex='col', sharey='row', figsize=(10, 8)) for idx, clf, tt in zip(product([0, 1], [0, 1, 2]), [clf1, clf2, clf3, clf4, clf5, clf6], ['Depth = {}'.format(n) for n in range(1, 7)]): Z = np.array([clf.predict_instance(x) for x in np.c_[xx.ravel(), yy.ravel()]]) Z = Z.reshape(xx.shape) axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4) axarr[idx[0], idx[1]].scatter(x_train[:, 0], x_train[:, 1], c=y_train_label, alpha=0.8) axarr[idx[0], idx[1]].set_title(tt) plt.show() Compare decision tree with tree model in sklearn¶ In [ ]: clf = DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_split=5) clf.fit(x_train, y_train_label) export_graphviz(clf, out_file='tree_classifier.dot') In [ ]: # Visualize decision tree !dot -Tpng tree_classifier.dot -o tree_classifier.png In [ ]: Image(filename='tree_classifier.png') Decision Tree Regressor¶ In [ ]: # Regression Tree Specific Code def mean_absolute_deviation_around_median(y): ''' Calulate the mean absolute deviation around the median of a given target list :param y: a numpy array of targets shape = (n, 1) :return mae ''' # Your code goes here return mae In [ ]: class Regression_Tree(): ''' :attribute loss_function_dict: dictionary containing the loss functions used for splitting :attribute estimator_dict: dictionary containing the estimation functions used in leaf nodes ''' loss_function_dict = { 'mse': np.var, 'mae': mean_absolute_deviation_around_median } estimator_dict = { 'mean': np.mean, 'median': np.median } def __init__(self, loss_function='mse', estimator='mean', min_sample=5, max_depth=10): ''' Initialize Regression_Tree :param loss_function(str): loss function used for splitting internal nodes :param estimator(str): value estimator of internal node ''' self.tree = Decision_Tree(self.loss_function_dict[loss_function], self.estimator_dict[estimator], 0, min_sample, max_depth) def fit(self, X, 
y=None): self.tree.fit(X,y) return self def predict_instance(self, instance): value = self.tree.predict_instance(instance) return value Fit regression tree to one-dimensional regression data¶ In [ ]: data_krr_train = np.loadtxt('krr-train.txt') data_krr_test = np.loadtxt('krr-test.txt') x_krr_train, y_krr_train = data_krr_train[:,0].reshape(-1,1),data_krr_train[:,1].reshape(-1,1) x_krr_test, y_krr_test = data_krr_test[:,0].reshape(-1,1),data_krr_test[:,1].reshape(-1,1) # Training regression trees with different depth clf1 = Regression_Tree(max_depth=1, min_sample=1, loss_function='mae', estimator='median') clf1.fit(x_krr_train, y_krr_train) clf2 = Regression_Tree(max_depth=2, min_sample=1, loss_function='mae', estimator='median') clf2.fit(x_krr_train, y_krr_train) clf3 = Regression_Tree(max_depth=3, min_sample=1, loss_function='mae', estimator='median') clf3.fit(x_krr_train, y_krr_train) clf4 = Regression_Tree(max_depth=4, min_sample=1, loss_function='mae', estimator='median') clf4.fit(x_krr_train, y_krr_train) clf5 = Regression_Tree(max_depth=5, min_sample=1, loss_function='mae', estimator='median') clf5.fit(x_krr_train, y_krr_train) clf6 = Regression_Tree(max_depth=6, min_sample=1, loss_function='mae', estimator='median') clf6.fit(x_krr_train, y_krr_train) plot_size = 0.001 x_range = np.arange(0., 1., plot_size).reshape(-1, 1) f2, axarr2 = plt.subplots(2, 3, sharex='col', sharey='row', figsize=(15, 10)) for idx, clf, tt in zip(product([0, 1], [0, 1, 2]), [clf1, clf2, clf3, clf4, clf5, clf6], ['Depth = {}'.format(n) for n in range(1, 7)]): y_range_predict = np.array([clf.predict_instance(x) for x in x_range]).reshape(-1, 1) axarr2[idx[0], idx[1]].plot(x_range, y_range_predict, color='r') axarr2[idx[0], idx[1]].scatter(x_krr_train, y_krr_train, alpha=0.8) axarr2[idx[0], idx[1]].set_title(tt) axarr2[idx[0], idx[1]].set_xlim(0, 1) plt.show() Gradient Boosting Method¶ In [ ]: #Pseudo-residual function. 
#Here you can assume that we are using L2 loss def pseudo_residual_L2(train_target, train_predict): ''' Compute the pseudo-residual based on current predicted value. ''' return train_target - train_predict In [ ]: class gradient_boosting(): ''' Gradient Boosting regressor class :method fit: fitting model ''' def __init__(self, n_estimator, pseudo_residual_func, learning_rate=0.1, min_sample=5, max_depth=3): ''' Initialize gradient boosting class :param n_estimator: number of estimators (i.e. number of rounds of gradient boosting) :pseudo_residual_func: function used for computing pseudo-residual :param learning_rate: step size of gradient descent ''' self.n_estimator = n_estimator self.pseudo_residual_func = pseudo_residual_func self.learning_rate = learning_rate self.min_sample = min_sample self.max_depth = max_depth def fit(self, train_data, train_target): ''' Fit gradient boosting model ''' # Your code goes here def predict(self, test_data): ''' Predict value ''' # Your code goes here 2-D GBM visualization - SVM data¶ In [ ]: # Plotting decision regions x_min, x_max = x_train[:, 0].min() - 1, x_train[:, 0].max() + 1 y_min, y_max = x_train[:, 1].min() - 1, x_train[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) f, axarr = plt.subplots(2, 3, sharex='col', sharey='row', figsize=(10, 8)) for idx, i, tt in zip(product([0, 1], [0, 1, 2]), [1, 5, 10, 20, 50, 100], ['n_estimator = {}'.format(n) for n in [1, 5, 10, 20, 50, 100]]): gbt = gradient_boosting(n_estimator=i, pseudo_residual_func=pseudo_residual_L2, max_depth=2) gbt.fit(x_train, y_train) Z = np.sign(gbt.predict(np.c_[xx.ravel(), yy.ravel()])) Z = Z.reshape(xx.shape) axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4) axarr[idx[0], idx[1]].scatter(x_train[:, 0], x_train[:, 1], c=y_train_label, alpha=0.8) axarr[idx[0], idx[1]].set_title(tt) 1-D GBM visualization - KRR data¶ In [ ]: plot_size = 0.001 x_range = np.arange(0., 1., plot_size).reshape(-1, 1) f2, axarr2 
= plt.subplots(2, 3, sharex='col', sharey='row', figsize=(15, 10)) for idx, i, tt in zip(product([0, 1], [0, 1, 2]), [1, 5, 10, 20, 50, 100], ['n_estimator = {}'.format(n) for n in [1, 5, 10, 20, 50, 100]]): gbm_1d = gradient_boosting(n_estimator=i, pseudo_residual_func=pseudo_residual_L2, max_depth=2) gbm_1d.fit(x_krr_train, y_krr_train) y_range_predict = gbm_1d.predict(x_range) axarr2[idx[0], idx[1]].plot(x_range, y_range_predict, color='r') axarr2[idx[0], idx[1]].scatter(x_krr_train, y_krr_train, alpha=0.8) axarr2[idx[0], idx[1]].set_title(tt) axarr2[idx[0], idx[1]].set_xlim(0, 1)