# Python functions for Statistical Learning
# , The University of School
# August 2017
# Copyright By PowCoder (watermark inserted by a code-hosting/tutoring site; not part of the module)
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
def rmse_jack(response, predicted):
    """Return the root mean squared error and its jackknife standard error.

    Parameters
    ----------
    response : array-like
        Observed values. Flattened with ``np.ravel`` before use.
    predicted : array-like
        Predicted values. Flattened with ``np.ravel`` before use.

    Returns
    -------
    rmse : float
        ``sqrt(mean((response - predicted) ** 2))``; NaN for empty input.
    se : float
        Jackknife standard error of the RMSE; NaN when fewer than two
        observations are supplied, because no leave-one-out sample exists.
    """
    sq_err = (np.ravel(response) - np.ravel(predicted)) ** 2
    n = len(sq_err)
    if n == 0:
        return float("nan"), float("nan")

    total = np.sum(sq_err)
    rmse = np.sqrt(total / n)
    if n < 2:
        # Avoid the 0/0 division a single observation would trigger below.
        return rmse, float("nan")

    # RMSE recomputed with each observation left out in turn.
    leave_one_out = np.sqrt((total - sq_err) / (n - 1))
    # np.var uses ddof=0, so (n-1)*var equals the jackknife variance
    # (n-1)/n * sum((theta_i - theta_bar)**2).
    se = np.sqrt((n - 1) * np.var(leave_one_out))
    return rmse, se
def r2_jack(response, predicted):
    """Return the R-squared and a jackknife estimate of its standard error.

    Parameters
    ----------
    response : array-like
        Observed values. Flattened with ``np.ravel`` before use.
    predicted : array-like
        Predicted values. Flattened with ``np.ravel`` before use.

    Returns
    -------
    r2 : float
        ``1 - RSS / TSS`` computed on the full sample.
    se : float
        Jackknife standard error of ``r2``.

    Notes
    -----
    The leave-one-out replicates drop one squared residual and one squared
    deviation at a time, but keep the full-sample mean of ``response``; the
    mean is not recomputed for each replicate.
    """
    actual = np.ravel(response)
    sq_resid = (actual - np.ravel(predicted)) ** 2
    sq_dev = (actual - np.mean(actual)) ** 2

    rss = np.sum(sq_resid)
    tss = np.sum(sq_dev)
    n = len(sq_resid)

    # R-squared with each observation's contribution removed in turn.
    leave_one_out = 1 - (rss - sq_resid) / (tss - sq_dev)
    r2 = 1 - rss / tss
    # np.var uses ddof=0, so (n-1)*var is the jackknife variance.
    se = np.sqrt((n - 1) * np.var(leave_one_out))
    return r2, se
def forwardselection(X, y):
    """Forward variable selection based on the Scikit learn API.

    Starting from an empty model, each step adds the remaining column that
    gives the highest in-sample R-squared for an OLS fit. After each step the
    chosen model is scored by cross-validated negative mean squared error
    (scikit-learn's default CV splitting), and the step with the best
    cross-validation score is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors; columns are addressed by position through ``X.iloc``.
    y : array-like
        Response values.

    Returns
    -------
    bestcv : sklearn.linear_model.LinearRegression
        Fitted OLS regression object for the best model.
    subset : list of int
        Column positions of ``X`` used by ``bestcv``, in order of entry.

    Raises
    ------
    ValueError
        If ``X`` has no columns.
    """
    p = X.shape[1]
    if p < 1:
        raise ValueError("X must contain at least one column")

    # Functions (imported after validation so bad input fails fast)
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import cross_val_score

    # Initialisation
    base = []
    candidates = list(range(p))
    bestcvscore = -np.inf
    bestcv = None
    subset = []

    # Forward recursion: exactly one variable enters per pass.
    for _ in range(p):
        # Start from -inf (not 0) so a candidate is always chosen, even when
        # every in-sample R-squared is zero or negative.
        bestscore = -np.inf
        best = None
        newvariable = None
        for variable in candidates:
            columns = base + [variable]
            ols = LinearRegression().fit(X.iloc[:, columns], y)
            score = ols.score(X.iloc[:, columns], y)
            if score > bestscore:
                bestscore = score
                best = ols
                newvariable = variable

        base.append(newvariable)
        candidates.remove(newvariable)

        cvscore = cross_val_score(
            best, X.iloc[:, base], y, scoring='neg_mean_squared_error'
        ).mean()
        if cvscore > bestcvscore:
            bestcvscore = cvscore
            bestcv = best
            subset = base[:]  # copy: base keeps growing on later passes

    return bestcv, subset
class forward:
    """Estimator-style wrapper around ``forwardselection``.

    Attributes are ``None`` until ``fit`` has been called.
    """

    def __init__(self):
        # Fitted OLS model returned by forwardselection.
        self.ols = None
        # Column positions of the selected predictors.
        self.subset = None

    def fit(self, X, y):
        """Run forward selection on ``(X, y)`` and store the result.

        Returns ``self`` so calls can be chained.
        """
        self.ols, self.subset = forwardselection(X, y)
        return self

    def predict(self, X):
        """Predict with the selected model using the selected columns of ``X``."""
        return self.ols.predict(X.iloc[:, self.subset])

    def cv_score(self, X, y, cv=10):
        """Return the square root of the mean cross-validated MSE.

        The stored model is cross-validated on the selected columns only;
        the variable selection itself is not repeated inside each fold.
        """
        from sklearn.model_selection import cross_val_score

        scores = cross_val_score(
            self.ols,
            X.iloc[:, self.subset],
            np.ravel(y),
            cv=cv,
            scoring='neg_mean_squared_error',
        )
        # Scores are negated MSEs, so flip the sign before the square root.
        return np.sqrt(-1 * np.mean(scores))
class PCR:
    """Principal components regression: PCA followed by OLS on the scores.

    Parameters
    ----------
    M : int, default 1
        Number of principal components passed to ``PCA(n_components=M)``.
    """

    def __init__(self, M=1):
        self.M = M
        # Fitted PCA transformer and OLS model; set by fit().
        self.pca = None
        self.pcr = None

    def fit(self, X, y):
        """Fit the PCA on ``X`` and regress ``y`` on the component scores.

        Returns ``self`` so calls can be chained.
        """
        from sklearn.decomposition import PCA
        from sklearn.linear_model import LinearRegression

        self.pca = PCA(n_components=self.M)
        Z = self.pca.fit_transform(X)
        self.pcr = LinearRegression().fit(Z, y)
        return self

    def predict(self, X):
        """Project ``X`` onto the fitted components and predict with the OLS model."""
        return self.pcr.predict(self.pca.transform(X))

    def cv_score(self, X, y, cv=10):
        """Return the square root of the mean cross-validated MSE.

        The already-fitted PCA transforms ``X`` once, and only the regression
        step is cross-validated; the PCA is not refitted inside each fold.
        """
        from sklearn.model_selection import cross_val_score

        Z = self.pca.transform(X)
        scores = cross_val_score(
            self.pcr, Z, np.ravel(y), cv=cv, scoring='neg_mean_squared_error'
        )
        # Scores are negated MSEs, so flip the sign before the square root.
        return np.sqrt(-1 * np.mean(scores))
def pcrCV(X, y):
    """Choose the number of PCR components by approximate cross-validation.

    For each candidate number of components ``m`` a ``PCR(M=m)`` model is
    fitted on all of ``X``; the regression on the resulting component scores
    is then scored by 10-fold cross-validated negative MSE. The validation is
    approximate because the PCA is not refitted inside each fold.

    Parameters
    ----------
    X : array-like with a two-element ``shape``
        Predictors.
    y : array-like
        Response values.

    Returns
    -------
    PCR
        The fitted model with the best score. Its ``cv_scores`` attribute is
        a ``pandas.Series`` of the mean negative MSE for every ``m`` tried,
        indexed by ``m``.

    Raises
    ------
    ValueError
        If ``X`` has no rows or columns, or no candidate produced a
        comparable (non-NaN) score.
    """
    n_obs, p = X.shape[0], X.shape[1]
    # PCA cannot extract more components than min(n_samples, n_features).
    max_m = min(n_obs, p)
    if max_m < 1:
        raise ValueError("X must contain at least one row and one column")

    from sklearn.model_selection import cross_val_score

    bestscore = -np.inf
    best = None
    cv_scores = []
    for m in range(1, max_m + 1):
        model = PCR(M=m)
        model.fit(X, y)
        Z = model.pca.transform(X)
        score = cross_val_score(
            model.pcr, Z, y, cv=10, scoring='neg_mean_squared_error'
        ).mean()
        cv_scores.append(score)
        if score > bestscore:
            bestscore = score
            best = model

    if best is None:
        raise ValueError("no candidate model produced a valid CV score")

    best.cv_scores = pd.Series(cv_scores, index=np.arange(1, max_m + 1))
    return best
def plsCV(X, y):
    """Choose the number of PLS components by 10-fold cross-validation.

    Fits ``PLSRegression`` with ``m = 1, ..., p - 1`` components, where ``p``
    is the number of columns of ``X``, and returns the fitted model with the
    highest mean negative MSE.

    Parameters
    ----------
    X : array-like with a two-element ``shape``
        Predictors.
    y : array-like
        Response values.

    Returns
    -------
    sklearn.cross_decomposition.PLSRegression
        The best-scoring model, fitted on all of ``(X, y)``.

    Raises
    ------
    ValueError
        If ``X`` has fewer than two columns (no candidate to try), or no
        candidate produced a comparable (non-NaN) score.
    """
    p = X.shape[1]
    if p < 2:
        raise ValueError("X must contain at least two columns")

    from sklearn.cross_decomposition import PLSRegression
    from sklearn.model_selection import cross_val_score

    bestscore = -np.inf
    best = None
    for m in range(1, p):  # not fitting with M=p avoids occasional problems
        pls = PLSRegression(n_components=m).fit(X, y)
        score = cross_val_score(
            pls, X, y, cv=10, scoring='neg_mean_squared_error'
        ).mean()
        if score > bestscore:
            bestscore = score
            best = pls  # keep the winning model so it can be returned

    if best is None:
        raise ValueError("no candidate model produced a valid CV score")
    return best
import matplotlib.pyplot as plt
import itertools
# This function is from the scikit-learn documentation
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels for both axes.
    normalize : bool, default False
        If true, every row is divided by its row sum before plotting.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Draws on the current matplotlib figure and returns ``None``.
    """
    cm = np.asarray(cm)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # 'd' is only valid for integer cells; a float matrix that was not
    # normalised would make format() raise, so fall back to '.3f'.
    use_int_format = not normalize and np.issubdtype(cm.dtype, np.integer)
    fmt = 'd' if use_int_format else '.3f'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Watermark inserted by a programming-help service (WeChat: powcoder, QQ: 1823890830, Email: powcoder@163.com); not part of the module