In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("notebook")
#sns.set_context("poster")
In [2]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
Basic Classification Algorithms
Here we review six of the most well-known classification algorithms.
Two linear:
• Logistic Regression.
• Linear Discriminant Analysis.
and four non-linear:
• k-nn – k-Nearest Neighbors.
• Naive Bayes.
• CART – Classification and Regression Trees.
• SVM – Support Vector Machines.
Then we will address the simple and common question: which algorithm should I use on this dataset?
In all cases we will use a dataset we are already familiar with, the Pima Indians dataset, evaluated with 10-fold cross-validation.

In this exercise we will use one of the traditional machine learning datasets, the Pima Indians diabetes dataset.
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database; in particular, all patients are females at least 21 years old of Pima Indian heritage.
The dataset consists of several medical predictor variables and one target variable, Outcome. The predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on:
▪ Pregnancies
▪ Glucose
▪ BloodPressure
▪ SkinThickness
▪ Insulin
▪ BMI
▪ DiabetesPedigreeFunction (scores the likelihood of diabetes based on family history)
▪ Age
▪ Outcome
In [6]:
# Load the Pima Indians dataset and separate input and output components
from numpy import set_printoptions
set_printoptions(precision=3)
filename="pima-indians-diabetes.data.csv"
names=["pregnancies", "glucose", "pressure", "skin", "insulin", "bmi", "pedi", "age", "outcome"]
p_indians=pd.read_csv(filename, names=names)
p_indians.head()
# First we separate into input and output components
array=p_indians.values
X=array[:,0:8]
y=array[:,8]
np.set_printoptions(suppress=True)
X
pd.DataFrame(X).head()
# Now we standardize our data
std_scaler=preprocessing.StandardScaler()
X_std=std_scaler.fit_transform(X)
minmax_scaler=preprocessing.MinMaxScaler()
X_minmax=minmax_scaler.fit_transform(X)
# Create the DataFrames for plotting
resall=pd.DataFrame()
res_w1=pd.DataFrame()
res_w2=pd.DataFrame()
res_w3=pd.DataFrame()
Out[6]:
   pregnancies  glucose  pressure  skin  insulin   bmi   pedi  age  outcome
0            6      148        72    35        0  33.6  0.627   50        1
1            1       85        66    29        0  26.6  0.351   31        0
2            8      183        64     0        0  23.3  0.672   32        1
3            1       89        66    23       94  28.1  0.167   21        0
4            0      137        40    35      168  43.1  2.288   33        1
Out[6]:
array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])
Out[6]:
     0      1     2     3      4     5      6     7
0  6.0  148.0  72.0  35.0    0.0  33.6  0.627  50.0
1  1.0   85.0  66.0  29.0    0.0  26.6  0.351  31.0
2  8.0  183.0  64.0   0.0    0.0  23.3  0.672  32.0
3  1.0   89.0  66.0  23.0   94.0  28.1  0.167  21.0
4  0.0  137.0  40.0  35.0  168.0  43.1  2.288  33.0
Logistic Regression¶
Logistic regression is probably the best-known and the oldest of these algorithms, and we are already quite familiar with it.
Logistic regression assumes a Gaussian distribution for the numeric input variables and can solve binary and multi-class classification problems.
We will use the LogisticRegression class.
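The cells below measure performance with cross_val_score only. As a complementary, minimal sketch (not part of the comparison that follows), we can also fit the model on a single hold-out split and look at the predicted probabilities, using the train_test_split and accuracy_score imports from the top of the notebook; X_std and y come from the loading cell above, and the 0.33 test size and random_state are arbitrary choices.
In [ ]:
# Minimal hold-out sketch: fit on a train split, score on a test split, inspect probabilities.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.33, random_state=7)
lr = LogisticRegression(solver="liblinear")
lr.fit(X_train, y_train)
print(f"Hold-out accuracy: {accuracy_score(y_test, lr.predict(X_test)):.3f}")
# predict_proba returns, for each row, the probability of outcome 0 and outcome 1
print(lr.predict_proba(X_test[:5]))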
In [7]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
kfold=KFold(n_splits=10, random_state=7)
model=LogisticRegression(solver="liblinear")
results=cross_val_score(model, X, y, cv=kfold)
print(f'Logistic Regression - Accuracy {results.mean()*100:.3f}% std {results.std()*100:3f}')
results_scl=cross_val_score(model, X_std, y, cv=kfold)
print(f'Logistic Regression (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:3f}')
results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)
print(f'Logistic Regression ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:3f}')
# if the range of the variables is large, scaling doesn't matter in a logistic regression,
# but if you are not sure whether it is (or you don't want to check ...) just try!
res_w1["Res"]=results
res_w1["Type"]="log"
res_w2["Res"]=results_scl
res_w2["Type"]="log -1..1"
res_w3["Res"]=results_minmax
res_w3["Type"]="log 0..1"
resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)
Logistic Regression - Accuracy 76.951% std 4.841052
Logistic Regression (-1..1) - Accuracy 77.996% std 5.008801
Logistic Regression ( 0..1) - Accuracy 76.174% std 5.218473
LDA – Linear Discriminant Analysis¶
Linear Discriminant Analysis or discriminant analysis is a generalization of Fisher’s linear discriminant, originally developed by Ronald Fisher in 1936. Although it is different from ANOVA (Analysis of variance), they are closely related.
LDA also assumes a Gaussian distribution of the numerical input variables and can be used for binary or multi-class classification.
We will use the LinearDiscriminantAnalysis class.
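As a minimal side sketch (not used in the comparison below), a fitted LinearDiscriminantAnalysis can also project the data onto at most n_classes - 1 discriminant components via transform(); with two classes that is a single axis. It reuses X_std and y from the loading cell above.
In [ ]:
# LDA as a classifier also provides a supervised projection of the data.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
X_lda = lda.fit_transform(X_std, y)
print(X_lda.shape)    # (768, 1): one discriminant component for the two classes
print(lda.priors_)    # estimated class priors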
In [8]:
# LDA – Linear Discriminant Analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
kfold=KFold(n_splits=10, random_state=7)
model=LinearDiscriminantAnalysis()
results=cross_val_score(model, X, y, cv=kfold)
print(f'LDA Linear Discriminant Analysis - Accuracy {results.mean()*100:.3f}% std {results.std()*100:3f}')
results_scl=cross_val_score(model, X_std, y, cv=kfold)
print(f'LDA (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:3f}')
results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)
print(f'LDA ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:3f}')
res_w1["Res"]=results
res_w1["Type"]="LDA"
res_w2["Res"]=results_scl
res_w2["Type"]="LDA -1..1"
res_w3["Res"]=results_minmax
res_w3["Type"]="LDA 0..1"
resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)
LDA Linear Discriminant Analysis - Accuracy 77.346% std 5.159180
LDA (-1..1) - Accuracy 77.346% std 5.159180
LDA ( 0..1) - Accuracy 77.346% std 5.159180
k-nn k-Nearest Neighbors¶
k-Nearest Neighbors is a non-linear machine learning algorithm that uses a distance metric to find the k most similar instances, taking the majority outcome of those neighbors as the prediction.
One interesting advantage of this algorithm is that we can choose the metric used for calculating the distance. The default metric is Minkowski, which is equivalent to the Euclidean distance with p=2 and can easily be turned into the Manhattan distance with p=1.
For constructing a k-nn model we will use the KNeighborsClassifier class.
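Before the main cell, here is a minimal sketch of the metric choice mentioned above, comparing p=2 (Euclidean) and p=1 (Manhattan) on the standardized data; shuffle=True is added because newer scikit-learn versions require it when a random_state is passed to KFold, and n_neighbors=5 is simply the default.
In [ ]:
# Comparing the Minkowski metric with p=2 (Euclidean) and p=1 (Manhattan).
from sklearn.neighbors import KNeighborsClassifier
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
for p, name in [(2, "euclidean"), (1, "manhattan")]:
    knn = KNeighborsClassifier(n_neighbors=5, p=p)
    scores = cross_val_score(knn, X_std, y, cv=kfold)
    print(f"KNN ({name}) - Accuracy {scores.mean()*100:.3f}%")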
In [9]:
# KNN Classification
from sklearn.neighbors import KNeighborsClassifier
kfold=KFold(n_splits=10, random_state=7)
model=KNeighborsClassifier()
results=cross_val_score(model, X, y, cv=kfold)
print(f'KNN - Accuracy {results.mean()*100:.3f}% std {results.std()*100:3f}')
results_scl=cross_val_score(model, X_std, y, cv=kfold)
print(f'KNN (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:3f}')
results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)
print(f'KNN ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:3f}')
# scaling in knn is necessary ...
res_w1["Res"]=results
res_w1["Type"]="KNN"
res_w2["Res"]=results_scl
res_w2["Type"]="KNN -1..1"
res_w3["Res"]=results_minmax
res_w3["Type"]="KNN 0..1"
resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)
KNN - Accuracy 72.656% std 6.182131
KNN (-1..1) - Accuracy 74.214% std 7.149955
KNN ( 0..1) - Accuracy 74.475% std 6.056750
Naive Bayes¶
In Naive Bayes each instance is represented by a vector of features, and every feature is assumed to be independent of the others given the class (the "naive" part of the name comes from this assumption). Class probabilities are calculated following the Bayesian approach.
In spite of its oversimplified assumptions, the algorithm works quite well in complex, real-world situations. It is particularly useful with small samples of data.
For Naive Bayes we will use the GaussianNB class.
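As a minimal sketch of what the "naive" model actually learns, a fitted GaussianNB stores one prior per class and one Gaussian (mean, variance) per class and per feature; it reuses X and y from the loading cell above. The attribute names below are those of recent scikit-learn releases (older versions expose the variances as sigma_ instead of var_).
In [ ]:
# What GaussianNB learns: class priors plus a per-class, per-feature Gaussian.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X, y)
print(nb.class_prior_)    # P(outcome=0), P(outcome=1)
print(nb.theta_.shape)    # (2, 8): per-class mean of each of the 8 features
print(nb.var_.shape)      # (2, 8): per-class variance of each feature (sigma_ in older versions)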
In [10]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
kfold=KFold(n_splits=10, random_state=7)
model=GaussianNB()
results=cross_val_score(model, X, y, cv=kfold)
print(f'Naive Bayes - Accuracy {results.mean()*100:.3f}% std {results.std()*100:3f}')
results_scl=cross_val_score(model, X_std, y, cv=kfold)
print(f'Naive Bayes (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:3f}')
results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)
print(f'Naive Bayes ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:3f}')
res_w1["Res"]=results
res_w1["Type"]="NB"
res_w2["Res"]=results_scl
res_w2["Type"]="NB -1..1"
res_w3["Res"]=results_minmax
res_w3["Type"]="NB 0..1"
resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)
Naive Bayes - Accuracy 75.518% std 4.276594
Naive Bayes (-1..1) - Accuracy 75.518% std 4.276594
Naive Bayes ( 0..1) - Accuracy 75.518% std 4.276594
CART – Classification and Regression Trees¶
CART builds a binary tree from the data, where the splits are chosen greedily by evaluating all the attributes in order to minimize a cost function (typically the Gini index or entropy).
Decision trees are the basis for random forests and more sophisticated algorithms.
For CART we will use the DecisionTreeClassifier class.
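A minimal sketch of the cost-function choice mentioned above: DecisionTreeClassifier takes criterion="gini" (the default) or criterion="entropy". Fixing random_state makes repeated runs reproducible; max_depth=4 is an arbitrary illustrative value, not a tuned one, and shuffle=True is needed in newer scikit-learn when KFold gets a random_state.
In [ ]:
# Comparing the two usual split criteria with a fixed random_state for reproducibility.
from sklearn.tree import DecisionTreeClassifier
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
for crit in ["gini", "entropy"]:
    tree = DecisionTreeClassifier(criterion=crit, max_depth=4, random_state=7)
    scores = cross_val_score(tree, X, y, cv=kfold)
    print(f"CART ({crit}) - Accuracy {scores.mean()*100:.3f}%")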
In [ ]:
# Decision Trees
# Please observe that in this case repeating the algorithm gives different results
# scaling doesn't matter in this case - you get slightly different results, but within the same range
from sklearn.tree import DecisionTreeClassifier
kfold=KFold(n_splits=10, random_state=7)
model=DecisionTreeClassifier()
results=cross_val_score(model, X, y, cv=kfold)
print(f'Decision Tree - Accuracy {results.mean()*100:.3f}% std {results.std()*100:3f}')
results_scl=cross_val_score(model, X_std, y, cv=kfold)
print(f'Decision Tree (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:3f}')
results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)
print(f'Decision Tree ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:3f}')
res_w1["Res"]=results
res_w1["Type"]="DT"
res_w2["Res"]=results_scl
res_w2["Type"]="DT -1..1"
res_w3["Res"]=results_minmax
res_w3["Type"]="DT 0..1"
resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)
Support Vector Machines¶
Support Vector Machines seek the line (or hyperplane) that best separates two classes. The data instances that are closest to this separating line, and therefore best separate the classes, are called support vectors.
Support Vector Machines have the advantage that you can change the kernel function. The radial basis function, a pretty powerful kernel, is used by default.
You can construct an SVM model with the SVC class.
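A minimal sketch of the kernel choice mentioned above, evaluated on the standardized data: "rbf" is the default, while "linear" and "poly" are common alternatives (the polynomial degree is left at its default of 3, and shuffle=True is added for newer scikit-learn versions).
In [ ]:
# Comparing a few SVC kernels on the standardized inputs.
from sklearn.svm import SVC
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
for kernel in ["rbf", "linear", "poly"]:
    svm = SVC(kernel=kernel, gamma="scale")
    scores = cross_val_score(svm, X_std, y, cv=kfold)
    print(f"SVM ({kernel}) - Accuracy {scores.mean()*100:.3f}%")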
In [ ]:
# SVM – Support Vector Machines
from sklearn.svm import SVC
kfold=KFold(n_splits=10, random_state=7)
model=SVC(gamma="scale")
results=cross_val_score(model, X, y, cv=kfold)
print(f'Support Vector Machines - Accuracy {results.mean()*100:.3f}% std {results.std()*100:3f}')
results_scl=cross_val_score(model, X_std, y, cv=kfold)
print(f'SVM (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:3f}')
results_minmax=cross_val_score(model, X_minmax, y, cv=kfold)
print(f'SVM ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:3f}')
# the importance of scaling depends on the kernel used
res_w1["Res"]=results
res_w1["Type"]="SVM"
res_w2["Res"]=results_scl
res_w2["Type"]="SVM -1..1"
res_w3["Res"]=results_minmax
res_w3["Type"]="SVM 0..1"
resall=pd.concat([resall,res_w1,res_w2,res_w3], ignore_index=True)
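The six cells above repeat the same evaluation pattern; a compact alternative, sketched below, is to loop over (name, model) pairs with a single KFold splitter. It reuses X_std and y, mirrors the model settings used above, and adds shuffle=True because newer scikit-learn requires it when random_state is set.
In [ ]:
# Evaluating all six models in one loop over the standardized data.
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
models = [("LR", LogisticRegression(solver="liblinear")),
          ("LDA", LinearDiscriminantAnalysis()),
          ("KNN", KNeighborsClassifier()),
          ("NB", GaussianNB()),
          ("CART", DecisionTreeClassifier()),
          ("SVM", SVC(gamma="scale"))]
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
for name, model in models:
    scores = cross_val_score(model, X_std, y, cv=kfold)
    print(f"{name}: {scores.mean()*100:.3f}% (+/- {scores.std()*100:.3f})")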
Algorithm Comparison¶
In [11]:
# Now let's compare them all
plt.figure(figsize=(15,9))
sns.boxplot(data=resall, x="Type", y="Res")
sns.swarmplot(data=resall, x="Type", y="Res", color="royalblue")
Out[11]:
[Box plot with overlaid swarm plot of the cross-validation accuracies (Res) for each algorithm and scaling (Type)]
Mission 1
a) Do the same with the Times Ranking, predicting whether a school is among the 10 best Business Schools.
b) Try the Titanic dataset (you'll find all the info that you need on Kaggle).
In [36]:
times=pd.read_csv('timesData.csv', thousands=',')
times['international_students'] = times['international_students'].str.rstrip('%').astype('float') / 100.0
times=times.fillna(times.mean().round(1))
times=times.replace('-',0)
times_rank1=times['world_rank'].str.split('-',expand=True)
times_rank2=times['female_male_ratio'].str.split(':',expand=True)
times['world_rank']=times_rank1[0]
times['world_rank']=pd.to_numeric(times['world_rank'],errors='coerce')
times['female_male_ratio']=times_rank2[0]
times=times.fillna(0)
t10=times.head(10)
t10
array=t10.values
a=array[:,3:]
b=array[:,0]
#standardize
std_scaler=preprocessing.StandardScaler()
a_std=std_scaler.fit_transform(a)
minmax_scaler=preprocessing.MinMaxScaler()
a_minmax=minmax_scaler.fit_transform(a)
allres=pd.DataFrame()
res01=pd.DataFrame()
res02=pd.DataFrame()
res03=pd.DataFrame()
Out[36]:
   world_rank                        university_name                   country  teaching  international  research  citations  income  total_score  num_students  student_staff_ratio  international_students  female_male_ratio  year
0         1.0                     Harvard University  United States of America      99.7           72.4      98.7       98.8    34.5         96.1       20152.0                  8.9                    0.25                  0  2011
1         2.0     California Institute of Technology  United States of America      97.7           54.6      98.0       99.9    83.7         96.0        2243.0                  6.9                    0.27                 33  2011
2         3.0  Massachusetts Institute of Technology  United States of America      97.8           82.3      91.4       99.9    87.5         95.6       11074.0                  9.0                    0.33                 37  2011
3         4.0                    Stanford University  United States of America      98.3           29.5      98.1       99.2    64.3         94.3       15596.0                  7.8                    0.22                 42  2011
4         5.0                   Princeton University  United States of America      90.9           70.3      95.4       99.9       0         94.2        7929.0                  8.4                    0.27                 45  2011
5         6.0                University of Cambridge            United Kingdom      90.5           77.7      94.1       94.0    57.0         91.2       18812.0                 11.8                    0.34                 46  2011
6         6.0                   University of Oxford            United Kingdom      88.2           77.2      93.9       95.1    73.5         91.2       19919.0                 11.6                    0.34                 46  2011
7         8.0     University of California, Berkeley  United States of America      84.2           39.6      99.3       97.8       0         91.1       36186.0                 16.4                    0.15                 50  2011
8         9.0                Imperial College London            United Kingdom      89.2           90.0      94.5       88.3    92.9         90.6       15060.0                 11.7                    0.51                 37  2011
9        10.0                        Yale University  United States of America      92.1           59.2      89.7       91.5       0         89.5       11751.0                  4.4                    0.20                 50  2011
In [38]:
from sklearn.linear_model import LogisticRegression
kfold=KFold(n_splits=10, random_state=7)
model=LogisticRegression(solver="liblinear")
results=cross_val_score(model, a, b, cv=kfold)
print(f'Logistic Regression - Accuracy {results.mean()*100:.3f}% std {results.std()*100:3f}')
results_scl=cross_val_score(model, a_std, b, cv=kfold)
print(f'Logistic Regression (-1..1) - Accuracy {results_scl.mean()*100:.3f}% std {results_scl.std()*100:3f}')
results_minmax=cross_val_score(model, a_minmax, b, cv=kfold)
print(f'Logistic Regression ( 0..1) - Accuracy {results_minmax.mean()*100:.3f}% std {results_minmax.std()*100:3f}')
# if the range of the variables is large, scaling doesn't matter in a logistic regression,
# but if you are not sure whether it is (or you don't want to check ...) just try!
res01["Res"]=results
res01["Type"]="log"
res02["Res"]=results_scl
res02["Type"]="log -1..1"
res03["Res"]=results_minmax
res03["Type"]="log 0..1"
allres=pd.concat([allres,res01,res02,res03], ignore_index=True)
D:\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:528: FutureWarning: From version 0.22, errors during fit will result in a cross validation score of NaN by default. Use error_score='raise' if you want an exception raised or error_score=np.nan to adopt the behavior from version 0.22.
  FutureWarning)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
      5 model=LogisticRegression(solver="liblinear")
      6
----> 7 results=cross_val_score(model, a, b, cv=kfold)
      8
      9 print(f'Logistic Regression - Accuracy {results.mean()*100:.3f}% std {results.std()*100:3f}')
...
D:\anaconda3\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y)
    167     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    168                       'multilabel-indicator', 'multilabel-sequences']:
--> 169         raise ValueError("Unknown label type: %r" % y_type)
    170
    171
ValueError: Unknown label type: 'unknown'
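The error occurs because the target b holds the raw world_rank values taken from an object-dtype NumPy array, which scikit-learn cannot interpret as class labels, and the model only sees the ten rows of t10. Below is a minimal sketch of one way to set the mission up instead, building a binary top-10 target over the whole processed times DataFrame from In [36]; the dropped columns, the fillna(0), and the shuffled KFold are illustrative choices, not the only reasonable ones.
In [ ]:
# One possible setup: a binary "top 10" target over the full dataset.
from sklearn.linear_model import LogisticRegression
top10 = (times["world_rank"] <= 10).astype(int)                       # 1 = among the 10 best
features = times.drop(columns=["world_rank", "university_name", "country"])
features = features.apply(pd.to_numeric, errors="coerce").fillna(0)   # force numeric predictors
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = LogisticRegression(solver="liblinear")
scores = cross_val_score(model, features.values, top10.values, cv=kfold)
print(f"Top-10 Logistic Regression - Accuracy {scores.mean()*100:.3f}%")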
In [ ]: