留学生辅导 TA Week 4_Logistic Regression

TA Week 4_Logistic Regression

Table of Contents

1 Logistic Regression 1.1 Dataset : Portuguese Banking Institution
1.2 Logistic Regression

Logistic Regression¶

%matplotlib inline
from sklearn.linear_model import LogisticRegression #sklearn.linear_model contains all LinearRegression, Lasso, Ridge, ElasticNet
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns

Dataset : Portuguese Banking Institution¶

# Load the Portuguese banking institution dataset ('deposit.csv' must be in the working directory).
df = pd.read_csv('deposit.csv')
# Preview the first five rows.
df.head()

age job marital education default housing loan contact month day_of_week … campaign pdays previous poutcome emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed y
0 44 blue-collar married basic.4y unknown yes no cellular aug thu … 1 999 0 nonexistent 1.4 93.444 -36.1 4.963 5228.1 0
1 53 technician married unknown no no no cellular nov fri … 1 999 0 nonexistent -0.1 93.200 -42.0 4.021 5195.8 0
2 28 management single university.degree no yes no cellular jun thu … 3 6 2 success -1.7 94.055 -39.8 0.729 4991.6 1
3 39 services married high.school no no no cellular apr fri … 2 999 0 nonexistent -1.8 93.075 -47.1 1.405 5099.1 0
4 55 retired married basic.4y no yes no cellular aug fri … 1 3 1 success -2.9 92.201 -31.4 0.869 5076.2 1

5 rows × 21 columns

# General info of all the columns in our dataset
df.info()

RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
# Column Non-Null Count Dtype
— —— ————– —–
0 age 41188 non-null int64
1 job 41188 non-null object
2 marital 41188 non-null object
3 education 41188 non-null object
4 default 41188 non-null object
5 housing 41188 non-null object
6 loan 41188 non-null object
7 contact 41188 non-null object
8 month 41188 non-null object
9 day_of_week 41188 non-null object
10 duration 41188 non-null int64
11 campaign 41188 non-null int64
12 pdays 41188 non-null int64
13 previous 41188 non-null int64
14 poutcome 41188 non-null object
15 emp_var_rate 41188 non-null float64
16 cons_price_idx 41188 non-null float64
17 cons_conf_idx 41188 non-null float64
18 euribor3m 41188 non-null float64
19 nr_employed 41188 non-null float64
20 y 41188 non-null int64
dtypes: float64(5), int64(6), object(10)
memory usage: 6.6+ MB

## Understanding Data
# Explore the balance of counts for target variable
df['y'].value_counts()

0 36548
1 4640
Name: y, dtype: int64

# Bar chart of how many rows fall in each target class.
sns.countplot(x='y', data=df)  # Comment : Unbalanced data

## Numerical Values by each category
# numeric_only=True keeps the mean to the numeric columns; without it,
# pandas >= 2.0 raises on the object (string) columns such as job or marital.
df.groupby('y').mean(numeric_only=True)

# Useful information? Age?
# pdays (number of days that passed by after the client was last contacted from a previous campaign)?

age duration campaign pdays previous emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed
0 39.911185 220.844807 2.633085 984.113878 0.132374 0.248875 93.603757 -40.593097 3.811491 5176.166600
1 40.913147 553.191164 2.051724 792.035560 0.492672 -1.233448 93.354386 -39.789784 2.123135 5095.115991

## Deposit distribution by each category
# Cross-tabulate job against the target and draw one bar group per job.
pd.crosstab(df.job, df.y).plot(kind='bar')
plt.title('Deposit Frequency by each Job')
plt.xlabel('Job')
plt.ylabel('Frequency of Purchase')

Text(0, 0.5, ‘Frequency of Purchase’)

# Same job-vs-target counts as the bar chart, shown as a table.
pd.crosstab(index=df['job'], columns=df['y'])

admin. 9070 1352
blue-collar 8616 638
entrepreneur 1332 124
housemaid 954 106
management 2596 328
retired 1286 434
self-employed 1272 149
services 3646 323
student 600 275
technician 6013 730
unemployed 870 144
unknown 293 37

# Cross-tabulate marital status against the target and plot the raw counts.
pd.crosstab(df.marital, df.y).plot(kind='bar')
plt.title('Deposit Frequency by marriage status')
plt.xlabel('Marital Status')  # the x-axis shows marital status, not job
plt.ylabel('Frequency of Purchase')

# What if you wish to see in proportion?

Text(0, 0.5, ‘Frequency of Purchase’)

# Counts of each target class per marital status.
table = pd.crosstab(df.marital, df.y)
# Divide every row by its row total so each bar stacks to 1 (proportions, not counts).
table.div(table.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Marital Status vs Purchase')
plt.xlabel('Marital Status')
plt.ylabel('Proportion of Deposits')

Text(0, 0.5, ‘Proportion of Deposits’)

# Row-normalised version of whichever crosstab `table` currently holds.
# NOTE(review): the captured output below lists education levels (including the
# merged 'Basic'), so it appears to come from a run after `table` was rebuilt
# from df.education - confirm the intended cell order.
row_totals = table.sum(axis=1).astype(float)
table.div(row_totals, axis=0)

Basic 0.912971 0.087029
high.school 0.891645 0.108355
illiterate 0.777778 0.222222
professional.course 0.886515 0.113485
university.degree 0.862755 0.137245
unknown 0.854997 0.145003

# Education
table = pd.crosstab(df.education, df.y)
# Divide every row by its row total so each bar stacks to 1.
table.div(table.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Education vs Purchase')
plt.xlabel('Education')
plt.ylabel('Proportion of Deposits')

Text(0, 0.5, ‘Proportion of Deposits’)

# Too many categories for education
# Collapse the three 'basic.*' schooling levels into a single 'Basic' category.
basic_levels = ['basic.4y', 'basic.6y', 'basic.9y']
df.loc[df['education'].isin(basic_levels), 'education'] = 'Basic'

# Re-draw the education proportions now that the basic levels are merged.
table = pd.crosstab(df.education, df.y)
table.div(table.sum(axis=1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Education vs Purchase')
plt.xlabel('Education')
plt.ylabel('Proportion of Deposits')

Text(0, 0.5, ‘Proportion of Deposits’)

## Constructing Dataset
# Numerical Variables
df_final_X = df[['age', 'duration', 'campaign', 'pdays']]

# Dummy Variables for categorical variables : 'education'
from sklearn.preprocessing import OneHotEncoder
style = OneHotEncoder()
# One indicator column per education level; converted to a dense array below.
results = style.fit_transform(df[['education']])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available from scikit-learn 1.0).
education_dummies = pd.DataFrame(
    results.toarray(),
    columns=style.get_feature_names_out(['education']),
    index=df.index,  # keep the dummy rows aligned with df for the join
)
df_final_X = df_final_X.join(education_dummies)

# Display the assembled feature matrix.
df_final_X

age duration campaign pdays education_Basic education_high.school education_illiterate education_professional.course education_university.degree education_unknown
0 44 210 1 999 1.0 0.0 0.0 0.0 0.0 0.0
1 53 138 1 999 0.0 0.0 0.0 0.0 0.0 1.0
2 28 339 3 6 0.0 0.0 0.0 0.0 1.0 0.0
3 39 185 2 999 0.0 1.0 0.0 0.0 0.0 0.0
4 55 137 1 3 1.0 0.0 0.0 0.0 0.0 0.0
… … … … … … … … … … …
41183 59 222 1 999 0.0 1.0 0.0 0.0 0.0 0.0
41184 31 196 2 999 1.0 0.0 0.0 0.0 0.0 0.0
41185 42 62 3 999 0.0 0.0 0.0 0.0 1.0 0.0
41186 48 200 2 999 0.0 0.0 0.0 1.0 0.0 0.0
41187 25 112 4 999 0.0 1.0 0.0 0.0 0.0 0.0

41188 rows × 10 columns

# Target variable, kept as a one-column DataFrame (double brackets).
df_final_Y = df[['y']]

Logistic Regression¶

from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; a fixed random_state makes the split
# (and therefore the scores below) reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_final_X, df_final_Y, test_size=0.33, random_state=42
)

# Run logistic regression
logistic = LogisticRegression()
# Y_train is a one-column DataFrame; flatten it to shape (n_samples,) so that
# scikit-learn does not emit a DataConversionWarning.
logistic.fit(X_train, np.ravel(Y_train))
print(f"Test set score: {logistic.score(X_test, Y_test):.2f}")  # Accuracy Score

Test set score: 0.90

/Users/kgr0427/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py:72: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(**kwargs)

## Other Metrics? Precision and Recall
# Precision on the held-out test set
from sklearn.metrics import precision_score
test_predictions = logistic.predict(X_test)
precision_score(Y_test, test_predictions)

0.657608695652174

# Recall on the held-out test set
from sklearn.metrics import recall_score
recall_score(y_true=Y_test, y_pred=logistic.predict(X_test))

0.3142857142857143

程序代写 CS代考加微信: powcoder QQ: 1823890830 Email: powcoder@163.com

Related Posts