Advanced Analytics (QBUS3830)
Homework Task 1
In [1]:
import numpy as np
from scipy import stats
1. Logistic Regression
Bernoulli distribution
In [2]:
def bernoulli_log_like(pi, y):
    loglik = y*np.log(pi) + (1-y)*np.log(1-pi)
    return np.sum(loglik)

def bernoulli_score(pi, y):
    return y/pi - (1-y)/(1-pi)
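A quick sanity check (not part of the task): the log-likelihood above should match the sum of scipy's Bernoulli log-pmf on a small made-up example.
In [ ]:
# Illustrative values only; both prints should agree
pi_check = np.array([0.2, 0.7, 0.9])
y_check = np.array([0, 1, 1])
print(bernoulli_log_like(pi_check, y_check))
print(np.sum(stats.bernoulli.logpmf(y_check, pi_check)))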
Link function, inverse link function, and derivative
In [3]:
from scipy.special import logit, expit
def expit_derivative(x):
    y = expit(x)
    return y*(1-y)
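As an optional check, the analytical derivative of expit can be compared with a central finite difference (the grid and tolerance below are arbitrary).
In [ ]:
# Central finite difference of expit should match expit_derivative
x_check = np.linspace(-4, 4, 9)
eps = 1e-6
numeric = (expit(x_check + eps) - expit(x_check - eps)) / (2*eps)
print(np.allclose(expit_derivative(x_check), numeric, atol=1e-8))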
Loss function for minimisation
In [5]:
def bernoulli_loss(pi, y):
    n = len(y)
    return -(bernoulli_log_like(pi, y)/n)

def bernoulli_logit_loss(nu, y):
    pi = expit(nu)
    return bernoulli_loss(pi, y)

def logistic_reg_loss(pi, y):
    return (-y * np.log(pi) - (1 - y) * np.log(1 - pi)).mean()
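These are two parameterisations of the same average negative log-likelihood; with nu = logit(pi), all three functions should return the same value (illustrative numbers below).
In [ ]:
# With nu = logit(pi), the three loss functions agree
pi_check = np.array([0.2, 0.7, 0.9])
y_check = np.array([0, 1, 1])
print(bernoulli_loss(pi_check, y_check))
print(bernoulli_logit_loss(logit(pi_check), y_check))
print(logistic_reg_loss(pi_check, y_check))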
Gradient of the loss function
In [6]:
def logistic_reg_loss_jac(pi, y): # complete the function head
    n = len(y)
    # something goes here
    pi = expit(nu)
    jac = -bernoulli_score(pi, y)/n
    jac = jac*expit_derivative(nu)
    # keep working here, you must use the chain rule
    # pay attention to the required output
    return jac
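Once the loss and its gradient are completed, a useful check (not part of the assignment) is scipy.optimize.check_grad, which compares the analytical gradient with finite differences. The commented call below assumes the completed functions take the coefficient vector first, followed by the data, and should return a value close to zero.
In [ ]:
from scipy.optimize import check_grad

# Hypothetical check, assuming the completed functions have signature f(beta, X_train, y_train)
# beta0 = np.zeros(X_train.shape[1])
# print(check_grad(logistic_reg_loss, logistic_reg_loss_jac, beta0, X_train, y_train))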
Logistic regression class
In [ ]:
from scipy.optimize import minimize

class LogisticRegression:
    def __init__(self):
        pass

    def fit(self, X_train, y_train):
        n = len(y_train)
        # there is code that goes here
        output = minimize(logistic_reg_loss, start, args=(X_train, y_train), jac=logistic_reg_loss_jac, tol=1e-6, method='L-BFGS-B')
        self.output = output
        self.coef = output.x # logistic regression coefficients
        # standard errors based on the inverse Hessian returned by L-BFGS-B
        self.se = np.sqrt(np.diag((1/n)*output.hess_inv.todense()))

    def predict(self, X):
        # predicts one or zero given the predictor values, complete the function
        # predict one if the conditional probability is higher than 0.5
        pass

    def predict_proba(self, X):
        # computes the conditional probability given the predictor values, complete the function
        pass
2. Test code
2.1 Data and benchmark
In [2]:
import pandas as pd
data = pd.read_csv('Data\Acquisition.csv', index_col=0)
data.head()
Out[2]:
Acquisition Acq_Expense Industry Revenue Employees
Customer
1 1 760.36 1 30.16 1240
2 0 147.70 1 39.80 166
3 0 252.56 1 54.93 1016
4 1 609.73 1 45.83 122
5 1 672.36 1 69.03 313
In [4]:
import statsmodels.api as sm
y_train = data['Acquisition']
X_train = sm.add_constant(data.iloc[:,1:])
In [6]:
sm_logit = sm.Logit(y_train, X_train).fit()
print(sm_logit.summary())
Optimization terminated successfully.
Current function value: 0.217643
Iterations 9
Logit Regression Results
==============================================================================
Dep. Variable: Acquisition No. Observations: 500
Model: Logit Df Residuals: 495
Method: MLE Df Model: 4
Date: Tue, 28 Aug 2018 Pseudo R-squ.: 0.6795
Time: 13:20:39 Log-Likelihood: -108.82
converged: True LL-Null: -339.48
LLR p-value: 1.546e-98
===============================================================================
coef std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------
const -13.5205 1.497 -9.033 0.000 -16.454 -10.587
Acq_Expense 0.0215 0.002 9.678 0.000 0.017 0.026
Industry 0.1127 0.364 0.310 0.757 -0.601 0.826
Revenue 0.0274 0.011 2.471 0.013 0.006 0.049
Employees 0.0038 0.001 6.961 0.000 0.003 0.005
===============================================================================
2.2 Testing your implementation
It's time to test your code. Do you get the same coefficients and standard errors as the package, or at least very similar ones?
In [ ]:
logit = LogisticRegression()
logit.fit(X_train, y_train) # following the Scikit-Learn syntax
print(logit.coef)
print(logit.se)
3. Robit model
The Robit model is identical to the logistic regression model, except that the link function is the inverse of the Student's t CDF with a fixed number of degrees of freedom. Adapt the code from above to estimate it; you will have to go back to the code from the lectures to find an optimisation approach that works for this model.
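For reference, the building blocks of this link are available in scipy.stats: the t CDF plays the role of expit, the t quantile function plays the role of logit, and the t density is the derivative of the CDF. The helper names below are only suggestions, not part of the task.
In [ ]:
from scipy import stats

def t_cdf(x, df=5):
    # inverse link: maps the linear predictor to a probability
    return stats.t.cdf(x, df)

def t_cdf_derivative(x, df=5):
    # derivative of the inverse link is the t density
    return stats.t.pdf(x, df)

def t_quantile(p, df=5):
    # link function: maps a probability back to the linear predictor
    return stats.t.ppf(p, df)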
In [ ]:
class Robit:
    def __init__(self, nu=5):
        self.nu = nu # degrees of freedom

    def fit(self, X_train, y_train):
        n = len(y_train)
        # complete the function
        self.se = np.sqrt(np.diag((1/n)*output.hess_inv.todense()))

    def predict(self, X):
        # predicts one or zero given the predictor values, complete the function
        # predict one if the conditional probability is higher than 0.5
        pass

    def predict_proba(self, X):
        # computes the conditional probability given the predictor values, complete the function
        pass
4. Predictive analytics
4.1 Train-Test Split
In [ ]:
from sklearn.model_selection import train_test_split

response = 'Acquisition'
predictors = list(data.columns.values)
predictors.remove(response)

data['Acq_Expense'] = np.log(data['Acq_Expense'])

index_train, index_test = train_test_split(np.array(data.index), stratify=data[response], train_size=0.7, random_state=5)

y_train = data.loc[index_train, response].copy()
y_test = data.loc[index_test, response].copy()
X_train = data.loc[index_train, predictors].copy()
X_test = data.loc[index_test, predictors].copy()
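A quick optional check that the stratified split kept similar acquisition rates in the two samples:
In [ ]:
# Class proportions should be close in the training and test sets
print(y_train.mean(), y_test.mean())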
4.2 Maximum Likelihood Estimation
In [ ]:
logit = LogisticRegression()
logit.fit(X_train, y_train)
robit = Robit(nu=5) # you can experiment with different degrees of freedom
robit.fit(X_train, y_train)
4.3 Model Evaluation
In [ ]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, precision_score

columns = ['Accuracy', 'Sensitivity', 'Specificity', 'AUC', 'Precision']
rows = ['Logistic regression', 'Robit']
results = pd.DataFrame(0.0, columns=columns, index=rows)
methods = [logit, robit]

for i, method in enumerate(methods):
    y_pred = method.predict(X_test)
    y_prob = method.predict_proba(X_test)
    confusion = confusion_matrix(y_test, y_pred)
    results.iloc[i,0] = accuracy_score(y_test, y_pred)
    results.iloc[i,1] = confusion[1,1]/np.sum(confusion[1,:]) # sensitivity (true positive rate)
    results.iloc[i,2] = confusion[0,0]/np.sum(confusion[0,:]) # specificity (true negative rate)
    results.iloc[i,3] = roc_auc_score(y_test, y_prob)
    results.iloc[i,4] = precision_score(y_test, y_pred)

results.round(3)
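Once the predict methods are completed, an optional cross-check is that the sensitivity and specificity above agree with sklearn.metrics.recall_score evaluated with the appropriate pos_label.
In [ ]:
from sklearn.metrics import recall_score

# Sensitivity is recall of the positive class; specificity is recall of the negative class
for i, method in enumerate(methods):
    y_pred = method.predict(X_test)
    print(rows[i], recall_score(y_test, y_pred, pos_label=1), recall_score(y_test, y_pred, pos_label=0))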