# The aim of this session is to compare different models
# and to select the best one able to predict variable y
# we first load the dataset
menti = read.csv(“menti_200_full.csv”)
# before calibrating the models it is always
# output summary statistics
library(stargazer)
stargazer(menti, type = “text”)
# you can also output several plots to check
# if there are any outliers or the correlation and
# dependence between variables
# we then need to split the data between train and test
# train data will be used to calibrate the models
# I am using 75% of the samle as train
idx_train = sample(1:400, size = 300)
menti_train = menti[idx_train, ]
# the remaing 25% will be used
# to compare out-of-sample performance
menti_test = menti[-idx_train, ]
# we can start by estimating an ols model
ols_mod1 = lm(y ~ ., data = menti_train)
stargazer(ols_mod1, type = “text”)
# and we can construct our performance measures
# with the predict function
ols_pred1 = predict(ols_mod1, newdata = menti_test)
# which of these two measures penalizes more large errors
# vs smaller ones
# RMSE
sqrt(mean((ols_pred1 – menti_test$y)^2))
# MAE
mean(abs(ols_pred1 – menti_test$y))
# how would you construct in R a performance measure
# which uses absolute errors but penalises negative
# errors twice as much as positive ones?
# another useful model is the Elastic-Net
# from package glmnet
library(glmnetUtils)
# E-Net has two params: alpha and lambda
# alpha is set by the user and it is between 0 and 1
# alpha = 1 (default) gives the LASSO
# alpha = 0 gives the RIDGE
# any value in between is for a combination of the two
# which model are we estimating here?
enet_mod2 = glmnet(y ~ ., data = menti_train)
# stargazer does not support glmnet, yet
stargazer(enet_mod2, type = “text”)
# we can still use predict to compute out-of-sample performance
enet_pred2 = predict(enet_mod2, newdata = menti_test)
# which model between ols and E-Net performs better
# RMSE
sqrt(mean((enet_pred2 – menti_test$y)^2))
# MAE
mean(abs(enet_pred2 – menti_test$y))
# you can use function cv.glmnet to optimize lambda by
# cross validation but you still need to provide alpha
cv_enet_mod1 = cv.glmnet(x = as.matrix(menti_train[,-1]),
y = as.matrix(menti_train[,1]),
alpha = 0.5)
# therefore you need to use a loop by different alpha’s
# to pick the best one
# RMSE
sqrt(mean((cv_enet_mod1 – menti_test$y)^2))
# MAE
mean(abs(cv_enet_mod1 – menti_test$y))
# cva.glmnet will apply cv.glmnet by different alpha’s
# so that you can do the comparison after
cva_enet_mod1 = cva.glmnet(x = as.matrix(menti_train[,-1]),
y = as.matrix(menti_train[,1]))
for(alpha in cva_enet_mod1$alpha) {
cva_enet_pred1 = predict(cva_enet_mod1,
newx = as.matrix(menti_test[,-1]),
alpha = alpha)
cat(alpha, sqrt(mean((cva_enet_pred1 – menti_test$y)^2)), “\n”)
}