## ——————————————
##
## ST4060 / ST6015 / ST6040
## R script – Wedn 20 Oct 2021 lecture
## Eric Wolsztynski
##
## ——————————————
## ——————————————
# Load some dataset
dat = mtcars
# shuffle it
n = nrow(mtcars)
dat = dat[sample(1:n, n), ]
# define x and y
y = dat$mpg
x = dat
x$mpg = NULL # this to remove “y” from “x”
# split data into train and test sets
i.train = c(1:floor(n/2))
# fit a gLM to the train set
o = glm(y~., data=x, subset=i.train)
# get fitted values
yh = o$fitted.values
# compare fitted with “true” values (the observations)
plot(yh,y[i.train])
boxplot((yh-y[i.train])^2)
# MSE from model fit
mean((yh-y[i.train])^2)
# generate predictions for test set from model fit
yp = predict(o, newdata=x[-i.train,])
# test set prediction MSE (potentially much larger!?)
mean((yp-y[-i.train])^2)
# now bootstrap this process, using OOB (out-of-bootstrap)
# sample points as test points
B=10
mseb = pred.errb = numeric(B)
for(b in 1:B){
# pick the boostrap resample points
i.train = sample(1:n,n,replace=TRUE)
o = glm(y~., data=x, subset=i.train)
yh = o$fitted.values
# boostrapped model fit MSE’s
mseb[b] = mean((yh-y[i.train])^2)
# compute out-of-bootstrap predictions
i.oob = unique(c(1:n)[-i.train])
yp = predict(o, newdata=x[i.oob,])
# corresponding boostrapped error MSE’s
pred.errb[b] = mean((yp-y[i.oob])^2)
}
boxplot(mseb, pred.errb)
# k-fold CV
K=5 # using 5 folds
mse = pred.err = numeric(K)
folds = cut(1:n, K, labels=FALSE)
for(k in 1:K){
# pick all folds but the k-th fold for training
i.train = which(folds!=k)
o = glm(y~., data=x, subset=i.train)
yh = o$fitted.values
# corresponding model fit MSE
mse[k] = mean((yh-y[i.train])^2)
# pick k-th fold for testing
i.test = which(folds==k)
yp = predict(o, newdata=x[i.test,])
# corresponding prediction error estimate
pred.err[k] = mean((yp-y[i.test])^2)
}
# compare distributions and comment:
boxplot(mse, pred.err, mseb, pred.errb)