## ——————————————
##
## ST4060 / ST6015 / ST6040
## R script – Fri 22 Oct 2021 lecture
## Eric Wolsztynski
##
## ——————————————
# ————————————————
# Tutorial Question 2.8
library(bootstrap)
plot(law, pch=20, cex=2)
abline(lm(law$GPA~law$LSAT), col=3, lwd=4)
cor(law$LSAT,law$GPA)
B = 1000
bcor = numeric(B)
n = nrow(law)
for(b in 1:B){
ib = sample(1:n, size=n, replace=TRUE)
xb = law$LSAT[ib]
yb = law$GPA[ib]
bcor[b] = cor(xb,yb)
}
mean(bcor) – cor(law$LSAT,law$GPA)
sd(bcor)
quantile(bcor, c(.025,.975))
# ————————————————
# Tutorial Question 2.9
plot(cars, pch=20)
x = cars$speed
y = cars$dist
lmo = lm(y~x)
lmo$coef # original coef estimates
abline(lmo, col=3, lwd=3)
n = nrow(cars)
B = 10000
bcoef = matrix(NA, nrow=B, ncol=2)
for(b in 1:B){
# WRONG:!!
# xb = sample(x, replace=TRUE)
# yb = sample(y, replace=TRUE)
# RIGHT:!!
ib = sample(1:n, replace=TRUE)
xb = x[ib]
yb = y[ib]
lmb = lm(yb~xb)
bcoef[b,] = coef(lmb)
}
par(mfrow=c(1,2))
hist(bcoef[,1], col=’pink’, main=’Intercepts’)
hist(bcoef[,2], col=’pink’, main=’Slopes’)
lmo$coef[2]
mean(bcoef[,2]) – lmo$coef[2] # bias estimate (slope)
sd(bcoef[,2])
quantile(bcoef[,2], c(.025,.975))
# (d)
library(ellipse)
plot(bcoef, pch=20)
e = ellipse(lmo)
plot(e, t=’l’)
points(bcoef, pch=20, col=8)
# ————————————————
# Demo: how to use boot::boot()
# (because most of you end up trying)
library(boot)
my.mean <- function(x, ib){ return( mean(x[ib]) ) } # bootstrap using boot(): B = 100 x = cars$dist ob = boot(x, my.mean, R=B) # or by hand: # (results will differ because of random # sampling) (xbar = mean(x)) n = length(x) bmean = numeric(B) for (b in 1:B){ xb = sample(x, replace=TRUE) bmean[b] = mean(xb) } mean(bmean)-xbar sd(bmean) # ------------------------------------------------ # Tutorial Question 2.10 x = mtcars$disp y = mtcars$mpg plot(x,y,pch=20) nlso = nls(y~exp(a+b*x), start=list(a=3,b=-.01)) plot(fitted(nlso), y, pch=20, cex=2) plot(fitted(nlso)- y, pch=20, cex=2) hist(fitted(nlso)- y, col='pink') plot(x, y, pch=20, cex=2) points(x, fitted(nlso), pch=20, cex=2, col=4) is = order(x) lines(x[is], fitted(nlso)[is], pch=20, cex=2, col=4, lwd=5) # ------------------------------------------------ # Tutorial Question 2.11 library(MASS) n = nrow(Boston) dat = Boston[sample(1:n,n),] x = dat[,c('crim','indus','rm','tax')] y = dat$medv lmo = lm(y~., data=x) summary(lmo) n = nrow(x) # Repeated k-fold framework: R = 10 K = 5 errsR = NULL folds = cut(1:n, K, labels=FALSE) for(r in 1:R){ # shuffle the dataset every time # a k-fold CV is initiated: # (otherwise we re-use exactly the same # fold!!) is = sample(1:n,n) x = x[is,] y = y[is] # then perform k-fold CV: for(k in 1:K){ i.train = which(folds!=k) lmo = lm(y~., data=x, subset=i.train) i.test = which(folds==k) preds = predict(lmo, newdata=x[i.test,]) errsR = c(errsR, mean( (y[i.test]-preds)^2 )) } } K = 5 errs.5 = numeric(K) folds = cut(1:n, K, labels=FALSE) for(k in 1:K){ i.train = which(folds!=k) lmo = lm(y~., data=x, subset=i.train) i.test = which(folds==k) preds = predict(lmo, newdata=x[i.test,]) errs.5[k] = mean( (y[i.test]-preds)^2 ) } K = 10 errs.10 = numeric(K) folds = cut(1:n, K, labels=FALSE) for(k in 1:K){ i.train = which(folds!=k) lmo = lm(y~., data=x, subset=i.train) i.test = which(folds==k) preds = predict(lmo, newdata=x[i.test,]) errs.10[k] = mean( (y[i.test]-preds)^2 ) } boxplot(errsR,errs.5,errs.10,names=c("R",5,10))