## ------------------------------------------------------------
##
## ST4060 / ST6015 / ST6040
## R script - Wed 3 Nov 2021 lecture
## Eric Wolsztynski
##
## Overfitting and regularisation
##
## ------------------------------------------------------------

rm(list=ls())

library(glmnet)
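# NB: glmnet may need to be installed first; if so, uncomment:
# install.packages("glmnet")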

# -------------------------------------------------------------
# (0) Motivation: overfitting

# (a) illustration of overfitting in 1D

dat = na.omit(airquality)
x = dat$Temp
y = dat$Ozone
par(font=2, font.lab=2, font.axis=2) # use bold fonts in figure
plot(x, y, pch=20, xlab="Temperature (degrees F)", ylab="Ozone (ppb)",
	xlim=c(55,105))

dat = data.frame(x=x, y=y) # re-building data frame
lm.all = lm(y~., data=dat)

n = nrow(dat)
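# Quick reference point (illustrative): in-sample RMSE of the full-data fit,
# to compare against the test-set RMSE computed below
sqrt(mean(lm.all$residuals^2))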
i.train = which(y<100)
lm.train = lm(y~., data=dat[i.train,])

par(font=2, font.lab=2, font.axis=2) # use bold fonts in figure
plot(x, y, pch=20, xlab="Temperature (degrees F)", ylab="Ozone (ppb)",
	xlim=c(55,105))
points(dat[i.train,], pch=20, cex=1.2, col=2)
abline(lm.train, lwd=3, col=2)
abline(lm.all, lwd=2, col=1, lty=2)

lm.predict = predict(lm.train, newdata=dat[-i.train,])
sqrt(mean( (lm.predict-y[-i.train])^2 ))
points(x[-i.train], lm.predict, pch='*', col='blue', cex=1.6)
points(x[-i.train], y[-i.train], pch=10, col='blue', cex=1.6)

# (b) illustration of overfitting in multivariate regression

# Load some dataset
dat = mtcars
n = nrow(mtcars)
R = 20
K = 2
fit = mse = numeric(K*R)
fit.l = mse.l = numeric(K*R)
folds = cut(1:n, K, labels=FALSE)
set.seed(4060)
for(r in 1:R){
	# shuffle initial dataset
	dat = dat[sample(1:n, n, replace=FALSE),]
	# define x and y
	y = dat$mpg
	x = dat
	x$mpg = NULL # this removes "y" from "x"
	xm = as.matrix(x)
	for(k in 1:K){
		# split data into train and test sets
		i.train = which(folds!=k)
		# fit a GLM to the train set
		o = glm(y~., data=x, subset=i.train)
		# get fitted values
		yh = o$fitted.values
		# MSE from model fit
		fit[k+(r-1)*K] = mean((yh-y[i.train])^2)
		# generate predictions for test set from model fit
		yp = predict(o, newdata=x[-i.train,])
		# test set prediction MSE (potentially much larger!)
		mse[k+(r-1)*K] = mean((yp-y[-i.train])^2)
		# same comparison for the LASSO:
		cv.l = cv.glmnet(xm[i.train,], y[i.train], grouped=FALSE)
		lasso = glmnet(xm[i.train,], y[i.train], lambda=cv.l$lambda.min)
		yh.l = predict(lasso, newx=xm[i.train,])
		fit.l[k+(r-1)*K] = mean((yh.l-y[i.train])^2)
		yp.l = predict(lasso, newx=xm[-i.train,])
		mse.l[k+(r-1)*K] = mean((yp.l-y[-i.train])^2)
	}
}
boxplot(fit, mse, fit.l, mse.l,
	names=c("glm fit","glm error","lasso fit","lasso error"),
	ylim=c(0,25))

# -------------------------------------------------------------
# (1) Hitters example

library(ISLR)
set.seed(6026)
dat = na.omit(Hitters)
n = nrow(dat)
dat$Salary = log(dat$Salary)

REPS = 50
train.error = test.error = numeric(REPS)
lasso.train.error = lasso.test.error = numeric(REPS)
for(j in 1:REPS){
	dat = dat[sample(1:n, n, replace=FALSE),]
	i.train = c(1:200)
	train = dat[i.train,]
	test = dat[-i.train,]
	fit = glm(Salary~., data=train)
	pred = predict(fit, newdata=test)
	train.error[j] = mean(fit$residuals^2)
	test.error[j] = mean((pred-test$Salary)^2)
	xm = model.matrix(Salary~., data=train)[,-1]
	newxm = model.matrix(Salary~., data=test)[,-1]
	y = train$Salary
	lam = cv.glmnet(xm, y) # NB: cv.glmnet takes (x,y), not a 'data' argument
	lasso = glmnet(xm, y, lambda=lam$lambda.min)
	lasso.pred = predict(lasso, newx=newxm)
	lasso.fit = predict(lasso, newx=xm)
	lasso.train.error[j] = mean((lasso.fit-train$Salary)^2)
	lasso.test.error[j] = mean((lasso.pred-test$Salary)^2)
}
summary(fit)
coef(lasso)

par(font=2, font.axis=2)
boxplot(train.error, test.error,
	main="MLB player salary prediction",
	names=c("Training error", "Test error"),
	col=c('pink','cyan'))

par(font=2, font.axis=2)
boxplot(train.error, test.error, lasso.train.error, lasso.test.error,
	main="MLB player salary prediction",
	names=c("Training error", "Test error",
		"LASSO \ntraining error", "LASSO \ntest error"),
	col=c('pink','cyan','darkgreen','orange'))
abline(h=median(train.error), lty=2, col='cyan')
abline(h=median(lasso.train.error), lty=2, col='darkgreen')
abline(h=median(test.error), lty=2, col='pink')
abline(h=median(lasso.test.error), lty=2, col='orange')
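# Illustrative numerical summary of the boxplots above: median training
# vs test error for the GLM and the LASSO
round(c(glm.train=median(train.error), glm.test=median(test.error),
	lasso.train=median(lasso.train.error),
	lasso.test=median(lasso.test.error)), 4)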
# -------------------------------------------------------------
# (2) Prestige example

library(car) # contains dataset Prestige
pairs(Prestige)

# Let's simplify this dataset for this example...
# ?Prestige
# ... We want to model (predict)
# income: Average income of incumbents, dollars, in 1971.
# ... based on:
# education: Average education of occupational incumbents, years, in 1971.
# women: Percentage of incumbents who are women.
# prestige: Pineo-Porter prestige score for occupation,
#           from a social survey conducted in the mid-1960s.
y = Prestige$income
x = Prestige[,c("education","prestige","women")]
class(x) # note x is of class "data.frame" as opposed to "matrix"

lmo = lm(y~., data=x) # this works though (check out ?lm)
summary(lmo)

# how to predict from this fit:
newx = data.frame(education=seq(11,16,length=5),
	prestige=seq(14,80,length=5),
	women=seq(10,90,length=5))
lmo.pred = predict(lmo, newdata=newx)

# regularized regression (LASSO, ridge, etc.)
# NB: this one needs a little hack regarding the format of x,
# if x is a data.frame
try(lasso <- glmnet(x, y, alpha=1)) # fails: x is a data.frame (= list)
xm = model.matrix(y~., data=x)[,-1]
class(x); class(xm)
lasso = glmnet(xm, y, alpha=1)
coef(lasso)
# What's going on???
# Oh yeah, we need to pick one value for the smoothing parameter lambda!
lasso.cv = cv.glmnet(xm, y, alpha=1)
lasso = glmnet(xm, y, alpha=1, lambda=lasso.cv$lambda.min)

# let's compare LASSO with our earlier linear regression output:
cbind(coef(lasso), coef(lmo))

# how to predict from this fit:
# ?predict.glmnet
lasso.pred = predict(lasso, newx=as.matrix(newx))
# NB:
cbind(lasso.pred, lmo.pred) # close enough to the lm() predictions
mean((lasso.pred-lmo.pred)^2)
mean((lasso.pred-lmo.pred)/lmo.pred)*100 # less than 1% relative difference...

# -------------------------------------------------------------
# (3) Blood Pressure example

library(glmnet)
dat = read.table(file="data/Blood Pressure.txt", header=TRUE)
str(dat)
dat$PatientID = NULL # getting rid of this ID variable

# GLM fit
glm.fit = glm(Systolic~., data=dat)
mean(glm.fit$residuals^2)

# LASSO - note: fitting these models takes a bit more code...
alpha = 1
# a) prepare the data:
xm = model.matrix(Systolic~., data=dat)[,-1]
y = dat$Systolic
# b) compute best lambda:
lam = cv.glmnet(xm, y, alpha=alpha)
# c) fit model using best lambda:
lasso = glmnet(xm, y, alpha=alpha, lambda=lam$lambda.min)
# d) recover fitted values:
lasso.fit = predict(lasso, newx=xm)
# e) calculate RSS or MSE from LASSO:
mean((lasso.fit-y)^2)
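# Aside (illustrative sketch): the same a)-e) recipe with alpha=0 gives
# ridge regression; 'ridge.lam' and 'ridge' are illustrative names
ridge.lam = cv.glmnet(xm, y, alpha=0)
ridge = glmnet(xm, y, alpha=0, lambda=ridge.lam$lambda.min)
mean((predict(ridge, newx=xm)-y)^2) # ridge training MSE, for comparison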
# Defining more user-friendly functions...
# No need to be able to reproduce this.
regularized.model <- function(x, response, alpha=1, ...){
	# wrapping glmnet into a more user-friendly function
	# prepare the data
	if(!(response %in% names(x))){
		stop("The response name provided is not included in the dataset")
	}
	fml = as.formula(paste(response, "~."))
	xm = model.matrix(fml, data=x)[,-1]
	y = x[[response]]
	# pick the glmnet family based on the type of response
	if(is.factor(y)){
		if(length(levels(y))==2){
			fam = "binomial"
		} else {
			fam = "multinomial"
		}
	} else {
		fam = "gaussian"
	}
	lam = cv.glmnet(xm, y, alpha=alpha, family=fam, ...)
	mod = glmnet(xm, y, alpha=alpha, lambda=lam$lambda.min, family=fam, ...)
	mod.fit = predict(mod, newx=xm)
	return(list(model=mod, fitted.values=mod.fit, y=response, family=fam))
}

predict.regularized.model <- function(fit, test.data, ...){
	fml = as.formula(paste(fit$y, "~."))
	newx = model.matrix(fml, data=test.data)[,-1]
	return(predict(fit$model, newx, ...))
}

set.seed(4060)
ridge.fit = regularized.model(dat, "Systolic", alpha=0)
enet.fit = regularized.model(dat, "Systolic", alpha=.6)
lasso.fit = regularized.model(dat, "Systolic", alpha=1)

effects = cbind(as.numeric(coef(glm.fit)),
	as.numeric(coef(ridge.fit$model)),
	as.numeric(coef(enet.fit$model)),
	as.numeric(coef(lasso.fit$model)))
rownames(effects) = rownames(coef(ridge.fit$model))
colnames(effects) = c("GLM","ridge","e-net","LASSO")
Errors = c(mean(glm.fit$residuals^2),
	mean((ridge.fit$fitted.values-y)^2),
	mean((enet.fit$fitted.values-y)^2),
	mean((lasso.fit$fitted.values-y)^2))
effects = data.frame(rbind(effects, Errors))
round(effects, 3)
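# Illustrative usage sketch for the prediction wrapper: predicting on the
# training data itself here; in practice, pass genuinely new observations
head(predict.regularized.model(lasso.fit, dat))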