## ——————————————
##
## ST4060 / ST6015 / ST6040
## R script – Fri 19 Nov 2021 lecture
## Eric Wolsztynski
##
## Splines
##
## ——————————————
rm(list=ls())
datadir = "/Volumes/GoogleDrive/My Drive/teaching/st4060/data/"
library(splines)
# ————————————————————
# QUESTION 4.1
LT = read.table(paste(datadir,"irl_lifetable_2005.txt",sep=""),
	sep=",",header=TRUE)
head(LT)
# keep only the first 106 rows from LT:
SLT = LT[c(1:106),]
mx = SLT[,8]
x = SLT$Age # age grid
# roughly fit a Makeham model to this data:
onls = nls(mx~A+B*c^x,start=list(A=.0003, B=.00002, c=1.108))
ABc = summary(onls)$coef[,1]
# now add noise to the fitted force-of-mortality (f.o.m.) curve:
set.seed(1)
x = seq(0, 110, by=1)
mx = ABc[1]+ABc[2]*ABc[3]^x
mxn = mx
s1 = which(x<86)
s2 = which(x>85)
mxn[s1] = pmax(0.005,mx[s1]+rnorm(length(s1),0,.03))
mxn[s2] = mx[s2]+rnorm(length(s2),0,.06)
dat = data.frame(x,mx,mxn)
x = dat$x
mxn = dat$mxn
plot(x,mx,pch=21,col=8)
points(x,mxn,pch=20,cex=.8)
# fit NLS:
nls.ns = nls(mxn~A+B*c^x,start=list(A=.0003, B=.00002, c=1.108))
ABc.ns = summary(nls.ns)$coef[,1]
# smooth the raw data first, then fit NLS to the smoothed values:
pspl = smooth.spline(x, mxn)
sy = pspl$y
nls.ps = nls(sy~A+B*c^x,start=list(A=.0003, B=.00002, c=1.108))
ABc.ps = summary(nls.ps)$coef[,1]
cbind(ABc,ABc.ns,ABc.ps)
((ABc-ABc.ns)^2)   # squared errors w.r.t. reference parameters
((ABc-ABc.ps)^2)
((ABc-ABc.ns)/ABc) # relative errors w.r.t. reference parameters
((ABc-ABc.ps)/ABc)
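# (Optional sketch, not in the original exercise) Overlay the three fitted
# Makeham curves to visualise the effect of pre-smoothing on the NLS fit:
plot(x, mxn, pch=20, col=8, xlab="Age", ylab="Force of mortality")
lines(x, ABc[1]+ABc[2]*ABc[3]^x, lwd=2)
lines(x, ABc.ns[1]+ABc.ns[2]*ABc.ns[3]^x, col=2, lwd=2)
lines(x, ABc.ps[1]+ABc.ps[2]*ABc.ps[3]^x, col=4, lwd=2)
legend("topleft", lty=1, lwd=2, col=c(1,2,4), bty='n',
	legend=c("reference (noise-free) fit","NLS on noisy data","NLS on P-spline smooth"))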
# ————————————————————
# QUESTION 4.2
dat = read.csv(paste(datadir,"insdata.csv",sep=""))
age = dat$Age
mF = dat$mF
# (a)
p1 = smooth.spline(age, mF, cv=TRUE)
plot(age, mF, pch=20, t='b')
lines(p1, col=2, lwd=3)
# (b)
par = p1$spar
p2 = smooth.spline(age, mF, spar=par/2)
lines(p2, col=4, lwd=3)
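# (Optional check) halving spar corresponds to a much smaller smoothing
# parameter lambda, i.e. a rougher, more flexible fit:
c(p1$spar, p2$spar)
c(p1$lambda, p2$lambda)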
# (c)
# ?smooth.spline
p1$x-p2$x
plot(p1$x, p2$x); abline(a=0, b=1)
# (d) NB: we need to check that the P-spline data is ordered
# the same way as the original data; it's the case here
# because the original sample was already ordered, but this
# is not always true...
p1$x-age
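# (Sketch) If the data were not already sorted, a safer route is to
# evaluate the fitted spline back at the original ages via predict(), e.g.:
fit1 = predict(p1, x=age)$y
mean( (mF-fit1)^2 )   # same as mse1 below when the data are already ordered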
mse1 = mean( (mF-p1$y)^2 )
mse2 = mean( (mF-p2$y)^2 )
c(mse1, mse2)
sqrt(c(mse1, mse2)) # maybe easier to compare RMSEs instead?
# Explain the difference?
c(sd(p1$y-mF), sd(p2$y-mF))
c(mean(p1$y-mF), mean(p2$y-mF))
# undersmoothing gives a tighter (closer) fit to the data in this case...
# can we explain? (hint: it has to do with the data...)
# (e)
knots = quantile(age, c(.25,.50,.75))
BM = bs(age, knots=knots)
matplot(age,BM,xlab="Age",ylab="Spline basis",t='b')
attributes(BM)
# Note: df = length(knots) + degree (here 3+3 = 6)... cf. notes!!
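# Quick check: cubic B-splines (degree 3) with 3 interior knots should
# give a basis with length(knots) + 3 = 6 columns:
ncol(BM)
length(knots) + 3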
# (f) coordinates of age 60 on B-spline basis:
round(BM[which(age==60),],4)
matplot(age,BM,xlab="Age",ylab="Spline basis",t='b')
abline(v=60, lty=3)
abline(h=BM[which(age==60),], lty=2)
# (g) corresponding B-spline for (age, mF) data
bsp = lm(mF~BM)
coef(bsp)
summary(bsp)
# (h)
mseb = mean((fitted(bsp)-mF)^2)
sqrt(c(mse1, mse2, mseb))
plot(age,mF,t='b',pch=20,main="Crude force of mortality (Females)",
	xlab="Age (years)",ylab="Force of mortality",cex=1.2)
points(p1,col=2,t='l',pch='x',lwd=2)
points(age,fitted(bsp),col=3,t='l',pch='x',lwd=4)
points(p2,col=4,t='l',pch='x',lwd=2)
# This B-spline seems comparable to the 1st P-spline, and
# not as effective as the second one.
# (i)
# first, see that we're missing some ages:
plot(age)
# interpolation grid:
all.ages = seq(min(age),max(age),by=1)
plot(all.ages)
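# (Sketch) list the missing ages explicitly:
setdiff(all.ages, age)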
# – P-spline
sp1 = approx(p1,xout=all.ages)
sum((sp1$y)^2)
sd(sp1$y)
# – loess (local regression)
lo = loess(mF~age)
lpred = predict(lo,newdata=data.frame(age=all.ages))
sd(lpred)
plot(age,mF,t='b',pch=20,main="Crude force of mortality (Females)",
	xlab="Age (years)",ylab="Force of mortality",cex=1.2)
points(all.ages,sp1$y,col=2,t='b',pch='x',lwd=2)
points(all.ages,lpred,col=4,t='b',pch=20,lwd=2)
points(age,mF,pch=20)
# ————————————————————
# QUESTION 4.3
library(splines)
library(car)
head(Prestige)
x = Prestige$income
y = Prestige$prestige
plot(x, y, pch=20)
# local (loess) regression fit, for comparison
inc.100 <- seq(min(x), max(x), len=100)
mod.lo.inc <- loess(y ~ x, span=.7, degree=1)
names(mod.lo.inc)
points(mod.lo.inc$x, mod.lo.inc$fitted, pch=15, col=2)
# show the location of the "new points" on graph:
rug(inc.100)
# generate predictions (note that the column in the data.frame of
# new evaluation points must be named "x", to match the name of the
# predictor variable in the loess() call that yielded mod.lo.inc)
pres <- predict(mod.lo.inc, newdata=data.frame(x=inc.100))
# display these points
points(inc.100, pres, pch=20, cex=.8, col=4)
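# (Optional sketch) predict.loess() can also return standard errors, giving
# approximate pointwise confidence bands around the local fit:
pres.se <- predict(mod.lo.inc, newdata=data.frame(x=inc.100), se=TRUE)
lines(inc.100, pres.se$fit + 2*pres.se$se.fit, lty=2, col=4)
lines(inc.100, pres.se$fit - 2*pres.se$se.fit, lty=2, col=4)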
# Compute a P-spline
ssp <- smooth.spline(x, y)
# Compute a B-spline
B = bs(x, df=6)
sbp <- lm(y~B)$fitted
# plot(x, y, pch=20)
lines(ssp, lwd=3, col=4)
reorder = order(x)
points(x[reorder], sbp[reorder], pch=20, col=2, t='b', lwd=4)
reorder = order(mod.lo.inc$x)
lines(mod.lo.inc$x[reorder], mod.lo.inc$fitted[reorder],
lwd=3, col=8)
legend("bottomright", pch=c(20), col=c(1,2,3,8), bty='n',
legend=c("data","P-spline","B-spline",
"polynomial regression"))
plot(x, y, pch=20)
lines(ssp, lwd=3, col=4)
reorder = order(x)
points(x[reorder], sbp[reorder], pch=20, col=2, t='b', lwd=4)
# Now practice calculating the MSEs for these curves...
length(x)
length(y)
length(sbp)
length(ssp$y) # not the original N!!!
plot(sbp, y)
mse.b = mean( (sbp-y)^2 )
# For the P-spline, we need to interpolate back onto the original x values,
# since smooth.spline() only returns fitted values at the unique sorted x's:
spp = approx(ssp$x, ssp$y, xout=x, rule=2)$y
mse.p = mean( (spp-y)^2 )
c(mse.b, mse.p)
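# (Sketch) the loess fit can be compared on the same footing, since its
# fitted values are already aligned with the original observations:
mse.lo = mean( (fitted(mod.lo.inc)-y)^2 )
c(mse.b, mse.p, mse.lo)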