##
## ST4060 / ST6015 / ST6040
## Tutorial exercises – Section 2
## Eric Wolsztynski
##
# ------------------------------------------------------------
# Question 2.1
M <- 10000 # number of Monte Carlo draws
x <- runif(M, min=2, max=4) # uniform draws over the integration range (2,4)
theta.hat <- mean( exp(-x) ) * (4-2) # MC estimate of the integral of exp(-x) over (2,4)
print(theta.hat)
print(exp(-2) - exp(-4)) # exact value of the integral, for comparison
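# Optional extra (a sketch, not part of the original question): the Monte Carlo
# standard error of theta.hat can be estimated from the simulated values,
# giving a rough 95% confidence interval for the integral.
g <- exp(-x) * (4-2)          # individual MC contributions; theta.hat = mean(g)
se.hat <- sd(g) / sqrt(M)     # estimated standard error of the MC estimator
print( theta.hat + c(-1.96, 1.96) * se.hat )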
# ------------------------------------------------------------
# Question 2.2
M <- 10000
G <- 10
x <- seq(0.1, 2.5, length=G)
cdf <- numeric(G)
vmc <- numeric(G) # variance of MC estimator
ciU <- numeric(G) # Upper bound of CI for MC estimator
ciL <- numeric(G) # Lower bound of CI for MC estimator
for(i in 1:G){
u <- runif(M, min=0, max=x[i])
g <- exp(-u^2/2)
# now compute the adjusted sample average: gbar * (x-0)
# (remember to add the negative part of the integral!)
cdf[i] <- x[i] * mean(g) / sqrt(2*pi) + 0.5
# compute the associated variance of the MC estimator cdf[i]
# (the scaling factor x[i]/sqrt(2*pi) applied to mean(g) must be squared)
vmc[i] <- (x[i]^2/(2*pi)) * mean( (g-mean(g))^2 ) / M
# compute the 95% CI
ciL[i] <- cdf[i] - 1.96*sqrt(vmc[i])
ciU[i] <- cdf[i] + 1.96*sqrt(vmc[i])
}
# Check against theoretical values:
Phi <- pnorm(x)
print( round(rbind(x, cdf, Phi), 3) )
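# Optional visual check (a sketch): plot the MC estimates and their CIs
# against the exact Normal CDF.
plot(x, Phi, type='l', ylab="Phi(x)", main="MC estimates of the N(0,1) CDF")
points(x, cdf, pch=19, col='red')
segments(x, ciL, x, ciU, col='red')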
# Display evaluated CI:
print( cbind(round(x,2), round(cbind(ciL, cdf, ciU),4)) )
# Note that something should be done to truncate the CI within (0,1)!
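# One simple option (a sketch of the truncation suggested above): clip the CI
# bounds to [0,1], since Phi(x) is a probability.
ciL.t <- pmax(ciL, 0)
ciU.t <- pmin(ciU, 1)
print( cbind(round(x,2), round(cbind(ciL.t, cdf, ciU.t),4)) )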
# ------------------------------------------------------------
# Question 2.3
N = 100 # sample size
k = 5 # number of observations trimmed from each tail
M = 1000 # number of MC loops
# initialise the storing vectors
xbar = tmean = numeric(M)
# MC loop:
for(i in 1:M){
x = rchisq(N, df=15) # chi-square sample: mean = df = 15 (cf. checks below)
xbar[i] = mean(x)
xs = sort(x)
tmean[i] = mean( xs[(k+1):(N-k)] )
}
rr = range(c(xbar,tmean))
par(mfrow=c(2,1))
hist(xbar, xlim=rr)
hist(tmean, xlim=rr)
# plot(xbar,tmean)
# Biases:
# mean(xbar-15)
# mean(tmean-15)
# Spread (standard deviations):
sd(xbar)
sd(tmean)
# MSEs:
# mean((xbar-15)^2)
# mean((tmean-15)^2)
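# A compact summary of the comparison (a sketch, assuming the true mean of the
# simulated distribution is 15, as in the commented checks above):
true.mean = 15
summ = rbind(bias = c(mean(xbar), mean(tmean)) - true.mean,
             var  = c(var(xbar), var(tmean)),
             mse  = c(mean((xbar-true.mean)^2), mean((tmean-true.mean)^2)))
colnames(summ) = c("sample mean", "trimmed mean")
round(summ, 4)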
# ------------------------------------------------------------
# Question 2.4
# Here the objective is to set up M repetitions of an experiment (Monte Carlo
# repetitions), where for each experiment:
# - we generate random values,
# - do some statistical stuff with them(*),
# - store the results.
# Finally, we analyse these results once the M experiments are finished.
# We'll eventually generate some plots and store some information...
# (*) Here the 'statistical stuff' could be comparing two robust estimators
# for linear regression with heavy-tailed noise (e.g. log-Normal, Laplace)...
# We could e.g. decide to use least squares (LS) and robust M-estimation (RM).
#-------------------------------------------------------------------------
# 0) Load our own work! We're also going to use mylm (script #3)
getwd()
setwd("/Volumes/macdata/teaching/ST3060/practicals") # adjust this path to your own machine
source("ST3060_toolbox.R") # import all functions defined in the file
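# If ST3060_toolbox.R is not to hand (comment out the source() call above), a
# minimal stand-in for mylm() consistent with how it is called below (it must
# return $myintercept and $mycoef) could be the ordinary least squares fit
# sketched here. This is only an assumption about the toolbox function, not
# its actual code.
if(!exists("mylm")){
  mylm <- function(x, y){
    b <- cov(x, y) / var(x)     # OLS slope
    a <- mean(y) - b * mean(x)  # OLS intercept
    return( list(myintercept = a, mycoef = b) )
  }
}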
# 1) Initialisation of vectors and parameters
N = 50 # Sample size
M = 100 # Number of Monte-Carlo repetitions
a = 7 # Intercept parameter (to be estimated)
b = 3 # Slope parameter (to be estimated)
x = rep(c(1:5),N/5) # Vector of regressors (design)
m = 0.5 # Location for noise distribution
s = 1.2 # Scale for noise distribution
rseed = 0 # Random seed
LSvec = RMvec = matrix(0,2,M) # Storage
MYvec = LSvec # More storage
ev = yv = matrix(0,N,M) # Even more storage
set.seed(rseed) # Sets random seed for reproducibility
library(MASS) # We need rlm() from this library
library(VGAM) # We need rlaplace() from this library
# 2) Monte-Carlo experiments
for (mc in 1:M){
# generate data
#e = rlaplace(N,location=m,scale=s)
e = rlnorm(N,meanlog=m,sdlog=s)
y = a + b*x + e
# estimate (a,b) via Least Squares estimation
LS = lm(y~x)$coef
# estimate (a,b) via Least Squares estimation, using our own mylm()
myfit = mylm(x,y) # call mylm() once and reuse the fitted object
MY = c(myfit$myintercept, myfit$mycoef)
# estimate (a,b) via Robust M-estimation
RM = rlm(y~x,method="MM")$coef
# store generated data
yv[,mc] = y
ev[,mc] = e
# store estimates for this Monte-Carlo experiment
LSvec[,mc] = rbind(LS[1],LS[2])
MYvec[,mc] = rbind(MY[1],MY[2])
RMvec[,mc] = rbind(RM[1],RM[2])
}
# 3) Analyse the two sets of estimates
# distributions
par(mfrow=c(2,3))
#
hist(LSvec[1,],main="Estimates of a by LS");
hist(RMvec[1,],main="Estimates of a by RM")
plot(density(LSvec[1,]), col='red', main="Densities of estimates of a")
lines(density(RMvec[1,]), col='blue')
#
hist(LSvec[2,],main="Estimates of b by LS");
hist(RMvec[2,],main="Estimates of b by RM")
plot(density(LSvec[2,]), col='red', main="Densities of estimates of b")
lines(density(RMvec[2,]), col='blue')
# bias: intercept
mean(LSvec[1,])-a; mean(MYvec[1,])-a; mean(RMvec[1,])-a
# bias: slope
mean(LSvec[2,])-b; mean(MYvec[2,])-b; mean(RMvec[2,])-b
# variance: intercept
var(LSvec[1,]); var(MYvec[1,]); var(RMvec[1,])
# variance: slope
var(LSvec[2,]); var(MYvec[2,]); var(RMvec[2,])
# MSE: intercept (MSE = Bias(estimates)^2 + Var(estimates))
mean((LSvec[1,]-a)^2); mean((MYvec[1,]-a)^2); mean((RMvec[1,]-a)^2)
# MSE: slope
mean((LSvec[2,]-b)^2); mean((MYvec[2,]-b)^2); mean((RMvec[2,]-b)^2)
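# Sanity check (a sketch): for, e.g., the LS intercept, the MSE should match
# bias^2 + variance, with the variance taken with a 1/M (not 1/(M-1)) factor.
(mean(LSvec[1,]) - a)^2 + var(LSvec[1,]) * (M-1)/M
mean((LSvec[1,] - a)^2)   # same value, up to floating-point rounding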
# 4) Write outputs to file
allres = cbind(x,t(MYvec),t(LSvec),t(RMvec))
results = data.frame(allres)
names(results) = c("x","a_MY","b_MY","a_LS","b_LS","a_RM","b_RM")
head(results)
write.csv(results,"montecarlo.csv",row.names=FALSE) # avoid writing a spurious row-name column
# 5) Test this file
mydata = read.csv("montecarlo.csv")
head(mydata)
# We should get the same stats as before again!
# Check bias of intercept estimates
mean(mydata$a_LS)-a; mean(mydata$a_MY)-a; mean(mydata$a_RM)-a
mean(LSvec[1,])-a; mean(MYvec[1,])-a; mean(RMvec[1,])-a
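# Quick consistency check (a sketch): the file-based and in-memory estimates
# should agree up to the precision used when writing the CSV file.
all.equal(mean(mydata$a_LS), mean(LSvec[1,]))
all.equal(mean(mydata$b_RM), mean(RMvec[2,]))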
# ------------------------------------------------------------
# Question 2.5
# NOTE: we still need to adapt this code to run the analysis
# for varying sample sizes 🙂
N = 50 # sample size
M = 1000 # number of MC loops
# initialise the storing vectors
sd1 = sd2 = numeric(M)
# MC loop:
for(i in 1:M){
x = rnorm(N, mean=10, sd=2)
sd1[i] = sd(x) # usual estimator (denominator N-1)
sd2[i] = sqrt(mean((x-mean(x))^2)) # maximum-likelihood estimator (denominator N)
}
# Evaluate biases:
# (MC can be used to approximate the bias numerically)
true = 2 # true value of sd
(mean(sd1)-true)
(mean(sd2)-true)
# %-errors give a better idea:
(mean(sd1)-true)/true
(mean(sd2)-true)/true
# Evaluate precision (variability of each estimator):
sd(sd1)
sd(sd2)
# they seem comparable; the real difference is in terms of bias
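# A sketch of the adaptation suggested in the NOTE above: repeat the same
# study over a few (illustrative) sample sizes and track the bias of each
# estimator of the standard deviation.
Ns = c(10, 20, 50, 100, 200)
bias1 = bias2 = numeric(length(Ns))
for(j in seq_along(Ns)){
  s1 = s2 = numeric(M)
  for(i in 1:M){
    x = rnorm(Ns[j], mean=10, sd=2)
    s1[i] = sd(x)
    s2[i] = sqrt(mean((x-mean(x))^2))
  }
  bias1[j] = mean(s1) - true
  bias2[j] = mean(s2) - true
}
# both biases shrink towards 0 as N grows, with sd2 remaining the more biased
round(cbind(N=Ns, bias.sd=bias1, bias.ml=bias2), 4)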
# ------------------------------------------------------------
# Question 2.6
rm(list=ls())
M = 100
N = 100
dfs = c(2,4,10)
means = matrix(NA, nrow=M, ncol=length(dfs))
for(j in 1:length(dfs)){
df = dfs[j]
# MC loop:
for(i in 1:M){
x = rchisq(N, df=df)
means[i,j] = mean(x)
}
}
boxplot(means, names=dfs, xlab="df") # sampling distribution of the mean for each df
abline(h=dfs, lwd=4, col='pink') # true means (= df for a chi-square)
hist(means) # pooled histogram of all estimated means
apply(means,2,sd) # MC standard deviation of the mean, for each df
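# Comparison with theory (a sketch): a chi-square(df) variable has variance
# 2*df, so the sample mean of N observations has standard deviation sqrt(2*df/N).
rbind(MC = apply(means, 2, sd), theory = sqrt(2*dfs/N))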
# ------------------------------------------------------------
# Question 2.7
rhuber <- function(N,epsilon=0,dof=3){
# Generates N pseudo-random realizations of Huber's contamination model
# f(u) = epsilon g(u) + (1-epsilon) h(u)
# using the standard Normal distribution for g(.) and the t-distribution
# with dof degrees of freedom (3 by default) for h(.).
# Here epsilon (the weight of the Normal component) is expected to be within [0.5, 1].
if(epsilon<0 | epsilon>1){
stop("epsilon must be a value between 0 and 1.")
}
# uniform draws used to decide which component each realisation comes from:
draws = runif(N,0,1)
# initialise output vector of realisations:
if(epsilon<1){
x = numeric(N)
i.h = which(draws<(1-epsilon)) # indices drawn from the t component h(.)
x[i.h] = rt(length(i.h),dof)
x[setdiff(1:N,i.h)] = rnorm(N-length(i.h)) # remaining indices drawn from g(.)
} else {
x = rnorm(N)
}
return( x )
}
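# Quick sanity check of rhuber() (a sketch): for a large sample the empirical
# sd should be close to the theoretical mixture sd, since both components have
# mean 0 and Var(f) = epsilon*1 + (1-epsilon)*dof/(dof-2) for dof > 2.
xx = rhuber(10000, epsilon=0.8)
c(empirical = sd(xx), theory = sqrt(0.8*1 + 0.2*3))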
set.seed(1)
N = 100
f1 = rhuber(N,.95)
f2 = rhuber(N,.80)
f3 = rhuber(N,.60)
round(c(mean(f1), sd(f1)),3)
round(c(mean(f2), sd(f2)),3)
round(c(mean(f3), sd(f3)),3)
par(mfrow=c(3,1))
xlims = range(c(f1,f2,f3))
hist(f1, col='navy', xlim=xlims, breaks=20)
hist(f2, col='navy', xlim=xlims, breaks=20)
hist(f3, col='navy', xlim=xlims, breaks=20)
df = data.frame(cbind(f1,f2,f3))
names(df)=c("e095","e080","e060")
nrow(df); head(df)
write.csv(df, "output_of_Q1.csv", row.names=FALSE)
M=500
N=100
mean.f = mean.g = sd.f = sd.g = numeric(M)
for(mc in 1:M){
f1 = rhuber(N,.60)
g1 = rnorm(N)
mean.f[mc] = mean(f1)
mean.g[mc] = mean(g1)
sd.f[mc] = sd(f1)
sd.g[mc] = sd(g1)
}
round(c(mean(mean.f), mean(sd.f)),3) # average mean and sd under the contaminated model
round(c(mean(mean.g), mean(sd.g)),3) # average mean and sd under the pure Normal model
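# Theoretical reference for the last comparison (a sketch): with epsilon = 0.6
# the contaminated model mixes N(0,1) (weight 0.6) and t with 3 df (weight 0.4,
# variance 3), so sd(f) should be near sqrt(0.6*1 + 0.4*3) ~ 1.34, against 1
# for the pure Normal sample.
sqrt(0.6*1 + 0.4*3)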