# Consider the data and model from Homework 1 Problem 2

# We have n=6 observations from BVN(0, 0, 1, 1, corr=theta) dist

# Assume a Uniform(0,1) prior on theta (we know only that theta > 0)
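# For reference, a quick derivation sketch: for iid pairs (x_i, y_i)
# from this BVN, the likelihood is proportional to
#
#   (1 - theta^2)^(-n/2) *
#     exp( -(Sxx - 2*theta*Sxy + Syy) / (2*(1 - theta^2)) )
#
# with Sxx = sum(x^2), Syy = sum(y^2), Sxy = sum(x*y). The flat prior
# on (0,1) leaves the posterior q(theta) proportional to this same
# expression; its negative log is coded below as neg.log.q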

# Key in data!

rm(list=ls()); set.seed(20210225);

x <- c(-3.3, +0.1, -1.1, +2.7, +2.0, -0.4)
y <- c(-2.6, -0.2, -1.5, +1.5, +1.9, -0.3)
n <- length(y)

Sxx <- sum(x^2); Syy <- sum(y^2); Sxy <- sum(x*y);

# Part (a): Find theta.hat.MAP using the optim R function

# On Homework 1 we found the posterior mode by grid search. That
# method works fine for this simple, one-dimensional problem with
# a bounded parameter space.

# But for the sake of illustrating the use of optim, ...

neg.log.q <- function(theta, Sxx, Syy, Sxy, n) {
  n/2 * log(1 - theta^2) +
    1/(2 * (1 - theta^2)) * ( Sxx - 2 * theta * Sxy + Syy )
}

# Recall that optim does *minimization*
# In statistics we are usually most interested in maximizing a
# function, in this case a posterior density
# Maximizing q(theta) is equivalent to minimizing -log[q(theta)]

foo <- optim(0.5, fn=neg.log.q, method="L-BFGS-B",
             lower=0.000001, upper=0.999999,
             Sxx=Sxx, Syy=Syy, Sxy=Sxy, n=n)
theta.hat.MAP <- foo$par; theta.hat.MAP; rm(foo);

# We found theta.hat.MAP = 0.85 by grid search, so this result
# seems consistent with that

# Part (b): Back to grid approximations
# Use 100 equally spaced values

theta.vals <- seq(.005, .995, .01); length(theta.vals);
log.q <- -1 * neg.log.q(theta.vals, Sxx, Syy, Sxy, n)
maxie <- max(log.q); log.q <- log.q - maxie; rm(maxie);  # rescale so max(q) = 1
q <- exp(log.q); rm(log.q);
plot(q ~ theta.vals, type="l")  # unnormalized posterior (rescaled)

# Now we can approximate E(theta|x,y) and E(theta^2|x,y)
theta <- theta.vals; # rm(theta.vals);
sum(theta * q) / sum(q); sum(theta^2 * q) / sum(q);
rm(theta)

# Part (c): Monte Carlo sampling from the grid

theta.sim <- sample(theta.vals, 1000, replace=T, prob=q)
theta.sim <- theta.sim + runif(1000, -.005, +.005)  # 'jitter'
hist(theta.sim, freq=F, right=F, breaks=30, xlim=c(0,1));
lines(density(theta.sim), col="red")

# Here we should get about 0.77541 and 0.61356
mean(theta.sim); mean(theta.sim^2);
# About the same as using quadrature, good

# Part (d): Rejection sampling algorithm

S <- 1000  # generate 1000 independent samples

# We will use Uniform(0,1) proposals, so g(theta) := 1
neg.log.M <- neg.log.q(theta.hat.MAP, Sxx, Syy, Sxy, n)
theta.RS <- rep(NA, S)
for(s in 1:S) {
  accept <- FALSE
  while(!accept) {
    theta.star <- runif(1)
    # accept theta.star with probability q(theta.star) / M
    log.prob <- neg.log.M - neg.log.q(theta.star, Sxx, Syy, Sxy, n)
    accept <- ( log(runif(1)) < log.prob )
  }
  theta.RS[s] <- theta.star; rm(theta.star); rm(log.prob);
}
hist(theta.RS, freq=F, right=F, breaks=30, xlim=c(0,1))
lines(density(theta.RS), col="red")

# Here we should get about 0.77541 and 0.61356
mean(theta.RS); mean(theta.RS^2);
# Pretty close to the grid approximation results, good

# Part (e): Importance sampling

# Here we simulate S random draws from the so-called importance
# dist, in this case a Uniform(0,1).
# We then re-weight them, using weights that are proportional to
# q(theta.s) / g(theta.s) = (since g is uniform) = q(theta.s)

theta.IS <- runif(S)  # S random draws from the importance dist
w <- exp(-neg.log.q(theta.IS, Sxx, Syy, Sxy, n))

# IS approximations to E(theta|x,y) and E(theta^2|x,y)
# Here we should get about 0.77541 and 0.61356
sum(w * theta.IS) / sum(w); sum(w * theta.IS^2) / sum(w);
# Pretty close to the grid approximation results, good

# Part (f): Find the effective sample size for importance sampling

w.tilde <- w / sum(w); 1 / sum(w.tilde^2);

# Based on this effective sample size, we conclude that 1000 draws
# from the importance distribution give comparable precision to
# 331 or so draws from the target distribution.
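# Two quick follow-up checks on the effective sample size (sketches
# using quantities already computed above):
# (i) by algebra, the Kish effective sample size can also be computed
#     directly from the unnormalized weights:
(sum(w))^2 / sum(w^2)  # same value as 1 / sum(w.tilde^2)
# (ii) the usual interpretation: from the moment estimates above, the
#      posterior sd is about sqrt(0.61356 - 0.77541^2) =~ 0.11, so the
#      Monte Carlo standard error of the IS estimate of E(theta|x,y)
#      is roughly 0.11 / sqrt(331) =~ 0.006, not 0.11 / sqrt(1000)
sqrt(0.61356 - 0.77541^2) / sqrt((sum(w))^2 / sum(w^2))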
# It is interesting to compare the methods of importance sampling
# and rejection sampling in the context of this simple example.

# Suppose we took those 1000 draws from the importance distribution,
# and only retained those which would have been accepted draws in an
# implementation of rejection sampling.
# How many of those 1000 proposals would have been accepted?

# We use the same random draws, theta.IS, that gave an effective
# sample size of 331 for importance sampling.

plot(q ~ theta.vals, type="l", col="blue"); segments(0,1,1,1, lty=2);
U <- runif(S); points(theta.IS, U, cex=.75);
accept <- ( log(U) < neg.log.M - neg.log.q(theta.IS, Sxx, Syy, Sxy, n) )
points(theta.IS[accept], U[accept], cex=.75, pch=19, col="red")
sum(accept)

# Using rejection sampling with a fixed number of proposals (not
# the usual definition of rejection sampling, but used here for
# illustration), we find that 217 of the proposals would have been
# accepted, so RS is less efficient than IS here

# Again, here we should get about 0.77541 and 0.61356
mean(theta.IS[accept]); mean(theta.IS[accept]^2);

# The picture helps us understand the difference between RS and IS:
# RS throws out the points corresponding to open circles, and only
# keeps the points represented by solid red dots.
# IS keeps all 1000 points, but re-weights them.

# What is the relative weight given to a particular theta value?
# Precisely the height of the blue curve at that theta value!
# (More generally, the height of the blue curve (target density)
# divided by that of the dashed black line (importance density).)
# In this case the importance density is that of Uniform(0,1).
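# One more sanity check (a sketch using the grid values from Part (b)):
# since the grid version of q was rescaled to have maximum about 1, the
# height of the dashed envelope M*g(theta), the theoretical acceptance
# rate of this rejection sampler is approximately the average height of
# q over (0,1), which the 100-point grid approximates as
mean(q)  # should be close to the observed rate, sum(accept) / S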