rm(list=ls()); set.seed(20210325);
# The following R function takes as inputs the vector of response
# values y, the design matrix X, and the simulation size Sim.size, for
# sampling from the posterior distribution p(beta, sigma | y) in
# Bayesian linear regression
# It returns a list with two components: a matrix of dimension
# Sim.size by k containing the simulated beta values, and a vector
# of length Sim.size containing the simulated sigma values
# Essentially it implements the calculation described in the
# middle of page 356 of Gelman et al. (Bayesian Data Analysis):
# (1) compute beta.hat and V.beta, and (2) compute s^2, both using
# a standard regression program; (3) draw sigma^2 from its scaled
# inverse-chi-square posterior; and (4) draw beta from its
# multivariate normal (conditional) posterior
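# In formulas, under the noninformative prior p(beta, sigma^2)
# proportional to 1/sigma^2 used there:
#   beta.hat = (X'X)^{-1} X'y,   V.beta = (X'X)^{-1},
#   s^2 = (y - X beta.hat)'(y - X beta.hat) / (n - k),
#   sigma^2 | y ~ Scaled-Inv-chi^2(n - k, s^2),
#   beta | sigma^2, y ~ N(beta.hat, sigma^2 * V.beta)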
library(MASS)  # provides mvrnorm() for multivariate normal draws
Bayes.regression <- function(y, X, Sim.size) {
  n <- dim(X)[1]; k <- dim(X)[2];
  # Steps (1)-(2): fit by least squares to get beta.hat, V.beta, and s
  m <- lm(y ~ 0 + X)
  beta.hat <- coef(m)
  s <- sigma(m)
  V.beta <- vcov(m) / s^2
  # Step (3): sigma^2 ~ scaled inverse-chi-square(n-k, s^2)
  sigma.sim <- s * sqrt((n-k) / rchisq(Sim.size, df=n-k))
  # Step (4): beta | sigma ~ N(beta.hat, sigma^2 * V.beta), one draw per row
  beta.sim <- rep(beta.hat, each=Sim.size) +
    sigma.sim * mvrnorm(Sim.size, mu=rep(0,k), Sigma=V.beta)
  answer <- list(beta.sim=beta.sim, sigma.sim=sigma.sim)
  return(answer)
}

# Regression example, taken from "A First Course in Bayesian Statistical
# Methods" by Peter D. Hoff

# Arizona diabetes data

# Read in data
file <- "http://www2.stat.duke.edu/~pdh10/FCBS/Exercises/azdiabetes.dat"
Data <- read.table(file=file, header=T); rm(file);
dim(Data); names(Data);

# Data pre-processing: standardize the six predictors, then prepend a
# column of ones for the intercept
Sim.size <- 5000;
y <- Data$glu;
X <- as.matrix(Data[,c(1,3,4,5,6,7)]); rm(Data);
names <- colnames(X);
m <- apply(X, 2, mean); s <- apply(X, 2, sd);
X <- sweep(X, 2, m, "-"); rm(m);
X <- sweep(X, 2, s, "/"); rm(s);
round(apply(X, 2, mean)); apply(X, 2, sd);  # check: means 0, SDs 1
X <- cbind(1, X);
colnames(X) <- c("ones", names); rm(names);

# Posterior sampling
foo <- Bayes.regression(y=y, X=X, Sim.size=Sim.size); names(foo);
beta.sim <- foo$beta.sim; sigma.sim <- foo$sigma.sim; rm(foo);

# Posterior quantiles for the seven coefficients and sigma
probs <- c(.025, .25, .5, .75, .975)
Results <- apply(beta.sim, 2, quantile, probs=probs)
Results <- t(Results);
Results <- rbind(Results, quantile(sigma.sim, probs=probs))
rownames(Results) <- c(colnames(X), "sigma")
round(Results, 2)

# Histograms of the marginal posterior distributions
op <- par(mfrow=c(2,4))
for(j in 1:7) {
  hist(beta.sim[,j], freq=F, xlab="", ylab="", breaks=30,
       main=paste("Posterior dist of coef of", rownames(Results)[j]));
  lines(density(beta.sim[,j]), col="red")
}
hist(sigma.sim, freq=F, xlab="", ylab="", breaks=30,
     main="Posterior dist for error SD");
lines(density(sigma.sim), col="red")
par(op)

# Predictions?!?!?
# Okay, let's suppose we wish to predict the response (glucose) for an
# individual whose x-values are given by
x.tilde <- X[21,]; x.tilde;
# Easy enough: one normal draw per posterior draw of (beta, sigma)
y.tilde <- rnorm(Sim.size, mean=(beta.sim %*% x.tilde)[,1], sd=sigma.sim)
hist(y.tilde, freq=F, breaks=30, xlab="", ylab="",
     main="Posterior predictive dist for y.tilde")
lines(density(y.tilde), col="red")
quantile(y.tilde, probs=probs)
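
# Two quick follow-up checks (sketches, not part of the original analysis;
# they assume the objects above are still in the workspace).

# First: x.tilde was row 21 of X, so y[21] is that individual's observed
# glucose. Where does it fall within the predictive draws?
y[21]
mean(y.tilde <= y[21])  # proportion of predictive draws at or below the observed value

# Second: with this noninformative prior, the posterior medians of beta
# should sit close to the ordinary least-squares estimates. "m.check" is
# a new name introduced just for this comparison.
m.check <- lm(y ~ 0 + X)
round(cbind(OLS=coef(m.check), Post.median=apply(beta.sim, 2, median)), 2)
rm(m.check)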