rm(list=ls()); set.seed(20210316);
glucose <- scan("~/Bayes/Data/glucose.txt")
hist(glucose, freq=F, right=F)
lines(density(glucose))
# Symmetric? No. Bell-shaped? Sort of.
# Definitely not a normal distribution, but might
# be well-approximated by a mixture of two normals.

# Update rules for Gibbs sampler (full conditionals)

# Mixing weight p: Beta(a + n1, b + n2)
update.p <- function(a, b, n1, n2) {
  rbeta(1, a+n1, b+n2)
}

# Component mean: normal full conditional with precision-weighted mean
update.theta <- function(mu0, tau0, n, ybar, sigma) {
  V <- 1 / (1/tau0^2 + n/sigma^2)
  m <- V * (mu0/tau0^2 + n*ybar/sigma^2)
  rnorm(1, mean=m, sd=sqrt(V))
}

# Component SD: scaled inverse-chi-square draw for sigma^2, then take the sqrt
update.sigma <- function(nu0, n, sigma0, y, theta) {
  df <- nu0 + n
  df.scale <- nu0*sigma0^2 + sum( (y-theta)^2 )
  sqrt( df.scale / rchisq(1, df=df) )
}

# Component labels: draw each z_i in {1, 2} given the current parameters
update.z <- function(p, y, theta1, theta2, sigma1, sigma2) {
  log.p1 <- log(p) + dnorm(y, mean=theta1, sd=sigma1, log=T)
  log.p2 <- log(1-p) + dnorm(y, mean=theta2, sd=sigma2, log=T)
  p <- exp(log.p1) / (exp(log.p1) + exp(log.p2))
  2 - rbinom(n=length(y), size=1, prob=p)
}

# Starting values for Gibbs sampler
p <- .75
theta1 <- 100; sigma1 <- 20;
theta2 <- 175; sigma2 <- 15;
z <- update.z(p, y=glucose, theta1, theta2, sigma1, sigma2)
p <- mean(z==1)
theta1 <- mean(glucose[z==1]); sigma1 <- sd(glucose[z==1]);
theta2 <- mean(glucose[z==2]); sigma2 <- sd(glucose[z==2]);
hist(glucose, freq=F, right=F)  # not shown
curve(p*dnorm(x,theta1,sigma1)+(1-p)*dnorm(x,theta2,sigma2), add=T)
# Mixture of normals may be reasonable here.

# A thing you can do in problems where starting values are tough
# to come by is:
# (1) Bite the bullet and propose *something* (see above); then
# (2) Run a preliminary chain, and take the ending state from that
#     run as the starting values for the real-deal chain.
# This is sometimes called "burn-in".
a <- 1; b <- 1;
mu0 <- 120; tau0 <- sqrt(200);
nu0 <- 10; sigma0 <- sqrt(1000);
for(s in 1:1000) {
  y1 <- glucose[z==1]; y2 <- glucose[z==2];
  n1 <- length(y1); n2 <- length(y2);
  p <- update.p(a, b, n1, n2)
  theta1 <- update.theta(mu0, tau0, n1, mean(y1), sigma1)
  theta2 <- update.theta(mu0, tau0, n2, mean(y2), sigma2)
  sigma1 <- update.sigma(nu0, n1, sigma0, y1, theta1)
  sigma2 <- update.sigma(nu0, n2, sigma0, y2, theta2)
  z <- update.z(p, y=glucose, theta1, theta2, sigma1, sigma2)
}
hist(glucose, freq=F, right=F)
curve(p*dnorm(x,theta1,sigma1)+(1-p)*dnorm(x,theta2,sigma2), add=T)
# Hmm. Anyway, let's run the real-deal Gibbs sampler.
S <- 10000;
Gibbs.chain <- matrix(rep(NA, S*5), S, 5);
for(s in 1:S) {
  y1 <- glucose[z==1]; y2 <- glucose[z==2];
  n1 <- length(y1); n2 <- length(y2);
  p <- update.p(a, b, n1, n2)
  theta1 <- update.theta(mu0, tau0, n1, mean(y1), sigma1)
  sigma1 <- update.sigma(nu0, n1, sigma0, y1, theta1)
  theta2 <- update.theta(mu0, tau0, n2, mean(y2), sigma2)
  sigma2 <- update.sigma(nu0, n2, sigma0, y2, theta2)
  z <- update.z(p, y=glucose, theta1, theta2, sigma1, sigma2)
  Gibbs.chain[s,] <- c(p, theta1, sigma1, theta2, sigma2)
}
Gibbs.chain <- data.frame(Gibbs.chain); rm(s);
names(Gibbs.chain) <- c("p","theta1","sigma1","theta2","sigma2")
rm(p); rm(theta1); rm(sigma1); rm(theta2); rm(sigma2); rm(z);
rm(y1); rm(n1); rm(y2); rm(n2);
attach(Gibbs.chain)
mean(theta1 < theta2)
# Good news, we don't have to mess with any label-switching.
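# Aside (a sketch, not part of the original analysis): if label switching *had*
# occurred, a common fix is to relabel each draw so that the component with the
# smaller mean is always component 1. Here the swap set is empty, so this is a
# no-op; it works on a copy to leave Gibbs.chain and the attached columns alone.
relabeled <- Gibbs.chain
swap <- relabeled$theta1 > relabeled$theta2
relabeled[swap, c("theta1","sigma1","theta2","sigma2")] <-
  relabeled[swap, c("theta2","sigma2","theta1","sigma1")]
relabeled$p[swap] <- 1 - relabeled$p[swap]
rm(relabeled); rm(swap);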
# Sample autocorrelation plots
par(mfrow=c(2,2))
acf(theta1); acf(theta2); acf(sigma1); acf(sigma2);

# Trace plots of every 20th update
s <- seq(20, 10000, 20);
plot(s, theta1[s], type="l");
plot(s, theta2[s], type="l");
plot(s, sigma1[s], type="l");
plot(s, sigma2[s], type="l");

apply(Gibbs.chain, 2, mean)
# We estimate that the distribution of plasma glucose concentration
# is a 62-38 mix of Normal(104, 18^2) and Normal(149, 27^2)

# Effective sample sizes
# Use method of batch means to estimate sampling variance of sample
# mean, and back into effective sample size by setting that equal
# to posterior variance over S_eff
batch.means.mse <- function(chain, batch.size, n.batches) {
  batch.means <- rep(NA, n.batches)
  for(k in 1:n.batches) {
    batch.means[k] <- mean(chain[((k-1)*batch.size+1):(k*batch.size)])
  }
  var(batch.means) / n.batches
}
# Effective sample sizes
var(p) / batch.means.mse(p, batch.size=200, n.batches=50)
var(theta1) / batch.means.mse(theta1, batch.size=200, n.batches=50)
var(sigma1) / batch.means.mse(sigma1, batch.size=200, n.batches=50)
var(theta2) / batch.means.mse(theta2, batch.size=200, n.batches=50)
var(sigma2) / batch.means.mse(sigma2, batch.size=200, n.batches=50)

# Part 4: Simulate posterior predictive distribution
# (Just a single observation -- not a 'replicated data set' of n cases)
y.tilde <- rep(NA, S)
for(s in 1:S) {
  U <- runif(1)
  if(U < p[s]){
    y.tilde[s] <- rnorm(1, mean=theta1[s], sd=sigma1[s])
  } else {
    y.tilde[s] <- rnorm(1, mean=theta2[s], sd=sigma2[s])
  }
}

# Compare empirical distribution to posterior predictive
par(mfrow=c(1,2))
minny <- min(c(glucose, y.tilde)); maxie <- max(c(glucose, y.tilde));
hist(glucose, freq=F, right=F, breaks=20, main="Empirical distribution",
     xlim=c(minny, maxie), ylim=c(0, .016), xlab="Glucose (n=532)")
lines(density(glucose))
hist(y.tilde, freq=F, right=F, breaks=20, main="Posterior predictive dist",
     xlim=c(minny, maxie), ylim=c(0, .016), xlab="Location-scale mixture (H=2)")
lines(density(y.tilde))
# Of course the posterior predictive has a smoother appearance,
# since it's based on 10000 draws versus the data sample size
# of only 532
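# Another view (a sketch, not part of the original analysis): overlay the two
# density estimates on one set of axes for a more direct comparison. The plot
# settings here are just illustrative choices.
plot(density(glucose), lwd=2, xlim=c(minny, maxie), ylim=c(0, .016),
     main="Empirical vs. posterior predictive", xlab="Glucose")
lines(density(y.tilde), lty=2)
legend("topright", legend=c("Empirical", "Posterior predictive"),
       lwd=c(2, 1), lty=c(1, 2))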
foo <- mean(y.tilde > 200); foo;
# The mixture model gives more than a 1% chance of an observed
# value greater than 200, whereas the data don't contain a
# single observation greater than 200.
(1 - foo)^532  # approx. probability that none of n=532 draws exceeds 200
# With n=532, seeing no values above 200 should essentially never happen
# under the model -- yet that's exactly what the data show. So there's a
# feature of the data the model's not capturing.
# (The model is giving a longer right tail than there should be
# for these data.)
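# A fuller check (a sketch, not part of the original analysis): instead of the
# (1 - foo)^532 shortcut, simulate whole replicated data sets of size n = 532,
# one per posterior draw, and record how often a replicate contains no value
# above 200. This keeps the parameters fixed within each replicate, which the
# shortcut ignores. Variable names below (y.rep, none.above.200, ...) are new.
n <- length(glucose)
none.above.200 <- rep(NA, S)
for(s in 1:S) {
  z.rep <- rbinom(n, size=1, prob=p[s])   # 1 = component 1, 0 = component 2
  y.rep <- ifelse(z.rep == 1,
                  rnorm(n, mean=theta1[s], sd=sigma1[s]),
                  rnorm(n, mean=theta2[s], sd=sigma2[s]))
  none.above.200[s] <- all(y.rep <= 200)
}
mean(none.above.200)  # posterior predictive probability of a data set with no value > 200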
# Other than that the two-component normal mixture model seems to
# do a pretty good job of describing the data.
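# Aside (a sketch, not part of the original analysis): if the coda package is
# installed, its effectiveSize() function gives an independent cross-check on
# the batch-means effective sample sizes computed above.
# install.packages("coda")  # uncomment if coda is not installed
library(coda)
effectiveSize(as.mcmc(as.matrix(Gibbs.chain)))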