
# Fitting finite mixture models in Stan

rm(list=ls())
set.seed(20210316)

glucose <- scan("~/Bayes/Data/glucose.txt")
hist(glucose, freq=F, right=F)
lines(density(glucose))
y <- glucose; rm(glucose)
n <- length(y); n

library(rstan)

stan_model <- "
data{
  int<lower=1> H;  // number of mixture components
  int<lower=1> n;  // number of observations
  real y[n];
}

parameters{
  simplex[H] pi;        // mixture weights
  ordered[H] mu;        // component means; ordering guards against label switching
  real<lower=0> sigma;  // common standard deviation (location-only mixture)
}

transformed parameters{
  real<lower=0> sigma2;
  sigma2 = sigma^2;
}

model{
  mu ~ normal(120, 100);
  sigma2 ~ inv_gamma(.01, .01);
  // Jacobian adjustment: the prior is placed on the transformed
  // parameter sigma2 = sigma^2, so add log|d(sigma2)/d(sigma)| = log(2*sigma)
  target += log(2 * sigma);
  // Marginalize out the discrete component labels:
  // log f(y_i) = log_sum_exp over h of { log(pi[h]) + normal_lpdf(y_i | mu[h], sigma) }
  for(i in 1:n){
    vector[H] lps = log(pi);
    for(h in 1:H)
      lps[h] += normal_lpdf(y[i] | mu[h], sigma);
    target += log_sum_exp(lps);
  }
}
"

H <- 3
fit <- stan(model_code=stan_model, data=c("H", "n", "y"),
            iter=10000, chains=2)
round(summary(fit)$summary[1:7, 4:8], 2)
sims <- extract(fit)
names(sims)

# This is what the homework problem asks you to do in part (b):
# Assess the fit of the mixture model with parameters set to
# their posterior means
yvals <- seq(25, 225, 1)
pi.hat <- apply(sims$pi, 2, mean)
mu.hat <- apply(sims$mu, 2, mean)
sigma.hat <- mean(sims$sigma)
g.hat <- rep(NA, length(yvals))
for(j in 1:length(yvals)) {
  # Plug-in mixture density at yvals[j]
  g.hat[j] <- sum(pi.hat * dnorm(yvals[j], mu.hat, sigma.hat))
}
hist(y, freq=F, breaks=30, xlim=c(25,225))
lines(yvals, g.hat, col="red", lwd=2)
lines(density(y), col="blue", lty=2, lwd=2)
legend("topright", inset=.05, lty=1:2, col=c("red", "blue"), lwd=2,
       legend=c("Mixture of 3 normals", "Kernel density estimate"))

# Part (c) of the homework problem calls for posterior predictive
# checks against the sample mean, variance, skewness and kurtosis
S <- length(sims$sigma)
n <- length(y)
y.rep <- matrix(NA, S, n)
for(s in 1:S) {
  # For each posterior draw, simulate a full replicate data set:
  # draw component labels, then draw observations from those components
  z <- sample(H, n, replace=T, prob=sims$pi[s,])
  y.rep[s,] <- rnorm(n, mean=sims$mu[s,z], sd=sims$sigma[s])
}

# My test statistics are mean, variance, skewness, kurtosis
library(moments)
T1 <- apply(y.rep, 1, mean)
T2 <- apply(y.rep, 1, var)
T3 <- apply(y.rep, 1, skewness)
T4 <- apply(y.rep, 1, kurtosis)

op <- par(mfrow=c(2,2))

# Posterior predictive p-value: Pr(T(y.rep) > T(y) | y)
p.val <- round(mean(T1 > mean(y)), 3)
hist(T1, freq=F, breaks=40, xlab="", ylab="",
     main=paste("PPD of mean, p-value =", p.val))
lines(density(T1), col="red")
abline(v=mean(y), lty=2, col="blue")

p.val <- round(mean(T2 > var(y)), 3)
hist(T2, freq=F, breaks=40, xlab="", ylab="",
     main=paste("PPD of variance, p-value =", p.val))
lines(density(T2), col="red")
abline(v=var(y), lty=2, col="blue")

p.val <- round(mean(T3 > skewness(y)), 3)
hist(T3, freq=F, breaks=40, xlab="", ylab="",
     main=paste("PPD of skewness, p-value =", p.val))
lines(density(T3), col="red")
abline(v=skewness(y), lty=2, col="blue")

p.val <- round(mean(T4 > kurtosis(y)), 3)
hist(T4, freq=F, breaks=40, xlab="", ylab="",
     main=paste("PPD of kurtosis, p-value =", p.val))
lines(density(T4), col="red")
abline(v=kurtosis(y), lty=2, col="blue")
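
par(op)  # restore the plotting layout saved as op above

# The same four checks can be drawn with the bayesplot package as
# well. A minimal sketch, assuming bayesplot is installed (it is an
# extra dependency, not used elsewhere in this script):
library(bayesplot)
ppc_stat(y, y.rep, stat = "mean")
ppc_stat(y, y.rep, stat = "skewness")  # "skewness" resolves to moments::skewness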

# Setting parameter values to their posterior means is not the
# best idea for assessing model fit. Better to compare the
# observed data to simulations from the posterior predictive
# distribution.
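
# To make that concrete, here is a minimal sketch (g.ppd is a name
# introduced here) that averages the mixture density over the
# posterior draws and compares it with the plug-in estimate g.hat
# from part (b):
g.ppd <- rep(0, length(yvals))
for(j in 1:length(yvals)) {
  # S x H matrix: each posterior draw's component densities at yvals[j]
  dens <- matrix(dnorm(yvals[j], sims$mu, sims$sigma), nrow=S)
  g.ppd[j] <- mean(rowSums(sims$pi * dens))
}
hist(y, freq=F, breaks=30, xlim=c(25,225))
lines(yvals, g.hat, col="red", lwd=2)
lines(yvals, g.ppd, col="darkgreen", lwd=2)
legend("topright", inset=.05, col=c("red", "darkgreen"), lwd=2,
       legend=c("Plug-in (posterior means)", "Posterior predictive"))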

# We can compare the distributions in side-by-side plots, like
# we did for the location-scale mixture of two normals

y.tilde <- rep(NA, S)
for(s in 1:S) {
  # One predictive draw per posterior draw: pick a component, then
  # draw an observation from it
  z <- sample(H, 1, prob=sims$pi[s,])
  y.tilde[s] <- rnorm(1, mean=sims$mu[s,z], sd=sims$sigma[s])
}

par(mfrow=c(1,2))
minny <- min(c(y, y.tilde))
maxie <- max(c(y, y.tilde))
hist(y, freq=F, right=F, breaks=20, main="Empirical distribution",
     xlim=c(minny, maxie), ylim=c(0, .016), xlab="Glucose (n=532)")
lines(density(y))
hist(y.tilde, freq=F, right=F, breaks=20, main="Posterior predictive dist",
     xlim=c(minny, maxie), ylim=c(0, .016), xlab="Location only mixture (H=3)")
lines(density(y.tilde))
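
# Another view of the same comparison, sketched as a possible
# extension: overlay kernel density estimates from a random handful
# of the posterior predictive replicates (rows of y.rep from part (c))
# on a density estimate of the observed data.
par(mfrow=c(1,1))  # back to a single panel
plot(density(y), lwd=2, ylim=c(0, .016), xlab="Glucose",
     main="Observed vs replicated densities")
for(s in sample(S, 20))
  lines(density(y.rep[s,]), col=rgb(1, 0, 0, alpha=.2))
legend("topright", inset=.05, lwd=c(2,1), col=c("black", "red"),
       legend=c("Observed", "Replicated"))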