### MH Examples: Logistic Regression
library(MASS)
library(mvtnorm) #multivariate normal density function and random variate generation
library(LearnBayes) #laplace function
library(arm) #logit, invlogit, display and bayesglm functions
#################################################################################
########## Logistic regression example #####################################
##################################################################################
##If not using an RStudio project, uncomment the line below and change it to
##the name of the folder you are using.
#setwd("~/Patrick/Stat314-2018")
##read in additional functions to help with logistic
##regression
##The code below assumes a folder structure with sub-folders "R" for code
##and "data" for data. If you do not have this set-up you will need to modify
##the lines below appropriately.
source("R/logisticregression_functions.r")
readdata <- read.csv("data/nzis11-cart-surf.csv",header=TRUE)
str(readdata)
##To check functions have been read in:
logpost_logit_norm #code should appear in the console
mylaplace #code should appear in the console
###set-up variables
attach(readdata)
highincome <- as.numeric(readdata$income > 1250)
str(highincome)
mean(highincome)
agefactor <- relevel(as.factor(readdata$agegrp),"40")
regfactor <- relevel(as.factor(readdata$lgr),"2") #Auckland as ref
sexfactor <- relevel(as.factor(readdata$sex),"1") #male as ref
qualfactor <- relevel(as.factor(readdata$qualification),"2") #school quals as ref
table(qualfactor)
occfactor <- relevel(as.factor(readdata$occupation),"1")
table(occfactor)
obshours_centred <- readdata$hours-mean(readdata$hours)
obshours_cen40 <- readdata$hours-40
hourscut <- cut(readdata$hours,breaks=c(0,10,30,40,50,168),
                include.lowest=TRUE,right=FALSE)
table(hourscut)
hoursfactor <- relevel(hourscut,"[0,10)")
table(hoursfactor)
### Fit a basic version of your model,
#assuming uniform priors for the beta parameters. You need to replace
#the ... after ~ with your chosen model for the covariates. Check
#the glm syntax if you need to (?glm). If your outcome variable is something
#other than highincome then that also needs to be replaced in the call to glm.
#
logitmodel <- glm(highincome ~ ...,
family=binomial(link="logit") )
summary(logitmodel)
display(logitmodel) #Less cluttered display, without frequentist output
# such as p-values
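##A minimal illustrative fill (the covariate choice here is hypothetical;
##substitute your own model):
#logitmodel <- glm(highincome ~ hoursfactor + sexfactor + qualfactor,
#                  family=binomial(link="logit"))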
##Optionally fit another model in glm, e.g. explore impact of
## controlling for additional covariates
##re-fit controlling for occupation
logitmodel2 <- glm(highincome ~ ... ,
family=binomial(link="logit") )
summary(logitmodel2)
display(logitmodel2)
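##Illustrative fill (hypothetical -- adds occfactor to the sketch above):
#logitmodel2 <- glm(highincome ~ hoursfactor + sexfactor + qualfactor + occfactor,
#                   family=binomial(link="logit"))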
###Note the code below assumes the name of the model you are using for
##posterior inference is "logitmodel". If you actually prefer logitmodel2
##you will either have to change the code below or set logitmodel
##to logitmodel2; e.g. logitmodel <- logitmodel2
###Specify priors:
###################################################################
##Analysis with an informative prior
##################################################################
##obtain priors for the parameters of your preferred logitmodel
##using the prior-making functions makepriorb0() and makepriorbreg() read in from
#logisticregression_functions.r
##Intercept - specify prior interval on the probability scale
##makepriorb0 converts this to a prior on the logit scale for b0
##You need to specify the lower and upper bounds of your prior credibility
##interval and the credibility you attach to the interval
b0prior <- makepriorb0(low=...,high=...,credibility=...)
b0prior
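##Illustrative fill only -- the numbers below are a made-up prior belief
##that the baseline probability of high income lies between 0.10 and 0.40,
##held with 90% credibility:
#b0prior <- makepriorb0(low=0.10,high=0.40,credibility=0.90)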
##logistic regression parameters
##specify the prior on the odds ratio scale; makepriorbreg converts this to a prior
#on the beta scale
bhours10_30 <- makepriorbreg(low=...,high=...,credibility=...)
bhours30_40 <- makepriorbreg(low=...,high=...,credibility=...)
bhours40_50 <- makepriorbreg(low=...,high=...,credibility=...)
bhours50pl <- makepriorbreg(low=...,high=...,credibility=...)
###add lines to set priors for additional parameters
###check a few of these
b0prior
bhours10_30
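##Illustrative fill only -- a made-up prior belief that working 50+ hours
##multiplies the odds of high income by between 1.5 and 6, with 90% credibility:
#bhours50pl <- makepriorbreg(low=1.5,high=6,credibility=0.90)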
##combine the prior means into a vector and construct a
#diagonal prior variance matrix
##based on the prior standard deviations
#First put everything in a matrix - column 1 = prior means,
#column 2 = prior sds
prior.matrix<- rbind(b0prior,bhours10_30,bhours30_40,bhours40_50,
bhours50pl,...
) #... will correspond to the other parameters
#in your model
prior.matrix
prior_mean_inf <- prior.matrix[,1] #prior mean vector
prior_variance_inf <- diag((prior.matrix[,2])^2) #prior variance matrix
##print prior mean and covariance
prior_mean_inf
prior_variance_inf
#############################################################
#The next step is to build an approximation to the posterior
#which we can use to generate starting values for the Markov chains.
#We can also base the variance of the jumping density on an
#approximation to the posterior variance.
#We will use a multivariate normal approximation centred on the
#posterior mode, with variance determined by the curvature of the
#log-posterior at the mode. The posterior mode and approximate variance
#can be obtained using either the laplace() function or
#bayesglm(), given appropriate specification of the prior in both cases.
##test out the log posterior function: logpost_logit_norm()
##See logisticregression_functions.r for definition of this function
##Syntax is:
##logpost_logit_norm(beta=,
##                   prior_mean=,
##                   prior_variance=,
##                   model= )
##where beta is the parameter value at which we want to evaluate the
##unnormalised log posterior; prior_mean is the prior mean for the model
##parameters (a vector of the same length as beta); prior_variance is
##the prior variance matrix for the parameters, a k by k matrix where k
##is the length of beta; and model is the logistic model, which should be the
##name of a glm object from the preliminary fit of the model.
bmle <- coef(logitmodel) #For convenience evaluate the unnormalised
#log posterior at the mle of the basic model
#called logitmodel
testpost <- logpost_logit_norm(beta=bmle,
prior_mean=prior_mean_inf,
prior_variance=prior_variance_inf,
model=logitmodel)
testpost
##use laplace to find the posterior mode and a variance approximation
##the first argument passed to laplace is the name of the log-posterior function,
#then an initial guess at the mode (suggest using the mle from the glm fit);
#then all the arguments required by the log posterior function need to be
#explicitly passed to laplace()
laplace_post <- laplace(logpost_logit_norm,mode=...,
prior_mean=...,
prior_variance=...,
model=logitmodel)
##in the above call to laplace() the parameter mode just specifies
##the starting values for the optimisation procedure; ideally an
##informed guess for the posterior mode would be supplied
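##Illustrative fill, starting the optimisation at the mle computed above:
#laplace_post <- laplace(logpost_logit_norm,mode=bmle,
#                        prior_mean=prior_mean_inf,
#                        prior_variance=prior_variance_inf,
#                        model=logitmodel)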
#Check convergence for the laplace procedure; i.e. convergence
#of the numerical optimisation procedure for finding the mode.
laplace_post$converge ##This should equal TRUE, otherwise
##there is a problem. For a complex model the
##solution may simply be to increase the number
##of iterations in laplace. This can be achieved
##using the mylaplace() function in place of
##laplace. This has the same syntax, except
##for an additional parameter, maxiter.
#The default maximum number of iterations
#in mylaplace is set to 1000, in contrast to laplace, in which
#the maximum number of iterations is set to 500.
##Assuming convergence is ok we can extract approximations to the
##posterior mean and variance.
mean_approx <- laplace_post$mode
var_approx <- laplace_post$var
sd_approx <- sqrt(diag(var_approx))
###display mode and approximation to variance
cbind(mean_approx,sd_approx)
##Obtain an over-dispersed approximation to the posterior variance
vstart <- ...
#Obtain the jumping variance (Hint: use Gelman et al.'s suggestion)
jumpcov <- ...
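##Illustrative fills: a simple way to over-disperse is to inflate the
##approximate posterior variance, e.g. by a factor of 2^2; for the jumping
##variance Gelman et al. suggest scaling by 2.4^2/d for d parameters:
#vstart <- 4*var_approx
#jumpcov <- (2.4^2/length(mean_approx))*var_approx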
###Set-up the M-H procedure
###decide on number of chains and length of the chains
nchains <- ... ##number of chains; must be greater than 1 but setting it
##too high will slow computations; between 3 and 5 is usually
##sufficient
nsim <- ... ##simulation sample size
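##Illustrative fill (modest values; increase nsim if Rhat stays well above 1):
#nchains <- 4
#nsim <- 10000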
###Specify array to store sampled values
storebeta <- array(NA,dim=c(length(mean_approx),nchains,nsim))
##For each parameter we have an nchains by nsim matrix
##To pull out the simulated points for a parameter k use storebeta[k,,];
#i.e. use code like bksims <- storebeta[k,,]
#nsim by nchains matrix of acceptance indicators
storeaccept <- matrix(NA,nrow=nsim,ncol=nchains)
###Start Metropolis-Hastings computations
system.time ( #not necessary to enclose the M-H within a system.time
#call but is of interest in the set-up phase
for (j in 1:nchains) {
##draw a starting value for this chain, e.g. use rmvnorm to draw from
#the over-dispersed multivariate normal approximation
oldbeta <- rmvnorm(n=1,mean=...,sigma=...)
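##Illustrative fill, drawing the start from the over-dispersed normal
##approximation constructed above:
#oldbeta <- rmvnorm(n=1,mean=mean_approx,sigma=vstart)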
for (i in 1:nsim) {
storebeta[,j,i] <- oldbeta ##assuming the current draw is called oldbeta;
#generate a proposal from the jumping distribution -
#make this symmetric by using a multivariate normal
#using rmvnorm, with variance set to jumpcov
newbeta <- rmvnorm(n=1,mean=...,sigma=...)
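##Illustrative fill: a symmetric proposal centred at the current draw
##(as.vector() flattens the 1 x k matrix returned by rmvnorm):
#newbeta <- rmvnorm(n=1,mean=as.vector(oldbeta),sigma=jumpcov)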
##we are using a symmetric proposal distribution, so the acceptance ratio is just
##the ratio of the posterior at newbeta to the posterior at oldbeta;
#on the log scale this is just the difference in the log posteriors
##evaluate unnormalized posterior at proposal
logpost_prop <- logpost_logit_norm(beta=...,
prior_mean=...,
prior_variance=...,
model=logitmodel)
##evaluate unnormalized posterior at current value
logpost_old <- logpost_logit_norm(beta=...,
prior_mean=...,
prior_variance=...,
model=logitmodel)
##compute the log MH acceptance ratio (log rMH)
logrMH <- ...
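##Illustrative fill (as.vector() is a precaution in case the sourced
##log-posterior expects a plain vector rather than a 1 x k matrix):
#logpost_prop <- logpost_logit_norm(beta=as.vector(newbeta),
#                                   prior_mean=prior_mean_inf,
#                                   prior_variance=prior_variance_inf,
#                                   model=logitmodel)
#logpost_old <- logpost_logit_norm(beta=as.vector(oldbeta),
#                                  prior_mean=prior_mean_inf,
#                                  prior_variance=prior_variance_inf,
#                                  model=logitmodel)
#logrMH <- logpost_prop - logpost_old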
#decide whether to accept the proposal: create an acceptance indicator
#called accept
logU <- log(runif(1))
accept <- (logU < logrMH)
storeaccept[i,j] <- as.numeric(accept)
###If jump accepted update oldbeta to newbeta
if (accept) {
oldbeta <- newbeta
}
##else just stay at the current value of oldbeta
} ###end loop over simulations
} ###end loop over chains
) #end(system.time)
#Rhat statistics
##Simply loop over the parameters and carry out the computations
# as per Gelman et al
#decide on burn-in period
burnin <- ...
npos <- nsim - burnin ##size of retained posterior sample in each chain
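##Illustrative fill: discarding the first half of each chain is the usual
##default in Gelman et al:
#burnin <- nsim/2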
#set-up a vector for storing the Rhat statistics
rhat <- vector(mode="numeric",length=length(mean_approx))
#set-up a vector for storing the effective sample sizes
neff <- vector(mode="numeric",length=length(mean_approx))
for (k in 1:length(mean_approx)) {
##subset out the post burn-in sample for the kth parameter
betak <- storebeta[k,1:nchains,((burnin+1):nsim)]
chainmeans <- rowMeans(betak)
withinsd <- apply(betak,MARGIN=1,FUN="sd")
betweensd <- sd(chainmeans)
B <- npos*betweensd^2
W <- mean(withinsd^2)
varplus <- ((npos-1)/npos) * W + (1/npos)*B
rhat[k] <- sqrt(varplus/W)
neff[k] <- nchains*npos*(varplus/B)
}
rhat
neff ##effective sample size
##Traceplots
##useful but optional for the assignment; use code like the following
##Assumes 3 chains and looks at just the 2nd parameter
##Really needs to be repeated for each parameter
##You may also want to exclude the burn-in
x <- seq(from=1,to=nsim,by=1)
plot(x,storebeta[2,1,],col="black",type="l",
main="Traceplot for hours10_30")
lines(x,storebeta[2,2,],col="red")
lines(x,storebeta[2,3,],col="blue")
###If satisfied about convergence use the post burn-in sample for posterior inference
possamp <- storebeta[,,(burnin+1):nsim] # this is our sample from the posterior
##produce a posterior density plot for at least one of the logistic
##regression coefficients, plus a credible interval and the posterior
##probability that the coefficient is > 0.
##To subset out the posterior for a single parameter
##use code like pos_betak <- possamp[k,,];
#Functions like summary() and quantile() can then be applied
##to pos_betak and work happily even though it is an
#array rather than a vector
##It is up to you to know which parameters are of particular interest
k <- ... #Integer representing the element of the parameter vector you
#wish to focus on
posk <- possamp[k,,]
##get the posterior probability that parameter k > 0
...
##Produce a density plot
...
##Produce a density plot on the odds ratio scale
...
##Obtain a 90% credible interval for the odds ratio corresponding to parameter k
...
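##Illustrative fills, assuming posk has been extracted as above:
#mean(posk > 0) ##posterior probability that parameter k is > 0
#plot(density(as.vector(posk)),main="Posterior density, beta scale")
#plot(density(exp(as.vector(posk))),main="Posterior density, odds ratio scale")
#quantile(exp(posk),probs=c(0.05,0.95)) ##90% credible interval for the odds ratio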