CS计算机代考程序代写 ##tongue_cancer analysis

##tongue_cancer analysis
cancerdata <- read.csv("data/tongue_cancer.csv",header=TRUE) str(cancerdata) #only a small dataset so why not just print it to have a look cancerdata rates <- cancerdata$Cases / cancerdata$Population summary(rates) #may make more sense if expressed as rate per 100000 rates100000 <- rates*100000 summary(rates100000) rates.df <- data.frame(cancerdata$Demography,rates100000,rates, cancerdata$Cases,cancerdata$Population) names(rates.df) <- c("Dhb","rate_100000","rate","Cases") rates.df #What is the overall rate rawrate <- sum(cancerdata$Cases) /sum(cancerdata$Population) rawrate*100000 #3.634198 ## Basic analysis to estimate assumed common rate for all DHBs ##Simplistic model but but useful as base to compare against. ##Gamma prior a <- 3/100000 b <- 1 #Like saying prior evidence is equivalent to #one extra tiny DHB of size 1 totcases <- sum( cancerdata$Cases) totpop <- sum(cancerdata$Population) ##update to get parameters of the posterior, using conjugacy apost <- a + totcases bpost <- b + totpop ##Compute posterior summaries postmean <- apost/bpost post_median <- qgamma(0.5,shape=apost,rate = bpost) postmean * 100000 post_median * 100000 q025 <- qgamma(0.025,shape=apost,rate=bpost) q975 <- qgamma(0.975,shape=apost,rate=bpost) exact_quantiles <- 100000*c(q025,post_median,q975) exact_quantiles # Simulation approach - first look at modest size Monte Carlo sample post_lambda100 <- rgamma(n=100,shape=apost,rate=bpost) ##check quantiles post_quantiles100 <- quantile(post_lambda100,probs=c(0.025,0.5,0.975)) post_quantiles100 exact_quantiles #check_mean post_mean100 <- mean(post_lambda100) 100000*post_mean100 #3.662318 exact_mean <- apost / bpost 100000*exact_mean #3.634198 #check standard deviation post_sd100 <- sd(post_lambda100) ##Monte Carlo error for the posterior mean is MCerror <- post_sd100 / sqrt(100) MCerror #tiny #makes more sense multiplied by 100000, as per the rates themselves # Var(C\theta) = C^2 x Var(\theta) so # sd(C\theta) = C X sd(\theta) 100000 * MCerror #0.02695065, still pretty small #MC mean is about 1 MC standard error from the exact mean ##see what happens for a bigger posterior sample post_lambda1000 <- rgamma(n=1000,shape=apost,rate=bpost) post_quantiles1000 <- quantile(post_lambda1000,probs=c(0.025,0.5,0.975)) ##Compare true and simulation results exact_quantiles 100000*post_quantiles100 100000*post_quantiles1000 #tail quantiles looking pretty good by the time Monte Carlo #simulation size reaches 1000 #MC error for nsim=1000 post_sd1000 <- 100000*sd(post_lambda1000) post_sd1000 MC_error1000 <- post_sd1000/sqrt(1000) MC_error1000 post_mean1000 <- 100000*mean(post_lambda1000) post_mean1000 100000*exact_mean #So the MC mean is just over MC standard error from # the true mean #The MC error is fairly trivial though and represents # 1/sqrt(1000) = 3.2% of the posterior standard deviation # Monte Carlo for a more complex estimand and model ----------------------- #estimand is "thing to be estimated" #Instead of simple common rate model, let's go to the #other extreme and let each DHB have its own parameter; #probability that each DHB has highest rate among all DHBs. #Then we can ask questions like "what is the probability that each DHB #" has the highest underlying rate among all DHBs" #" How does a particular DHB rank in a 'league table' of rates by DHB # we will use the same prior for each DHB #\lambda ~ gamma(a,b) a = 3/100000; b=1 fulla_post <- a + cancerdata$Cases #vector fullb_post <- b + cancerdata$Population #vector fulla_post fullb_post ## rgamma is partially vectorised; Easiest to loop ## over simulations and on each iteration generate the vector of lambda ##lambda values for the 20 DHBs ## also need to work out the maximum and rank for each set of lambdas ##generated M <- 1000 ##number of draws from the posterior n <- length(rates) #number of groups - DHBs in this case ##Set-up structures for storing output post_fullambda <- matrix(nrow=M,ncol=n ) post_max <- matrix(nrow=M,ncol=n) post_rank <- matrix(nrow=M,ncol=n) for (i in 1:M ) { ##can probably draw gammas for all DHBs in one-hit fullambda <- rgamma(n,shape=fulla_post,rate=fullb_post) ranks <- rank(fullambda) ismax <- (ranks == max(ranks) ) post_fullambda[i,] <- fullambda post_rank[i,] <- ranks post_max[i,] <- ismax } ##check results ##posterior quantiles for each DHB fullpost_quantiles <- apply(post_fullambda,MARGIN=2,FUN=quantile, probs=c(0.025,0.5,0.975)) fullpost_quantiles.df <- data.frame(rates.df$Dhb,t(100000*fullpost_quantiles)) fullpost_quantiles.df <- cbind(fullpost_quantiles.df,rates.df$Cases) names(fullpost_quantiles.df) <- c("DHB","q025","q50","q975","cases") fullpost_quantiles.df ##posterior quantiles for each DHB's rank fullpost_ranks_quantiles.df <- apply(post_rank,MARGIN=2,FUN=quantile, probs=c(0.025,0.5,0.975)) fullpost_ranks_quantiles.df <- data.frame( t(apply(post_rank,MARGIN=2,FUN=quantile, probs=c(0.025,0.5,0.975)) ) ) fullpost_ranks_quantiles.df$Dhb <- rates.df$Dhb fullpost_ranks_quantiles.df names(fullpost_ranks_quantiles.df)[1:3] <- c("q025","q50","q975") fullpost_ranks_quantiles.df ##posterior probability that rate in each DHB is the maximum fullpost_max <- colMeans(post_max) fullpost_max.df <- data.frame(rates.df$Dhb,fullpost_max) names(fullpost_max.df) <- c("Dhb","prob") ##What about probability in the top 5 intop5 <- (post_rank >= 16)

Prtop5 <- colMeans((intop5) ) Prtop5.df <- data.frame(rates.df$Dhb,Prtop5) Prtop5.df