程序代写代做 graph algorithm Inference¶

Inference¶

Estimators¶
In [ ]:
# assume population is Normal(180,5)
# E[X]=180

ran1 <- rnorm(20, 180, 5) ran1 In [ ]: est1 <- mean(ran1) est2 <- min(ran1)+diff(range(ran1))/2 In [ ]: est1 est2 In [ ]: 180 - est1 180 - est2 In [ ]: est1 <- est2 <- c() for (i in 1:100) { ran1 <- rnorm(20, 180, 5) est1[i] <- mean(ran1) est2[i]<-min(ran1)+diff(range(ran1))/2 } In [ ]: plot(x=1:100,y=est1,type="p",col="red") points(x=1:100,y=est2,col="blue") abline(h=180,lty=2) In [ ]: # bias mean(est1) - 180 mean(est2) - 180 In [ ]: (mean(est1)-180)^2 (mean(est2)-180)^2 Likelihood¶ In [ ]: # a random sample of X from a population X <- as.numeric(readLines("sample20.txt")) X In [ ]: # descriptive statistics # location mean(X) median(X) range(X) In [ ]: # scale var(X) quantile(X) In [ ]: ## graphic summary hist(X) In [ ]: ## frequentist estimate of the expected value mean(X) Likelihood¶ 1) choose a statistical model: e.g. Normal distribution with unknown $\mu$ and known $\sigma$: $N(\mu, \sigma$) In [ ]: # dnorm(..., mean=mu, sd=sigma) 2) define sample space of X and its pdf and cdf In [ ]: x <- seq(160,190,0.01) In [ ]: # pdf e.g. assuming mu=170 plot(x,y=dnorm(x, mean=170, sd=5), type="l") In [ ]: # cdf e.g. assuming mu=170 plot(x,y=pnorm(x, mean=170, sd=5), type="l") 3) $P(D|\mu=170, \sigma=5)$ In [ ]: # cdf pnorm(160,mean=170,sd=5) pnorm(170,mean=170,sd=5) pnorm(180,mean=170,sd=5) In [ ]: # pdf dnorm(160,mean=170,sd=5) dnorm(170,mean=170,sd=5) dnorm(180,mean=170,sd=5) In [ ]: # but we are interested in the inverse problem! # e.g. if D=175.1 dnorm(175.1, mean=170, sd=5) In [ ]: dnorm(175.1, mean=160, sd=5) dnorm(175.1, mean=180, sd=5) In [ ]: # for any observed data, we consider P(D|theta) as a function of theta thetas <- seq(160, 190, 0.01) In [ ]: # e.g. the Likelihood of observing the first value in X given that theta=160 is dnorm(x=thetas[1] , mean=X[1], sd=5) In [ ]: # if we have more data, then the likelihood is the product across all data points, # given a certain value of theta dnorm(x=thetas[1], mean=X[1:10], sd=5) In [ ]: prod(dnorm(x=thetas[1], mean=X, sd=5)) In [ ]: # calculate the Likelihood for each possible value of theta L <- c() for (theta in thetas) L <- c(L, prod(dnorm(X,theta,5))) In [ ]: plot(x=thetas, y=L, xlab="theta", ylab="likelihood", type="l") 4) calculate MLE In [ ]: # numerical solution # 1 - grid search thetas[which.max(L)] # 2 - random search # ... # 3 - algorithm # ... In [ ]: # analytical solution mean(X) In [ ]: hist(X) In [ ]: # if you have more data points X <- as.numeric(readLines("sample1000.txt")) mean(X) thetas <- seq(160, 190, 0.01) L <- c() for (theta in thetas) L <- c(L, prod(dnorm(X,theta,5))) thetas[which.max(L)] In [ ]: logL <- c() for (theta in thetas) logL <- c(logL, sum(dnorm(X,theta,5,log=TRUE))) thetas[which.max(logL)] In [ ]: plot(x=thetas, y=logL, xlab="theta", ylab="log-likelihood", type="l") 5) likelihood ratio test In [ ]: X <- as.numeric(readLines("sample20.txt")) thetas <- seq(160, 190, 0.01) logL <- c() for (theta in thetas) logL <- c(logL, sum(dnorm(X,theta,5,log=TRUE))) plot(x=thetas, y=logL, xlab="theta", ylab="log-likelihood", type="l") thetas[which.max(logL)] In [ ]: # is MLE different than 170? max(logL) logL[which(thetas==170)] In [ ]: # chi^2 2*(max(logL)-logL[which(thetas==170)]) In [ ]: # p-value 1-pchisq(2*(max(logL)-logL[which(thetas==170)]),df=1) In [ ]: # different than 174? 1-pchisq(2*(max(logL)-logL[which(thetas==174)]),df=1) # different than 175? 1-pchisq(2*(max(logL)-logL[which(thetas==175)]),df=1) In [ ]: # with more data X <- as.numeric(readLines("sample1000.txt")) thetas <- seq(160, 190, 0.01) logL <- c() for (theta in thetas) logL <- c(logL, sum(dnorm(X,theta,5,log=TRUE))) thetas[which.max(logL)] 1-pchisq(2*(max(logL)-logL[which(thetas==170)]),df=1) 1-pchisq(2*(max(logL)-logL[which(thetas==174)]),df=1) 1-pchisq(2*(max(logL)-logL[which(thetas==175)]),df=1) 6) bootstrapping In [ ]: # bootstrapping X <- as.numeric(readLines("sample1000.txt")) thetas <- seq(160, 190, 0.01) logL <- c() for (theta in thetas) logL <- c(logL, sum(dnorm(X,theta,5,log=TRUE))) theta_mle <- thetas[which.max(logL)] theta_boot <-c() for (i in 1:100) { Xs <- sample(X, replace=T) logL <- c() for (theta in thetas) logL <- c(logL, sum(dnorm(Xs,theta,5,log=TRUE))) theta_boot[i] <- thetas[which.max(logL)] cat("\n replicate:", i, " has estimate ", theta_boot[i]) } In [ ]: hist(theta_boot) abline(v=theta_mle, lty=2) quantile(theta_boot, c(0.025,0.975)) Goodness of fit¶ In [ ]: n <- 20 O <- c(2, 4, 14) O In [ ]: f <- 8/40 E <- c(f^2, 2*f*(1-f), (1-f)^2)*n E In [ ]: chi <- sum((E-O)^2/E) chi In [ ]: x = seq(0.1,40, 0.1) plot(x, dchisq(x, df=1), type="l") abline(v=chi, lty=2) In [ ]: 1- pchisq(chi, df=1) Hypothesis testing using a t-test¶ TASK Look at these examples where we test for difference in population means between two samples. Try to understand what the output of t.test is and why it changes for the different scenarios provided. In [ ]: pop1 <- rnorm(n=100, mean=5, sd=1) pop2 <- rnorm(n=100, mean=6, sd=1) t.test(x=pop1, y=pop2, alternative="two.sided") In [ ]: pop1 <- rnorm(n=100, mean=5, sd=1) pop2 <- rnorm(n=100, mean=5.5, sd=1) t.test(x=pop1, y=pop2, alternative="two.sided") In [ ]: pop1 <- rnorm(n=100, mean=5, sd=1) pop2 <- rnorm(n=100, mean=5.5, sd=1) t.test(x=pop1, y=pop2, alternative="less") In [ ]: pop1 <- rnorm(n=100, mean=5, sd=1) pop2 <- rnorm(n=100, mean=5.5, sd=1) t.test(x=pop1, y=pop2, alternative="greater") In [ ]: pop1 <- rnorm(n=10, mean=5, sd=1) pop2 <- rnorm(n=10, mean=5.5, sd=1) t.test(x=pop1, y=pop2, alternative="two.sided") In [ ]: pop1 <- rnorm(n=100, mean=5, sd=1) pop2 <- rpois(n=100, lambda=5) t.test(x=pop1, y=pop2, alternative="two.sided") # what is the expected value of a Poisson distribution? # why is the test non significant? # what test would you use here instead?