CS计算机代考程序代写 HW3

HW3

HW3

Run Tian

10/10/2021

Q1

mydata <- read.table("HW3 Data.txt") a data1 <- mydata[, c(8, 5, 4, 16)] colnames(data1) <- c("number_of_active_physicians", "total_population", "land_area", "total_personal_income") lm1 <- lm(number_of_active_physicians~., data = data1) coef(lm1) ## (Intercept) total_population land_area ## -1.331615e+01 8.366178e-04 -6.552296e-02 ## total_personal_income ## 9.413199e-02 data2 <- mydata[, c(8, 7, 16)] colnames(data2) <- c("number_of_active_physicians", "percent_of_population_65", "total_personal_income") data2$population_density <- mydata[, 5] / mydata[, 4] lm2 <- lm(number_of_active_physicians~., data = data2) coef(lm2) ## (Intercept) percent_of_population_65 total_personal_income ## -170.57422325 6.33984064 0.12656649 ## population_density ## 0.09615889 b model II is better, according R2adj (0.9111 > 0.902). We can also use R
², because the two models have the same number of predictors.
sum1 <- summary(lm1) sum1$r.squared ## [1] 0.9026432 sum1$adj.r.squared ## [1] 0.9019733 1 KEDONG CHEN sum2 <- summary(lm2) sum2$r.squared ## [1] 0.9117491 sum2$adj.r.squared ## [1] 0.9111419 c data3 <- mydata[, c(10, 15, 11, 17)] colnames(data3) <- c("total_serious_scrmes", "per_capita_income", "percent_high_school_graduates", "geographic_region") data3$population_density <- mydata[, 5] / mydata[, 4] lm3.1 <- lm(total_serious_scrmes~.-geographic_region, data = data3[data3$geographic_region == 1, ]) lm3.2 <- lm(total_serious_scrmes~.-geographic_region, data = data3[data3$geographic_region == 2, ]) lm3.3 <- lm(total_serious_scrmes~.-geographic_region, data = data3[data3$geographic_region == 3, ]) lm3.4 <- lm(total_serious_scrmes~.-geographic_region, data = data3[data3$geographic_region == 4, ]) coef(lm3.1) ## (Intercept) per_capita_income ## -64466.231202 -1.406392 ## percent_high_school_graduates population_density ## 1182.576712 17.382928 coef(lm3.2) ## (Intercept) per_capita_income ## -4163.2672991 0.1024233 ## percent_high_school_graduates population_density ## -2.7616306 33.6192856 coef(lm3.3) ## (Intercept) per_capita_income ## 38862.666850 1.957421 ## percent_high_school_graduates population_density ## -670.884233 5.537052 coef(lm3.4) ## (Intercept) per_capita_income ## 129323.415428 4.341646 ## percent_high_school_graduates population_density ## -2159.920387 5.717118 d No, they are different for the four regions. e MSE are different. 2 KEDONG CHEN c(anova(lm3.1)$`Mean Sq`[3], anova(lm3.2)$`Mean Sq`[3], anova(lm3.3)$`Mean Sq`[3], anova(lm3.4)$`Mean Sq`[3]) ## [1] 342416635109 82704839109 7416106467 6564391280 R2 are different. c(summary(lm3.1)$r.squared, summary(lm3.2)$r.squared, summary(lm3.3)$r.squared, summary(lm3.4)$r.squared) ## [1] 0.83520492 0.52851248 0.09250839 0.08665358 f ALl box plots have outliers, which far beyond the box plot. 
resi3.1 <- resid(lm3.1) resi3.2 <- resid(lm3.2) resi3.3 <- resid(lm3.3) resi3.4 <- resid(lm3.4) boxplot(resi3.1) − 1 5 0 0 0 0 − 5 0 0 0 0 5 0 0 0 0 boxplot(resi3.2) − 1 e + 0 5 1 e + 0 5 3 KEDONG CHEN boxplot(resi3.3) − 5 0 0 0 0 5 0 0 0 0 1 5 0 0 0 0 boxplot(resi3.4) − 1 e + 0 5 1 e + 0 5 3 e + 0 5 5 e + 0 5 g data4 <- as.data.frame(cbind(serious_crime_rate = mydata[, 10] / mydata[, 5], percent_unemployment = mydata[, 14], population_density = mydata[, 5] / mydata[, 4])) lm4 <- lm(serious_crime_rate~percent_unemployment*population_density + I(percent_unemployment^2) + I(population_density^2), data = data4) coef(lm4) ## (Intercept) percent_unemployment ## 6.477098e-02 -2.977931e-03 ## population_density I(percent_unemployment^2) ## -9.170961e-07 1.629277e-04 ## I(population_density^2) percent_unemployment:population_density ## 2.698158e-12 8.333685e-07 4 h plot(lm4$fitted.values, lm4$residuals, cex = 0.4) 0.05 0.10 0.15 0.20 0.25 − 0 .0 6 − 0 .0 2 0 .0 2 0 .0 6 lm4$fitted.values lm 4 $ re si d u a ls summary(lm4)$r.squared ## [1] 0.2484749 The range of residuals is between -0.06 and 0.06. The R2 is 0.2485, it means 24.85% variation of response variable has been explained by predictor variables, so we should add more predictor variables. i H0: all quadratic and interaction terms can be dropped from the regression model. H1: at least one quadratic or interaction terms can not be dropped from the regression model. I use F test lm4.2 <- lm(serious_crime_rate~percent_unemployment+population_density, data = data4) anova(lm4.2, lm4) ## Analysis of Variance Table ## ## Model 1: serious_crime_rate ~ percent_unemployment + population_density ## Model 2: serious_crime_rate ~ percent_unemployment * population_density + ## I(percent_unemployment^2) + I(population_density^2) ## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 437 0.25186
## 2 434 0.24638 3 0.005477 3.2159 0.02278 *
## —
## Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ‘ 1

Since F = 3.2159 and the p-value = 0.02278 < 0.05, we can reject the null hypothesis H0 at the 5% level of significance. It appears that at
least one quadratic or interaction term cannot be dropped from the regression model.

5

KEDONG CHEN

KEDONG CHEN

j

data5 <- mydata[, c(8, 5, 16, 17)] colnames(data5) <- c("number_of_active_physicians", "total_population", "total_personal_income", "geographic_region") data5$geographic_region <- relevel(factor(data5$geographic_region), 4) lm5 <- lm(number_of_active_physicians~., data = data5) coef(lm5) ## (Intercept) total_population total_personal_income ## -2.074957e+02 5.514599e-04 1.070115e-01 ## geographic_region1 geographic_region2 geographic_region3 ## 1.490196e+02 1.455264e+02 1.912163e+02 k mean difference: coef(lm5)[4] - coef(lm5)[5] ## geographic_region1 ## 3.493126 standard error: sqrt(86.83^2 + 85.15^2) ## [1] 121.614 t-stat: 3.493126/121.614 ## [1] 0.02872306 90% confidence interval: 3.493126 + c(-1, 1) * 1.65 * 121.614 ## [1] -197.1700 204.1562 There is a 90% chance that the difference between the effect of the northeastern region vs. the effect of the north central region on number of active physicians will fall in the interval. l lm5.2 <- lm(number_of_active_physicians~.-geographic_region, data = data5) anova(lm5.2, lm5) ## Analysis of Variance Table ## ## Model 1: number_of_active_physicians ~ (total_population + total_personal_income + ## geographic_region) - geographic_region ## Model 2: number_of_active_physicians ~ total_population + total_personal_income + ## geographic_region ## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 437 140967081
## 2 434 139093455 3 1873626 1.9487 0.121

6

KEDONG CHEN

KEDONG CHEN

KEDONG CHEN

KEDONG CHEN

KEDONG CHEN

KEDONG CHEN

The p-value is 0.121, which is greater than 0.05, so we cannot reject H0: geographic region does not add significant explanatory power to the model.

Q2

library(wooldridge)
data(econmath)

a

The coefficient on male is 3.83. It means that if two students have the same colgpa, one male and one female,
then the male's score is, on average, 3.83 points higher than the female's.

b

3.83 + c(-1, 1) * qt(0.975, 856 - 3) * 0.74

## [1] 2.377566 5.282434

The 95% confidence interval is [2.378, 5.282]; it does not contain 0, so the coefficient on male is statistically significant at the 5% level.

c

H0: gender does not play a moderating role in the effect of colgpa on score.

H1: gender plays a moderating role in the effect of colgpa on score.

t test
tobs <- 0.479/1.383 tobs ## [1] 0.3463485 pvalue <- 2*(1-pt(tobs, 856-4)) pvalue ## [1] 0.7291663 The test statistic is 0.346 and the p-value is 0.729 > 0.05, so I cannot reject H0. It appears that gender does not play a
moderating role in the effect of colgpa on score.

d

I can expand the term: 0.479 · male · (colgpa − 2.81) = 0.479 · male · colgpa − 0.479 · 2.81 · male = 0.479 ·
male · colgpa − 1.35 · male. We can see that −1.35 is exactly the difference between the coefficient on male in the third equation and the
last equation. So the centering does not affect the other variables' coefficients.

e

actmiss <- ifelse(is.na(econmath$act), 1, 0) sum(actmiss) ## [1] 42 mean(actmiss) ## [1] 0.04906542 42 students are the ACT score missing, fraction is 0.049. 7 KEDONG CHEN KEDONG CHEN KEDONG CHEN KEDONG CHEN KEDONG CHEN f act0 <- econmath$act act0[is.na(econmath$act)] <- 0 mean(act0) ## [1] 21.98715 mean(econmath$act, na.rm = TRUE) ## [1] 23.12162 The average of act0 is 21.987. It’s smaller than the mean of act (23.122). g data1 <- econmath[, c('score', 'act')] lm1 <- lm(score~act, data = data1[complete.cases(data1), ]) coef(lm1) ## (Intercept) act ## 36.816043 1.548018 summary(lm1)$sigma ## [1] 12.26034 The slope coefficient is 1.548. the standard error 12.26. h lm2 <- lm(econmath$score~act0) coef(lm2) ## (Intercept) act0 ## 62.2926058 0.4687832 The slop coefficient is 0.469, it’s smaller than it in part (g). Because act0 has some 0, it will decrease coefficient. i lm3 <- lm(econmath$score~act0 + actmiss) coef(lm3) ## (Intercept) act0 actmiss ## 36.816043 1.548018 35.610861 The slope coefficient is 1.548. It’s same as part (g). j No, using all cases and adding the missing data estimator will increase standard error of coefficient for act. k 8 KEDONG CHEN KEDONG CHEN KEDONG CHEN KEDONG CHEN KEDONG CHEN KEDONG CHEN data1 <- cbind(data1, colgpa=econmath$colgpa) lm4.1 <- lm(score~act+colgpa, data = data1[complete.cases(data1), ]) summary(lm4.1) ## ## Call: ## lm(formula = score ~ act + colgpa, data = data1[complete.cases(data1), ## ]) ## ## Residuals: ## Min 1Q Median 3Q Max ## -39.492 -6.228 0.302 7.052 31.120 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.1189 2.8063 6.100 1.64e-09 ***
## act 0.8791 0.1164 7.552 1.16e-13 ***
## colgpa 12.5096 0.7230 17.302 < 2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 10.48 on 811 degrees of freedom ## Multiple R-squared: 0.3805, Adjusted R-squared: 0.379 ## F-statistic: 249.1 on 2 and 811 DF, p-value: < 2.2e-16 lm4.2 <- lm(econmath$score~act0 + actmiss + econmath$colgpa) summary(lm4.2) ## ## Call: ## lm(formula = econmath$score ~ act0 + actmiss + econmath$colgpa) ## ## Residuals: ## Min 1Q Median 3Q Max ## -41.732 -6.235 0.297 7.178 31.190 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.9776 2.8417 5.975 3.39e-09 ***
## act0 0.8743 0.1180 7.408 3.08e-13 ***
## actmiss 19.0859 3.2229 5.922 4.61e-09 ***
## econmath$colgpa 12.5993 0.7182 17.543 < 2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 10.66 on 852 degrees of freedom ## Multiple R-squared: 0.3699, Adjusted R-squared: 0.3677 ## F-statistic: 166.7 on 3 and 852 DF, p-value: < 2.2e-16 It doesn’t change my answer to part (j). 9 KEDONG CHEN Q1 a b c d e f g h i j k l Q2 a b c d e f g h i j k