—
title: “HW3”
output:
word_document: default
html_document: default
—
“`{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
“`
# Question 1
“`{r}
load(“401ksubs.RData”)
“`
## 1.a
the first 10 line of the data.
“`{r}
head(data,10)
“`
the summary of data
“`{r}
summary(data)
“`
five-number of each variable
“`{r}
sapply(data,quantile)
“`
Normal Q-Q Plot of income
“`{r}
qqnorm(data$inc)
qqline(data$inc)
“`
Normal Q-Q Plot of nettfa
“`{r}
qqnorm(data$nettfa)
qqline(data$nettfa)
“`
barplot of factor variable
“`{r}
library(ggpubr)
p1<-ggplot(data = data)+geom_bar(aes(x=as.factor(e401k)))
p2<-ggplot(data = data,aes(as.factor(marr)))+geom_bar()
p3<-ggplot(data = data,aes(as.factor(fsize)))+geom_bar()
p4<-ggplot(data = data,aes(as.factor(pira)))+geom_bar()
p5<-ggplot(data = data,aes(as.factor(age)))+geom_bar()
p6<-ggplot(data = data,aes(as.factor(male)))+geom_bar()
p7<-ggplot(data = data,aes(as.factor(p401k)))+geom_bar()
ggarrange(p1,p2,p3,p4,p5,p6,p7,ncol = 2,nrow = 4)
```
Histogram
```{r}
library(ggplot2)
ggplot(data = data,aes(inc))+geom_histogram(binwidth = 10)
ggplot(data = data,aes(nettfa))+geom_histogram(binwidth = 10)
ggplot(data,aes(inc))+geom_density()
ggplot(data=data,aes(inc,nettfa))+geom_point()+geom_smooth()
ggplot(data=data,aes(incsq,nettfa))+geom_point()+geom_smooth()
```
boxplot
```{r}
ggplot(data = data,aes(y=inc))+geom_boxplot()
ggplot(data = data,aes(y=nettfa))+geom_boxplot()
ggplot(data = data,aes(y=fsize))+geom_boxplot()
ggplot(data = data,aes(y=age))+geom_boxplot()
ggplot(data = data,aes(y=incsq))+geom_boxplot()
ggplot(data = data,aes(y=agesq))+geom_boxplot()
```
scatter matrix of the data
## 1.b
After transformation on inc and nettfa, from
the density curve and scatter plot, we can see that inc_ln and nettfa_ln's relationship is more linear.
```{r}
data$inc_ln<-log(data$inc)
data$nettfa_ln<-log(data$nettfa)
data<-data[data$nettfa!=0,]
ggplot(data=data,aes(inc))+geom_density()
ggplot(data=data,aes(inc_ln))+geom_density()
ggplot(data=data,aes(nettfa_ln))+geom_density()
ggplot(data=data,aes(inc,nettfa))+geom_point()+geom_smooth()
ggplot(data=data,aes(inc_ln,nettfa_ln))+geom_point()+geom_smooth()
```
## 1.c
```{r}
model_c = lm(nettfa~inc+age+e401k, data=data)
summary(model_c)
```
We can see that the p-value for coefficents are much less than 0.05,
we reject the null hypothesis that coefficient is 0. There is a significant relationship between nettfa and income, age, e401k.
The coefficient are all positive which indicates that income, age and
e401k are positve related to the nettfa. It conforms our intuition that people with larger income tend to have larger net financial wealth.
Older people tend to have larger net financial wealth. People eligible in e401k tend to have larger net financial wealth.
## 1.d
```{r}
plot(model_c$residuals, model_c$fitted.values, xlab="residual", ylab="fitted nettfa", title("residual vs fitted nettfa"))
```
```{r}
plot(data$nettfa, model_c$fitted.values, xlab="actual nettfa", ylab="fitted nettfa", title("actual nettfa vs fitted nettfa"))
```
From the two plots, we can see that for majority cases where the absolute value of net wealth not too large, the model fits them very well. For data with large absolute value of net wealth, the model has large error.
## 1.e
```{r}
model_e = lm(nettfa~inc + age + e401k + I(inc^2)+I(age^2), data=data)
summary(model_e)
```
Now the coefficient for e401k become larger.
## 1.f
```{r}
model_f = lm(nettfa~ inc + age + e401k + I(inc^2) + I(age^2) + (e401k:I(age - 41)) + (e401k:I((age - 41)^2)) , data=data)
summary(model_f)
```
The p-value for e401k:I(age - 41) is much less 0.05, whereas p value for e401k:I((age - 41)^2) is much larger than 0.05. Thus e401k:I(age - 41) is significant and e401k:I((age - 41)^2) is not.
I suggest to keep the significant term e401k:I(age - 41). It is better to remove non-significant term and keep the significant one.
## 1.g
The coefficient of e401k in (c) is 5.98908 and it is 9.7046880 in (e). The estimated effects
of e401k become larger.
## 1.h
```{r}
data$fsize1 = rep(0, nrow(data))
data$fsize2 = rep(0, nrow(data))
data$fsize3 = rep(0, nrow(data))
data$fsize4 = rep(0, nrow(data))
data$fsize5 = rep(0, nrow(data))
data$fsize1[data$fsize == 1] = 1
data$fsize2[data$fsize == 2] = 1
data$fsize3[data$fsize == 3] = 1
data$fsize4[data$fsize == 4] = 1
data$fsize5[data$fsize >= 5] = 1
model_h = lm(nettfa~inc + age + e401k + I(inc^2)+I(age^2) + fsize1 + fsize3 + fsize4 + fsize5, data=data)
summary(model_h)
“`
We can see that coefficients for fsize3, fsize4 and fsize5 are significant and coefficient for
fsize1 is not significant.
The coefficients for fsize3, fsize4 and fsize5 are all negative indicating large family size tend to have smaller net financial wealth.
## 1.i
model_f = lm(nettfa~ inc + age + e401k + I(inc^2) + I(age^2) + (e401k:I(age – 41)) + (e401k:I((age – 41)^2)) , data=data)
summary(model_f)
“`{r}
m1 = lm(nettfa~ inc + age + e401k + I(inc^2) + I(age^2) + fsize1, data = data)
m2 = lm(nettfa~ inc + age + e401k + I(inc^2) + I(age^2) + fsize2, data = data )
m3 = lm(nettfa~ inc + age + e401k + I(inc^2) + I(age^2) + fsize3, data = data)
m4 = lm(nettfa~ inc + age + e401k + I(inc^2) + I(age^2) + fsize4, data = data)
m5 = lm(nettfa~ inc + age + e401k + I(inc^2) + I(age^2) + fsize5, data = data)
“`
“`{r}
anova(model_e, m1, m2, m3, m4, m5)
“`
## 1.j
“`{r}
library(“margins”)
margin = margins(model_f)
summary(margin)
“`
“`{r}
plot(margin)
“`
From the plot, we can see that the marginal effect of age are much larger than inc and e401k.
## 1.k
“`{r}
library(plot3D)
scatter3D(data$age, data$inc, data$nettfa)
“`
“`{r}
plot(data$age, data$nettfa)
“`
“`{r}
plot(data$inc, data$nettfa)
“`
We can see that the optimal level of net financial wealth is about 1500k.
## 1.l
“`{r}
library(“margins”)
margin2 = margins(model_h)
summary(margin2)
plot(margin2)
“`
## 1.m
“`{r}
model_m = lm(nettfa_ln ~ inc_ln + age + e401k + I(inc_ln^2) + I(age^2) + (e401k:I(age – 41)) + (e401k:I((age – 41)^2)) , data=data)
summary(model_m)
“`
We can see that the new model’s Adjusted R-squared 0.358 is much larger than
the model in F which is 0.2036. I prefer this new model.
## 1.n
“`{r}
model_n = lm(nettfa~ inc + age + e401k + I(inc^2) + I(age^2) + (e401k:I(age – 41)) + (e401k:I((age – 41)^2)) + marr + male , data=data)
summary(model_n)
“`
“`{r}
anova(model_f, model_n)
“`
By intuition, whether the person is married or is man has impact on the net wealth. Thus I add two predictors `marr` and `male` to the model.
Then I use `anova` to compare the two models. The p value is much less than
0.001, thus the new model is significantly better than the model in (f).
## 1.o
“`{r}
set.seed(8888)
shuffled<-data[sample(nrow(data)),]
folds <- cut(seq(1,nrow(data)),breaks=5,labels=FALSE)
mses = rep(0, 5)
for(i in 1:5) {
testInds <- which(folds==i,arr.ind=TRUE)
testData <- shuffled[testInds, ]
trainData <- shuffled[-testInds, ]
m = lm(nettfa~ inc + age + e401k + I(inc^2) + I(age^2) + (e401k:I(age - 41)) + (e401k:I((age - 41)^2)) + marr + male , data=trainData)
pred = predict(m, testData)
mses[i] = mean( (testData$nettfa-pred )^2)
}
mses
mean(mses)
```
I conduct the five cross validation on the model from (n).
The average mean squared errors across the 5 folds is 3268.31.
# Question 2
## 2.a
```{r}
german_healthcare_usage <- read.csv("german_healthcare_usage.csv")
```
```{r}
# ALC ADDON
model_2_a = lm(DOCVIS ~ HEALTHY + HANDPER + LOGINC + EDUC + WORKING + PUBLIC, data = german_healthcare_usage)
summary(model_2_a)
```
I choose this model. I think self report health (HEALTHY) and degree of handicap (HANDPER)
has direct impact on doctor visits. Other factors such as income, years of education, whether
employed, insured will also impact the doctor visits.
From the model output, we can see that p-value for all coefficietns are less than 0.05. They are all significant. The Multiple R-squared is 0.112 and the Adjusted R-squared is 0.1118.
## 2.b
### 2.b.i
The control group is all men observation, the treatment group is all women observation, the null hypothesis is $H_0:\delta \geq0$, versus $H_1:\delta<0$.
```{r}
female<-german_healthcare_usage[which(german_healthcare_usage$FEMALE==1),]
male<-german_healthcare_usage[which(german_healthcare_usage$FEMALE==0),]
female$before<-as.numeric(female$YEAR1984|female$YEAR1985|female$YEAR1986)
female$after<-as.numeric(female$YEAR1987|female$YEAR1988|female$YEAR1991|female$YEAR1994)
male$before<-as.numeric(male$YEAR1984|male$YEAR1985|male$YEAR1986)
male$after<-as.numeric(male$YEAR1987|male$YEAR1988|male$YEAR1991|male$YEAR1994)
control_before<-male[which(male$before==1),]
control_after<-male[which(male$before==0),]
treatment_before<-female[which(female$before==1),]
treatment_after<-female[which(female$before==0),]
D_in_D<-(mean(treatment_after$DOCVIS)-mean(control_after$DOCVIS))-(mean(treatment_before$DOCVIS)-mean(control_before$DOCVIS))
D_in_D
```
The Differences-in-Differences estimator on women is less than 0, the policy is worked for women.
### 2.b.ii
The control group is all unemployed observation, the treatment group is all employed observation, the null hypothesis is $H_0:\delta \geq0$, versus $H_1:\delta<0$.
```{r}
unemployed<-german_healthcare_usage[which(german_healthcare_usage$UNEMPLOY==1),]
employed<-german_healthcare_usage[which(german_healthcare_usage$UNEMPLOY==0),]
unemployed$before<-as.numeric(unemployed$YEAR1984|unemployed$YEAR1985|unemployed$YEAR1986)
unemployed$after<-as.numeric(unemployed$YEAR1987|unemployed$YEAR1988|unemployed$YEAR1991|unemployed$YEAR1994)
employed$before<-as.numeric(employed$YEAR1984|employed$YEAR1985|employed$YEAR1986)
employed$after<-as.numeric(employed$YEAR1987|employed$YEAR1988|employed$YEAR1991|employed$YEAR1994)
control_before<-employed[which(employed$before==1),]
control_after<-employed[which(employed$before==0),]
treatment_before<-unemployed[which(unemployed$before==1),]
treatment_after<-unemployed[which(unemployed$before==0),]
D_in_D<-(mean(treatment_after$DOCVIS)-mean(control_after$DOCVIS))-(mean(treatment_before$DOCVIS)-mean(control_before$DOCVIS))
D_in_D
```
The Differences-in-Differences estimator on unemployed is less than 0, the policy is worked for unemployed.
## 2.c
The null hypothesis is $H_0:DOCVIS_{women}=DOCVIS_{men}$, the alternative hypothesis is $H_1:DOCVIS_{women}\neq DOCVIS_{men}$, we perform a Two Sample t-test here:
```{r}
men<-german_healthcare_usage[which(german_healthcare_usage$FEMALE==0),]
t.test(female$DOCVIS,men$DOCVIS, alternative = c("greater"))
```
The p value of Two Sample t-test is less than 0.05, which means that there is a reason to reject the null hypothesis, the number of doctor visits a patient has over a 3 month period is greater for women than for men.
## 2.d
We are interested in the effect of policy on civil servant, the regression model is $DOCVIS=\beta_0+\beta_1T+\beta_2S+\beta_3(T*S)+\varepsilon$, where $T$ is a dummy variable for the period, equal tp 1 when after, and $S$ is a dummy variable for group membership, equal to 1 when treatment.
```{r}
beamt<-german_healthcare_usage[which(german_healthcare_usage$BEAMT==1),]
beamt$T<-as.numeric(beamt$YEAR1987|beamt$YEAR1988|beamt$YEAR1991|beamt$YEAR1994)
beamt$T_DOCTOR<-beamt$T*beamt$DOCTOR
reg2<-lm(DOCVIS~DOCTOR+T+T_DOCTOR,data = beamt)
summary(reg2)
```
The Differences-in-Differences estimator on civil servant is -0.058 less than 0, the policy is worked for civil servant, and it is significant at 0.1 significant level.