#install.packages("tidyverse") # install package
library(tidyverse) # load package
## Load the TB burden data set from a csv file (path is relative to the
## current working directory)
TB <- read.csv(file = "TB_burden_countries_2020-01-09.csv")
# Open the imported data frame in the data viewer
View(TB)
# summary() prints basic descriptive statistics for every column,
# but its output is not easy to read
TB %>%
  summary()
# The stargazer package produces a better-formatted summary table
library(stargazer)
stargazer(TB, type = "text")
# Exercise: how can you add the median to the stargazer output? (see ?stargazer)
# Two of the most useful tidyverse verbs are select() and filter().
# select() keeps a subset of the columns,
# e.g., every column from 'country' through 'e_pop_num'
TB1 <- select(TB, country:e_pop_num)
View(TB1)
# filter() keeps only the observations (rows) that satisfy a condition,
# e.g., keep only the rows whose region (g_whoregion) is "EUR".
# Note: string literals need straight ASCII quotes; typographic quotes
# pasted from a word processor are a syntax error in R.
TB2 <- TB %>%
  filter(g_whoregion == "EUR")
View(TB2)
# We can also apply both transformations in one pipeline.
# filter() runs after select() here, so g_whoregion must be one of the
# columns kept by select().
TB3 <- TB %>%
  select(country:e_pop_num) %>%
  filter(g_whoregion == "EUR")
View(TB3)
# With base R we would have needed the following code to get TB3:
# TB3 <- TB[TB$g_whoregion == "EUR", 1:7]
# which gets complicated and hard to read as we chain more operations on TB
# Another useful function is gather(): it reshapes data from wide to long
# by stacking several columns one below the other.
# Below, every column from 'e_mort_exc_tbhiv_100k' to 'e_mort_num_hi' is
# stacked: the original column name goes into 'e_mort_key' and the
# corresponding value goes into 'e_mort_value'.
# Note: tidyr marks gather() as superseded; pivot_longer() is its successor.
TB4 <- TB %>%
  gather(
    e_mort_exc_tbhiv_100k:e_mort_num_hi,
    key = "e_mort_key",
    value = "e_mort_value"
  )
View(TB4)
# We can also chain gather() with other verbs such as select():
# first keep only the identifying columns and the mortality columns,
# then stack the mortality columns into key/value pairs.
# The result is not assigned, so it is printed to the console.
TB %>%
  select(country:e_pop_num, e_mort_exc_tbhiv_100k:e_mort_num_hi) %>%
  gather(
    e_mort_exc_tbhiv_100k:e_mort_num_hi,
    key = "e_mort_key",
    value = "e_mort_value"
  )
# mutate() adds new variables computed from existing ones,
# e.g., a new variable holding the population divided by 100k
TB5 <- mutate(TB, e_pop_100k = e_pop_num / 1e5)
View(TB5)
# group_by() makes verbs such as mutate() operate within groups,
# e.g., compute the average population for each region and year
# and attach it to every row as a new variable.
# mean() returns NA for any group that contains a missing e_pop_num.
# ungroup() drops the grouping afterwards so that later operations on TB6
# are not silently performed per group.
TB6 <- TB %>%
  group_by(g_whoregion, year) %>%
  mutate(avg_pop = mean(e_pop_num)) %>%
  ungroup()
View(TB6)
# If we only want one summary row per group, summarise() replaces mutate():
# the result has one row for each region/year combination
summarise(
  group_by(TB, g_whoregion, year),
  avg_pop = mean(e_pop_num)
)
# Finally, we can also chain other R functions, such as lm(), in a pipeline.
# e_pop_mil is e_pop_num divided by one million.
# The dot (.) marks where the piped data frame goes: lm() takes the formula
# as its first argument, so the data must be passed explicitly via 'data'.
TB %>%
mutate(e_pop_mil = e_pop_num/10^6) %>%
lm(e_inc_100k ~ e_pop_mil, data = .) %>%
summary()
# We can use stargazer() instead of summary() to get a better-formatted
# regression table; type = "text" prints it as plain text in the console.
# The dot (.) passes the piped data frame to lm()'s 'data' argument.
TB %>%
  mutate(e_pop_mil = e_pop_num / 10^6) %>%
  lm(e_inc_100k ~ e_pop_mil, data = .) %>%
  stargazer(type = "text")
# Lastly, group_by() can be combined with lm() to fit one linear model per
# group. This relies on the broom package (it must be installed), whose
# tidy() turns a fitted model into a data frame of coefficients.
library(broom)
summary.ols <- TB %>%
  mutate(e_pop_mil = e_pop_num / 1e6) %>%
  group_by(country) %>%
  # .x is the subset of rows belonging to the current country
  group_modify(~ tidy(lm(e_inc_100k ~ e_pop_mil, data = .x)))
View(summary.ols)
# Questions
# Is it sensible to remove all NAs in dataset 'TB'?
# Take e_mort_100k as response, can you build a regression model based on this dataset? Which variables would you choose as covariates?
# Hint: Use the TB_data_dictionary file to understand the meaning of each variable.
# Which model has the smallest MSE?