Provide download options in this chunk.
download <- list()
download$corona_data <- TRUE
download$wdi_data <- FALSE
download$qog_data <- FALSE
download$jhu_data <- FALSE
download$polity_data <- FALSE
download$swiid_data <- FALSE
download$resp_data <- FALSE
download$epr_data <- FALSE
download$who_data <- FALSE
download$weather_data <- FALSE
download$dpi_data <- FALSE
download$parlgov_data <- FALSE
download$wiki_data <- FALSE
download$ipu_data <- FALSE
download$ilo_data <- FALSE
download$iefs_data <- FALSE
download$mobility_data <- FALSE
download$stringency_data <- FALSE
download$vdem_data <- FALSE
download$parlgov_data <- FALSE
download$excess_deaths <- FALSE
download$marpor_data <- FALSE # Keep FALSE unless API key available
download$wvs_data <- FALSE # Keep FALSE unless needed data sets are
# downloaded manually and placed in the
# needed folders (see instructions in
# the sections below)
# If `corona_data` is predefined in `0_master`, use this value
if(exists("corona_data")) download$corona_data <- corona_data
Download data from public sources.
Provide data download options in this chunk. Setting any of the values to TRUE
will result in the corresponding data set to be downloaded from the Internet. Keeping a value as FALSE
will result in it being read from the saved data sets in our repository.
If the corona_data
object is pre-defined in the 0_master.Rmd
file, that value is carried over here, and overrules the preference expressed here. If the download
list has already been defined in 1_1_dataprep.Rmd
this chunk will not be evaluated.
Script was last tested, including downloading of data on the predictors used in the model, on November 16, 2020.
download <- list()
download$corona_data <- FALSE
download$wdi_data <- FALSE
download$qog_data <- FALSE
download$jhu_data <- FALSE
download$polity_data <- FALSE
download$swiid_data <- FALSE
download$resp_data <- FALSE
download$epr_data <- FALSE
download$who_data <- FALSE
download$weather_data <- FALSE
download$dpi_data <- FALSE
download$wiki_data <- FALSE
download$ipu_data <- FALSE
download$ilo_data <- FALSE
download$iefs_data <- FALSE
download$mobility_data <- FALSE
download$stringency_data <- FALSE
download$vdem_data <- FALSE
download$parlgov_data <- FALSE
download$excess_deaths <- FALSE
download$marpor_data <- FALSE # Keep FALSE unless API key available
download$wvs_data <- FALSE # Keep FALSE unless needed data sets are
# downloaded manually and placed in the
# needed folders (see instructions in
# the sections below)
# If `corona_data` is predefined in `0_master`, use this value
if(exists("corona_data")) download$corona_data <- corona_data
This data reports only cases and deaths per day. In the code below we compute cumulative case counts and calculate growth rates. We download the data available on the day we reported our latest set of results in the manuscript, June 10, 2020.
The data is produced by the European Centre for Disease Prevention and Control (ECDC), which is also the owner of the data. The ECDC’s copyright policy permits the redistribution of the data.
if(download$corona_data){
# url <- paste("https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-",
# format(Sys.time(), "%Y-%m-%d"),
# ".xlsx",
# sep = "")
url <- "https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-2020-12-14.xlsx"
GET(url,
authenticate(":", ":", type="ntlm"),
write_disk(tf <- tempfile(fileext = ".xlsx")))
write.csv(read_excel(tf),
file = "saved/corona_download.csv",
row.names = FALSE)
# Weekly data
read.csv("https://opendata.ecdc.europa.eu/covid19/nationalcasedeath/csv",
na.strings = "",
fileEncoding = "UTF-8-BOM") %>%
write.csv(file = "saved/corona_download_weekly.csv",
row.names = FALSE)
}
Code incorporates custom function generously contributed by Haoyu Zhai (EUI).
# Reshape and clean up the data
w1.2020 <- ymd("2019-12-30") # first week of 2020 (iso-8601)
w1.2021 <- ymd("2021-01-04") # first week of 2021 (iso-8601)
# First week of 2020 is considered as having started on December 30 (Monday)
corona <- read.csv("saved/corona_download_weekly.csv") %>%
dplyr::select(-rate_14_day, -source, -cumulative_count) %>%
pivot_wider(names_from = "indicator",
values_from = "weekly_count") %>%
rename(population_2019 = population,
country_id = country_code) %>%
mutate(date = case_when(grepl("2020", year_week) ~ yw_to_date(year_week,
w1 = w1.2020), # 2020 dates
grepl("2021", year_week) ~ yw_to_date(year_week,
w1 = w1.2021), # 2021 dates
TRUE ~ NA_Date_
)) %>%
mutate(date_rep = as.character(date)) %>%
filter(!(country %in% c("Africa (total)", "America (total)",
"Asia (total)", "EU/EEA (total)",
"Europe (total)", "Oceania (total)"))) %>%
mutate(country = as.character(country),
country_id = as.character(country_id))
# Minimum is December 30, 2019, so this should cover the entire
# time period
drange <- c(seq(min(corona$date),
max(corona$date),
by = "week"))
# 2-letter country ID should be avoided due to Namibia (NA) problem
rectangle <- expand_grid(unique(corona$country_id), drange)
names(rectangle) <- c("country_id", "date")
rectangle <- rectangle %>%
mutate(month = format(as.POSIXct(date), "%m"),
day = format(as.POSIXct(date), "%d"),
year = format(as.POSIXct(date), "%Y"),
elapsed = as.integer(-1 + date - min(date)))
corona <- left_join(rectangle, corona) %>%
arrange(country_id, elapsed) %>%
mutate(deaths = as.numeric(deaths),
cases = as.numeric(cases),
deaths = if_else(is.na(deaths), 0, deaths),
cases = if_else(is.na(cases), 0, cases)) %>%
mutate(date_rep = as.character(date)) %>%
group_by(country_id) %>%
mutate(cases_cum = cumsum(cases),
deaths_cum = cumsum(deaths),
deaths_cum_log = log(1+deaths_cum),
deaths_cum_l7 = lag(deaths_cum, n = 1, order_by = elapsed),
deaths_cum_g7 = lag(deaths_cum/deaths_cum_l7 - 1, order_by = elapsed),
deaths_cum_g7 = ifelse(deaths_cum_g7 == Inf, NA, deaths_cum_g7)) %>%
ungroup() %>%
mutate(country = as.character(country),
country_id = as.character(country_id),
country_id = if_else(country_id == "XKX", "RKS", country_id)) %>%
filter(!is.na(country_id)) %>% # Remove "Western Sahara"
group_by(country_id) %>%
mutate(continent = na.aggregate(continent, FUN = .mode)) %>%
ungroup()
corona <- corona %>%
group_by(country_id) %>%
mutate(country = .mode(na.omit(country))) %>%
ungroup()
write.csv(corona,
file = "saved/corona_df.csv",
row.names = FALSE)
rm(corona)
# HT reshape code from https://www.r-bloggers.com/tidying-the-new-johns-hopkins-covid-19-time-series-datasets/
confirmed_raw <- read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv")
deaths_raw <- read_csv("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv")
.jhd_to_long <- function(df) {
df_str <- deparse(substitute(df))
var_str <- substr(df_str, 1, str_length(df_str) - 4)
df %>% group_by(`Country/Region`) %>%
filter(`Country/Region` != "Cruise Ship") %>%
select(-`Province/State`, -Lat, -Long) %>%
mutate_at(vars(-group_cols()), sum) %>%
distinct() %>%
ungroup() %>%
rename(country = `Country/Region`) %>%
pivot_longer(
-country,
names_to = "date_str",
values_to = var_str
) %>%
mutate(date = mdy(date_str)) %>%
select(country, date, !! sym(var_str))
}
jh_covid19_data <- .jhd_to_long(confirmed_raw) %>%
full_join(.jhd_to_long(deaths_raw))
jh_covid19_data <- jh_covid19_data %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c"),
country_id = if_else(country=="Kosovo", "RKS", country_id)) %>%
dplyr::select(-country)
write.csv(jh_covid19_data,
file = "./saved/c19_data_JHU_df.csv",
row.names = FALSE)
rm(jh_covid19_data)
Retrieve data from the World Development Indicators.
wdi <- WDI(
country = "all",
indicator = c(
gov_effect = "GE.EST",
trade = "NE.TRD.GNFS.ZS",
ineq = "SI.DST.10TH.10",
gdp_pc = "NY.GDP.PCAP.PP.KD",
pop_tot = "SP.POP.TOTL",
older_m = "SP.POP.65UP.MA.IN",
older_f = "SP.POP.65UP.FE.IN",
air_travel = "IS.AIR.PSGR",
fdi = "BX.KLT.DINV.CD.WD",
pop_density = "EN.POP.DNST",
urban = "EN.URB.MCTY.TL.ZS",
pop_below14_2018 = "SP.POP.0014.TO.ZS"),
start = 2018, end = 2018, extra = FALSE, cache = NULL)
# Migration share from 2015
wdi <- left_join(wdi, WDI(country = "all",
indicator = c(migration_share = "SM.POP.TOTL.ZS"),
start = 2015, end = 2015,
extra = FALSE, cache = NULL) %>%
select(iso2c, migration_share))
## Oil from 2017
wdi <- left_join(wdi, WDI(country = "all",
indicator = c(oil = "NY.GDP.PETR.RT.ZS",
life_exp_2017 = "SP.DYN.LE00.IN"),
start = 2017, end = 2017,
extra = FALSE, cache = NULL) %>%
select(iso2c, oil, life_exp_2017))
socin <- WDI(country = "all",
indicator = c(soc_insur_cov ='PER_SI_ALLSI.COV_POP_TOT'),
start = 2010, end = 2018) %>%
filter(!is.na(soc_insur_cov)) %>%
group_by(country) %>%
arrange(year) %>%
mutate(soc_insur_cov = last(soc_insur_cov)) %>%
ungroup() %>%
distinct(country, .keep_all = T)
wdi <- left_join(wdi, socin %>% dplyr::select(iso2c, soc_insur_cov))
## Social contributions
soccon <- WDI(country = "all",
indicator = c(soc_contrib ='GC.REV.SOCL.ZS'),
start = 2010, end = 2018) %>%
filter(!is.na(soc_contrib)) %>%
group_by(country) %>%
arrange(year) %>%
mutate(soc_contrib = last(soc_contrib)) %>%
ungroup() %>%
distinct(country, .keep_all = T)
wdi <- left_join(wdi, soccon %>% dplyr::select(iso2c, soc_contrib))
## Social safety
soc_safety <- WDI(country = "all",
indicator = c(soc_safety ='PER_SA_ALLSA.COV_POP_TOT'),
start = 2010, end = 2018) %>%
filter(!is.na(soc_safety)) %>%
group_by(country) %>%
arrange(year) %>%
mutate(soc_safety = last(soc_safety)) %>%
ungroup() %>%
distinct(country, .keep_all = T)
wdi <- left_join(wdi, soc_safety %>% dplyr::select(iso2c, soc_safety)) %>%
mutate(country_id = countrycode(iso2c,
origin = "iso2c",
destination = "iso3c")) %>%
mutate(country_id = if_else(country=="Netherlands Antilles", "ANT", country_id),
country_id = if_else(country=="Namibia", "NAM", country_id),
country_id = if_else(country=="Kosovo", "RKS", country_id)) %>%
dplyr::select(-c(year, iso2c, country)) %>%
filter(!(is.na(country_id)))
write.csv(wdi,
file = "saved/wdi_df.csv",
row.names = FALSE)
rm(wdi)
Retrieve indicators from the Quality of Government data, January 2020 version.
qogDF <- read.csv(url("http://www.qogdata.pol.gu.se/data/qog_std_cs_jan20.csv"),
header = TRUE) %>%
mutate(country = as.character(cname)) %>%
dplyr::select(country, vdem_libdem, al_ethnic2000,
al_religion2000, fe_etfra, vdem_mecorrpt) %>%
rename(al_etfra = al_ethnic2000,
al_religfra = al_religion2000)
# Recoding of country names
qogDF <- qogDF %>%
mutate(country = recode(country,
"Brunei"= "Brunei Darussalam",
"Cape Verde" = "Cabo Verde",
"Taiwan" = "Taiwan, China",
"Congo" = "Congo, Rep.",
"Congo, Democratic Republic" = "Congo, Dem. Rep.",
"Cyprus (1975-)" = "Cyprus",
"Ethiopia (1993-)" = "Ethiopia",
"France (1963-)" = "France",
"Gambia" = "Gambia, The",
"Iran" = "Iran, Islamic Rep.",
"Korea, South" = "Korea, Rep.",
"Kyrgyzstan" = "Kyrgyz Republic",
"Malaysia (1966-)" = "Malaysia",
"Pakistan (1971-)" = "Pakistan",
"Russia" = "Russian Federation",
"St Vincent and the Grenadines" = "St. Vincent and the Grenadines",
"Slovakia" = "Slovak Republic",
"Sudan (2012-)" = "Sudan",
"Syria" = "Syrian Arab Republic",
"Egypt" = "Egypt, Arab Rep.",
"Venezuela" = "Venezuela, RB")) %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
mutate(country_id = if_else(country=="Micronesia", "FSM", country_id)) %>%
dplyr::select(-country)
write.csv(qogDF,
file = "./saved/qog_df.csv",
row.names = FALSE)
rm(qogDF)
Retrieve Polity score from the Polity IV data.
download.file("http://www.systemicpeace.org/inscr/p4v2018.xls",
"./saved/p4v2018.xls",
mode = "wb",
quiet = TRUE)
polityDF <- read_excel(path = "./saved/p4v2018.xls",
col_names = TRUE)
# Cleanup and ID recoding
polityDF <- polityDF %>%
dplyr::select(scode, year, polity2) %>%
filter(year==2018) %>%
mutate(
country_id = countrycode(sourcevar = scode,
origin = "p4c",
destination = "iso3c"),
country_id = if_else(scode=="KOS", "RKS", country_id)) %>%
rename(polity = polity2) %>%
dplyr::select(-c(scode,year))
write.csv(polityDF,
file = "./saved/polity4_df.csv",
row.names = FALSE)
rm(polityDF)
file.remove("./saved/p4v2018.xls")
Get the Gini index of net income inequality from the SWIID data, version 8.2.
download.file("https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/LM4OWF/DBWK5H",
"./saved/swiid8_2.zip",
mode = "wb",
quiet = TRUE)
unzip("./saved/swiid8_2.zip",
files = "swiid8_2/swiid8_2_summary.csv",
exdir = "./saved")
file.remove("./saved/swiid8_2.zip")
file.copy(from = "./saved/swiid8_2/swiid8_2_summary.csv",
to = "./raw/swiid8_2_summary.csv")
file.remove("./saved/swiid8_2/swiid8_2_summary.csv")
unlink("./saved//swiid8_2", recursive = TRUE)
swiidDF <- read.csv(file = "./raw/swiid8_2_summary.csv",
header = TRUE)
swiidDF <- swiidDF %>%
dplyr::select(country, year, gini_disp) %>%
rename(gini = gini_disp) %>%
filter(year>=2010) %>%
group_by(country) %>%
summarise(gini = mean(gini, na.rm = TRUE)) %>%
mutate(country_id = countrycode(sourcevar = country,
origin = "country.name",
destination = "iso3c")) %>%
mutate(country_id = if_else(country=="Kosovo", "RKS", country_id)) %>%
mutate(country_id = if_else(country=="Micronesia", "FSM", country_id)) %>%
mutate(country_id = if_else(country=="São Tomé and PrÃncipe", "STP", country_id)) %>%
dplyr::select(-c(country))
write.csv(swiidDF,
file = "./saved/swiid_df.csv",
row.names = FALSE)
rm(swiidDF)
file.remove("./raw/swiid8_2_summary.csv")
Include information on the policy responses countries adopt, from ACAPS.
download.file("https://www.acaps.org/sites/acaps/files/resources/files/acaps_covid19_government_measures_dataset.xlsx",
"./saved/acaps_current.xlsx",
mode = "wb",
quiet = TRUE)
resp <- read_excel(path = './saved/acaps_current.xlsx',
sheet = 2,
col_names = TRUE) %>%
filter(is.na(ADMIN_LEVEL_NAME)) %>% ## only national policies
mutate(date = as.Date(DATE_IMPLEMENTED))
# Cleanup
file.remove("./saved/acaps_current.xlsx")
## Change column names to lower
colnames(resp) <- tolower(colnames(resp))
## Drop some more vars
resp <- resp %>%
dplyr::select(id, country, region, category, measure, date) %>%
mutate(measure = str_squish(measure))
## Need to create larger categories
lockdown <- c('Partial lockdown', 'Full lockdown')
distancing <- c('Limit public gatherings', 'Schools closure',
'Public services closure', 'Curfews')
## Dummy indicators for this
resp <- resp %>%
mutate(lockdown_measures = if_else(measure %in% lockdown, 1, 0),
distancing_measures = if_else(measure %in% distancing, 1, 0)) %>%
group_by(country, region, date) %>%
summarise(lockdown_measures = sum(lockdown_measures, na.rm = T),
distancing_measures = sum(distancing_measures, na.rm = T)) %>%
ungroup() %>%
mutate(distancing_bin = if_else(distancing_measures > 0 ,1, 0),
lockdown_bin = if_else(lockdown_measures > 0, 1, 0)) %>%
mutate(country = if_else(country=="CAR", "Central African Republic", country),
country_id = countrycode(country,
origin = "country.name",
destination = "iso3c"),
country_id = if_else(country=="Micronesia", "FSM", country_id)) %>%
dplyr::select(-country)
## Merge with corona structure
cor_merge <- read.csv("saved/corona_df.csv") %>%
dplyr::select(country_id, date_rep) %>%
filter(!is.na(date_rep) & !is.na(country_id)) %>%
mutate(date = as.Date(as.character(date_rep))) %>%
dplyr::select(-date_rep)
## Merge
cor_merge <- cor_merge %>%
left_join(resp)%>%
arrange(country_id, date) %>%
mutate_at(vars(lockdown_measures, distancing_measures),
list(~if_else(is.na(.), 0, .)))
## Get cumulative sum of measures, and then generate dummy
## indicators
cor_merge <- cor_merge %>%
group_by(country_id) %>%
mutate(lockdown_n = cumsum(lockdown_measures),
distancing_n = cumsum(distancing_measures),
distancing_bin = if_else(distancing_n > 0 ,1, 0),
lockdown_bin = if_else(lockdown_n > 0 ,1, 0)) %>%
dplyr::select(-matches('measures'))
## Days since first distancing / lock down
cor_merge_policy <- cor_merge %>%
group_by(country_id) %>%
mutate(days_rel_ldown = date - date[lockdown_bin == 1][1],
days_rel_dstng = date - date[distancing_bin == 1][1]) %>%
ungroup() %>%
dplyr::select(country_id, date, matches('distancing|lockdown'))
# Azerbaijan has 2 observations for "February 29", producing a duplicate in
# the Covid-19 data.
cor_merge_policy <- cor_merge_policy %>%
slice(-which(cor_merge_policy$country_id=="AZE" &
cor_merge_policy$date=="2020-02-29")[1])
write.csv(cor_merge_policy,
file = "saved/acaps_df.csv",
row.names = FALSE)
# No correspondence could be found for code "JPG11668"
# This could now be merged to the main DF
rm(resp, cor_merge, cor_merge_policy, distancing, lockdown)
Population median age in 2013, obtained from the World Health Organization’s Global Health Observatory (https://apps.who.int/gho/data/view.main.POP2040). Based on the Terms and Conditions of use, the data can be redistributed.
med_age <- read_csv('raw/median_age_who.csv') %>%
rename(country = 1, med_age_2013 = 2, temp = 3) %>%
dplyr::select(-3) %>%
slice(-1) %>%
mutate(country = recode(country,
`Bolivia (Plurinational State of)` = 'Bolivia',
`Congo` = 'Congo, Rep.',
`Czechia` = 'Czech Republic',
`Democratic Republic of the Congo` = 'Congo, Dem. Rep.',
`United States of America` = 'United States',
`United Kingdom of Great Britain and Northern Ireland` = 'United Kingdom',
`Egypt` = 'Egypt, Arab Rep.',
`Iran (Islamic Republic of)` = 'Iran, Islamic Rep.',
`Venezuela (Bolivarian Republic of)` = 'Venezuela, RB',
`Republic of Korea` = 'Korea, Rep.',
`Democratic People's Republic of Korea` = 'Korea, North',
`Slovakia` = 'Slovak Republic',
`Kyrgyzstan` = 'Kyrgyz Republic',
`Viet Nam` = 'Vietnam',
`Gambia` = 'Gambia, The',
`United Republic of Tanzania` = 'Tanzania'))
# Create 3-letter code for country
med_age <- med_age %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c"),
country_id = if_else(country=="Eswatini", "SWZ", country_id)) %>%
dplyr::select(-country)
write.csv(med_age,
file = "saved/median_age_df.csv",
row.names = FALSE)
rm(med_age)
The GHSI data has been produced by the Nuclear Threat Initiative (NTI) and the Johns Hopkins Center for Health Security (JHU), in partnership with The Economist Intelligence Unit (EIU). The data was obtained from the following address: https://www.ghsindex.org/wp-content/uploads/2019/10/Global-Health-Security-Index-2019-Final-October-2019.zip. The NTI and the JHU-CHS are the legal owners of the data; they have granted permission to the research team to redistribute select indicators from the data on October 22, 2020.
Due to difficulties in reading data from the .xlsm
data format, specific sub-components of the GHSI were made available here in .CSV
format: preparedness, prevention, capacity for pandemic detection and reporting, capacity for response, along with a health index.
ghi <- read.csv('./raw/global health security index raw.csv',
header = TRUE) %>%
t() %>%
data.frame(stringsAsFactors = F)
colnames(ghi) <- ghi[1, ]
ghi$country <- row.names(ghi)
# Remove top 3 rows
ghi <- ghi %>%
slice(4:n())
## Select some vars
keep <- c('6.5.2b', '6.5.3a', '6.2.1a', '4.1.2a',
'4.1.1a', '6.5.1b', 'BG4', 'country')
keep_labels <- c('access_sanitation', 'health_exp_capita',
'adult_lit_rate', 'hosp_beds_capita',
'doctors_capita','healthcare_qual_index',
'hdi')
# Have to recode health expenditure per capita, since the
# thousands separator in the numbers are leading to NAs being
# created.
ghi <- ghi %>%
dplyr::select(matches(paste0(keep, collapse = '|'))) %>%
dplyr::select(-matches('field epi|airlines|capacity|country vulner')) %>%
rename(doctors_pc = 1,
hosp_beds_pc = 2,
literacy_rate = 3,
healthcare_qual = 4,
acc_sanitation = 5,
health_exp_pc = 6,
hdi = 7) %>%
mutate_at(vars(-country), removeFUN) %>%
mutate_at(vars(-country), replaceFUN) %>%
mutate_at(vars(-country), as.numeric) %>%
filter(!is.na(country))
## Recode some countries
ghi <- ghi %>%
mutate(country = recode(country,
`Congo (Democratic Republic)` = 'Congo, Dem. Rep.',
`South Korea` = 'Korea, Rep.',
`Syria` = 'Syrian Arab Republic',
`Venezuela` = 'Venezuela, RB',
`Slovakia` = 'Slovak Republic',
`Egypt` = 'Egypt, Arab Rep.',
`Iran` = 'Iran, Islamic Rep.',
`Russia` = 'Russian Federation',
`Gambia` = 'Gambia, The',
`Congo (Brazzaville)` = 'Congo, Rep.'))
ghi$country[grep("Ivoire", ghi$country)] <- "Cote d'Ivoire"
ghi$country[grep("ncipe", ghi$country)] <- "Sao Tome and Principe"
ghi <- ghi %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c"),
country_id = if_else(country=="Micronesia", "FSM", country_id)) %>%
dplyr::select(-country)
write.csv(ghi,
file = "saved/ghsi_df.csv",
row.names = FALSE)
rm(ghi)
panprep <- read_csv('raw/pandemic_preparedness.csv',
col_names = F) %>%
rename(country = 2, pandemic_prep = 3) %>%
dplyr::select(-1) %>%
mutate(country = recode(country,
"Congo Democratic Republic" = "Congo, Dem. Rep.",
"South Korea" = "Korea, Rep.",
"Syria" = "Syrian Arab Republic",
"Venezuela" = "Venezuela, RB",
"Slovakia" = "Slovak Republic",
"Egypt" = "Egypt, Arab Rep.",
"Iran" = "Iran, Islamic Rep.",
"Russia" = "Russian Federation",
"Gambia" = "Gambia, The",
"Congo Brazzaville" = "Congo, Rep."))
panprep$country[grep("Ivoire",panprep$country)] <- "Cote d'Ivoire"
panprep$country[grep("ncipe", panprep$country)] <- "Sao Tome and Principe"
panprep <- panprep %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
mutate(country_id = if_else(country=="Micronesia", "FSM", country_id)) %>%
dplyr::select(-country)
write.csv(panprep,
file = "saved/pandemic_prep_df.csv",
row.names = FALSE)
rm(panprep)
prevent_df <- read_csv('raw/prevent_GHSI.csv') %>%
select(country, prevent_index) %>%
mutate(country = recode(country,
`South Korea` = 'Korea, Rep.',
`Congo (Democratic Republic)` = 'Congo, Dem. Rep.',
`Congo (Brazzaville)` = 'Congo, Rep.',
`Egypt` = 'Egypt, Arab Rep.',
`Iran` = 'Iran, Islamic Rep.',
`Russia` = 'Russian Federation',
`Venezuela` = 'Venezuela, RB',
`Syria` = 'Syrian Arab Republic',
`Laos` = 'Lao PDR',
`Gambia` = 'Gambia, The',
`Slovakia` = 'Slovak Republic',
`eSwatini (Swaziland)` = 'Eswatini'
))
prevent_df$country[grep("Ivoire", prevent_df$country)] <- "Cote d'Ivoire"
prevent_df$country[grep("ncipe", prevent_df$country)] <- "Sao Tome and Principe"
# Switch to 3-letter country code
prevent_df <- prevent_df %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
mutate(country_id = if_else(country=="Eswatini", "SWZ", country_id),
country_id = if_else(country=="Micronesia", "FSM", country_id)) %>%
dplyr::select(-country)
write.csv(prevent_df,
"saved/ghsi_prevent_df.csv",
row.names = FALSE)
rm(prevent_df)
detect_df <- read_csv('raw/detect_report_GHSI.csv') %>%
mutate(country = recode(country,
`South Korea` = 'Korea, Rep.',
`Congo (Democratic Republic)` = 'Congo, Dem. Rep.',
`Congo (Brazzaville)` = 'Congo, Rep.',
`Egypt` = 'Egypt, Arab Rep.',
`Iran` = 'Iran, Islamic Rep.',
`Russia` = 'Russian Federation',
`Venezuela` = 'Venezuela, RB',
`Syria` = 'Syrian Arab Republic',
`Laos` = 'Lao PDR',
`Gambia` = 'Gambia, The',
`Slovakia` = 'Slovak Republic',
`eSwatini (Swaziland)` = 'Eswatini'
))
detect_df$country[grep("Ivoire", detect_df$country)] <- "Cote d'Ivoire"
detect_df$country[grep("ncipe", detect_df$country)] <- "Sao Tome and Principe"
# Switch to 3-letter code for country
detect_df <- detect_df %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
mutate(country_id = if_else(country=="Micronesia", "FSM", country_id)) %>%
dplyr::select(-country) %>%
rename(detect_index = detect_report_index)
write.csv(detect_df,
file = "saved/ghsi_detect_df.csv",
row.names = FALSE)
rm(detect_df)
respond_df <- read_csv('raw/respond_GHSI.csv') %>%
mutate(country = recode(country,
`South Korea` = 'Korea, Rep.',
`Congo (Democratic Republic)` = 'Congo, Dem. Rep.',
`Congo (Brazzaville)` = 'Congo, Rep.',
`Egypt` = 'Egypt, Arab Rep.',
`Iran` = 'Iran, Islamic Rep.',
`Russia` = 'Russian Federation',
`Venezuela` = 'Venezuela, RB',
`Syria` = 'Syrian Arab Republic',
`Laos` = 'Lao PDR',
`Gambia` = 'Gambia, The' ,
`Slovakia` = 'Slovak Republic',
`eSwatini (Swaziland)` = 'Eswatini'
))
respond_df$country[grep("Ivoire", respond_df$country)] <- "Cote d'Ivoire"
respond_df$country[grep("ncipe", respond_df$country)] <- "Sao Tome and Principe"
# Switch to 3-letter code for country
respond_df <- respond_df %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
mutate(country_id = if_else(country=="Micronesia", "FSM", country_id)) %>%
dplyr::select(-country)
write.csv(respond_df,
file = "saved/ghsi_respond_df.csv",
row.names = FALSE)
rm(respond_df)
health_df <- read_csv('raw/health_GHSI.csv') %>%
mutate(country = recode(country,
`South Korea` = 'Korea, Rep.',
`Congo (Democratic Republic)` = 'Congo, Dem. Rep.',
`Congo (Brazzaville)` = 'Congo, Rep.',
`Egypt` = 'Egypt, Arab Rep.',
`Iran` = 'Iran, Islamic Rep.',
`Russia` = 'Russian Federation',
`Venezuela` = 'Venezuela, RB',
`Syria` = 'Syrian Arab Republic',
`Laos` = 'Lao PDR',
`Gambia` = 'Gambia, The',
`Slovakia` = 'Slovak Republic',
`eSwatini (Swaziland)` = 'Eswatini'
))
health_df$country[grep("Ivoire", health_df$country)] <- "Cote d'Ivoire"
health_df$country[grep("ncipe", health_df$country)] <- "Sao Tome and Principe"
# Switch to 3-letter code for country
health_df <- health_df %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
mutate(country_id = if_else(country=="Micronesia", "FSM", country_id)) %>%
dplyr::select(-country)
write.csv(health_df,
file = "saved/ghsi_health_df.csv",
row.names = FALSE)
rm(health_df)
Data comes from the International Labor Organization, but there is considerable variation in the moment at which the information was collected: ranging from 2001 for Haiti, to 2011 for Canada or Chile.
url <- 'http://www.social-protection.org/gimi/gess/RessourceDownload.action?ressource.ressourceId=37218'
download.file(url,
destfile = "./raw/ILO_raw_health_coverage.xlsx",
mode="wb")
rm(url)
coverageDF <- read_excel(path = "./raw/ILO_raw_health_coverage.xlsx",
col_names = TRUE,
sheet = 1,
skip = 1)
coverageDF <- coverageDF %>%
slice(2:n()) %>%
dplyr::select(`Major area, region or country`, `Total coverage`, `Year...6`) %>%
rename(country = 1,
coverage = 2,
year = 3) %>%
mutate(country = gsub('[[:digit:]]+', '', country),
country = str_trim(country),
coverage = if_else(coverage=="…", NA_character_, coverage),
coverage = if_else(coverage=="...", NA_character_, coverage),
coverage = if_else(country %in% c("Malaysia", "Brunei Darussalam",
"Hong Kong Special Administrative Region"),
"100", coverage),
coverage = as.numeric(coverage)) %>%
filter(!(country %in% c("Africa","Americas","Asia","Europe",
"Oceania and the pacific", "Serbia and Montenegro"))) %>%
mutate(country = recode(country,
`Andora` = "Andorra")) %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
dplyr::select(-country, -year) %>%
rename(share_health_ins = coverage) %>%
as.data.frame()
write.csv(coverageDF,
file = "saved/share_health_ins.csv",
row.names = FALSE)
rm(coverageDF)
# Cleanup
file.remove("./raw/ILO_raw_health_coverage.xlsx")
The data is sourced from the Center for Systemic Peace.
download.file("http://www.systemicpeace.org/inscr/SFIv2018.xls",
"./raw/State_Fragility_2018.xls",
mode = "wb",
quiet = TRUE)
fragility_index <- read_excel(path = "./raw/State_Fragility_2018.xls",
col_names = TRUE) %>%
filter(year==2018) %>%
mutate(country = recode(country,
"Korea, South" = "Korea, Rep.",
"Dem. Rep. of Congo" = "Congo, Dem. Rep.",
"Congo-Brazzaville" = "Congo, Rep.",
"Egypt" = "Egypt, Arab Rep.",
"Iran" = "Iran, Islamic Rep.",
"Russia" = "Russian Federation",
"Venezuela" = "Venezuela, RB",
"Syria" = "Syrian Arab Republic",
"Kyrgyzstan" = "Kyrgyz Republic",
"Laos" = "Lao PDR",
"Macedonia" = "North Macedonia",
"Timor Leste" = "Timor-Leste",
"Sudan (North)" = "Sudan",
"Gambia" = "Gambia, The"))
# Switch to 3-letter country code
fragility_index <- fragility_index %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c"),
country_id = if_else(country=="Kosovo", "RKS", country_id)) %>%
rename(state_fragility = sfi) %>%
dplyr::select(-c(country))
write.csv(fragility_index,
file = "saved/state_fragility_df.csv",
row.names = FALSE)
rm(fragility_index)
# Cleanup
file.remove("./raw/State_Fragility_2018.xls")
[1] TRUE
The data is sourced from the International Conflict Research group, at ETH Zurich.
eprCoreDF <- read.csv(url("https://icr.ethz.ch/data/epr/core/EPR-2019.csv"),
header = TRUE)
# Basic cleanup
eprCoreDF <- eprCoreDF %>%
dplyr::select(gwid, statename, to, group, size, status) %>%
rename(country = statename,
year = to) %>%
group_by(country) %>%
filter(year==max(year)) %>%
dplyr::select(-year) %>%
ungroup()
# From this, we can easily compute a Herfindahl concentration formula
# Create a residual group that brings up population to 100%
elfDF <- eprCoreDF %>%
mutate(country = as.character(country),
group = as.character(group),
status = as.character(status)) %>%
group_by(gwid) %>%
do(add_row(., .after = 0)) %>%
ungroup() %>%
mutate(gwid = na.locf(gwid, fromLast = TRUE),
country = na.locf(country, fromLast = TRUE),
group = if_else(is.na(group), "Rest of country", group),
status = if_else(is.na(status), "ARTIFICIALLY ADDED", status)) %>%
group_by(country) %>%
mutate(size = if_else(is.na(size), 1-sum(size, na.rm = TRUE), size)) %>%
dplyr::select(-gwid) %>%
summarise(elf_epr = 1 - sum(size^2),
rq_polarization = 4*sum(size^2*(1-size)),
count_powerless = sum(status=="POWERLESS"),
share_powerless = sum(size[status=="POWERLESS"]))
# Do a further small recoding to get an ad-hoc index of inclusion.
elfDF <- elfDF %>%
filter(!(country %in% c("Germany Democratic Republic",
"Czechoslovakia",
"Republic of Vietnam",
"Serbia and Montenegro",
"Zanzibar",
"Yemen People's Republic"))) %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
mutate(country_id = if_else(country=="Kosovo", "RKS", country_id),
country_id = if_else(country=="Tibet", "TIB", country_id)) %>%
dplyr::select(-country)
write.csv(elfDF,
file = "./saved/eps_df.csv",
row.names = FALSE)
rm(elfDF)
The data provides information of the prevalence of respiratory diseases in the population, and it was retrieved from http://ghdx.healthdata.org/gbd-2017. According to the terms and conditions for the use of data (http://www.healthdata.org/about/terms-and-conditions) it can be redistributed.
gbd17 <- read_csv('raw/gdb_respiratory.csv') %>%
filter(measure_name == 'Prevalence') %>%
filter(str_detect(cause_name, 'respiratory')) %>%
filter(metric_name == 'Percent') %>%
group_by(location_name) %>%
summarise(val = sum(val, na.rm = T)) %>%
mutate(val = val * 100) %>%
rename(country = location_name, resp_disease_prev = val) %>%
mutate(country = recode(country,
"Democratic Republic of the Congo" = "Congo, Dem. Rep.",
"South Korea" = "Korea, Rep.",
"Syria" = "Syrian Arab Republic",
"Venezuela" = "Venezuela, RB",
"Slovakia" = "Slovak Republic",
"Egypt" = "Egypt, Arab Rep.",
"Iran" = "Iran, Islamic Rep.",
"Russia" = "Russian Federation",
"The Gambia" = "Gambia, The",
"Congo" = "Congo, Rep.",
"Kyrgyzstan" = "Kyrgyz Republic",
"Macedonia" = "North Macedonia"))
# Switch to 3-letter country code
gbd17 <- gbd17 %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
dplyr::select(-country)
write.csv(gbd17,
file = "saved/respiratory_disease_df.csv",
row.names = FALSE)
rm(gbd17)
The information about MERS infections is obtained from the following source. The data is compiled by the World Health Organization’s MERS Global Summary and Assessment of Risk.
#### SARS ####
webpage <- read_html("https://www.who.int/csr/sars/country/table2004_04_21/en/")
# See how many tables are in the page
tbls <- html_nodes(webpage, "table")
head(tbls)
rm(tbls)
# We only need the first table there
tbls_ls <- webpage %>%
html_nodes("table") %>%
.[1] %>%
html_table(fill = TRUE)
# Table needs some cleaning
sarsDF <- as.data.frame(tbls_ls[[1]]); rm(tbls_ls)
sarsDF <- sarsDF %>%
slice(-(1:2)) %>%
dplyr::select(X1, X4, X6) %>%
rename(country = X1,
infections = X4,
deaths = X6) %>%
mutate(infections = str_remove(infections, fixed("^b")),
infections = str_remove(infections, fixed("^c")),
infections = as.numeric(infections),
deaths = as.numeric(deaths)) %>%
slice(1:(n()-2))
rm(webpage)
write.csv(sarsDF,
file = "./raw/sars_df.csv",
row.names = FALSE)
#### MERS ####
# Have to read the "raw" data directly from the hard drive
# Information on deaths is only found on Wikipedia, but this
# was considered unreliable
mersDF <- read.csv(file = "./raw/WHO-MERS.csv",
header = TRUE)
#### Ebola ####
ebolaDF <- read.csv(url("https://data.humdata.org/dataset/0d089fa0-3567-4b01-9c03-39d340ff34e3/resource/c59b5722-ca4b-41ca-a446-472d6d824d01/download/ebola_data_db_format.csv"),
header = TRUE)
ebolaDF <- ebolaDF %>%
mutate(Indicator = as.character(Indicator),
Country = as.character(Country),
Date = as.character(Date),
value = as.numeric(value)) %>%
filter(Indicator %in% c("Cumulative number of confirmed, probable and suspected Ebola cases",
"Cumulative number of confirmed Ebola cases",
"Cumulative number of confirmed, probable and suspected Ebola deaths",
"Cumulative number of confirmed Ebola deaths")) %>%
filter(Date=="2016-03-23")
# Clean up country names
ebolaDF <- ebolaDF %>%
dplyr::select(-Date) %>%
mutate(Country = recode(Country,
"Liberia 2" = "Liberia",
"Guinea 2" = "Guinea")) %>%
group_by(Country, Indicator) %>%
summarise(value = sum(value)) %>%
ungroup()
# Recoding
ebolaDF <- ebolaDF %>%
mutate(Indicator = recode(Indicator,
"Cumulative number of confirmed Ebola cases" = "confirmedInfected",
"Cumulative number of confirmed Ebola deaths" = "confirmedDeaths",
"Cumulative number of confirmed, probable and suspected Ebola cases" = "totalInfected",
"Cumulative number of confirmed, probable and suspected Ebola deaths" = "totalDeaths")) %>%
pivot_wider(id_cols = Country, names_from = Indicator, values_from = value) %>%
dplyr::select(Country, totalInfected, totalDeaths) %>%
rename(country = Country,
infections = totalInfected,
deaths = totalDeaths)
write.csv(ebolaDF,
file = "./raw/ebola_df.csv",
row.names = FALSE)
#### Merge the 3 data sets ####
mersDF <- mersDF %>%
mutate(country = as.character(country),
deaths = NA) %>%
rename(infections_mers = infections,
deathsMERS = deaths)
sarsDF <- sarsDF %>%
mutate(country = as.character(country)) %>%
rename(infections_sars = infections,
deathsSARS = deaths) %>%
mutate(country = recode(country,
"United States" = "United States of America"))
ebolaDF <- ebolaDF %>%
mutate(country = as.character(country)) %>%
rename(infections_ebola = infections,
deathsEbola = deaths)
masterDF <- data.frame(country = unique(c(mersDF$country,
sarsDF$country,
ebolaDF$country)))
# Merging
masterDF <- left_join(masterDF, mersDF, by = "country") %>%
left_join(sarsDF, by = "country") %>%
left_join(ebolaDF, by = "country")
# Harmonize country names
masterDF <- masterDF %>%
mutate(country = recode(country,
"Republic of Korea" = "South_Korea",
"Saudi Arabia" = "Saudi_Arabia",
"United Kingdom" = "United_Kingdom",
"United Arab Emirates" = "United_Arab_Emirates",
"United States of America" = "United_States_of_America",
"New Zealand" = "New_Zealand",
"Republic of Ireland" = "Ireland",
"Russian Federation" = "Russia",
"South Africa" = "South_Africa",
"Viet Nam" = "Vietnam",
"Sierra Leone" = "Sierra_Leone")) %>%
dplyr::select(country, infections_mers, infections_sars, infections_ebola) %>%
mutate(infection = 0,
infection = if_else(!is.na(infections_mers) & infections_mers>100, 1, infection),
infection = if_else(!is.na(infections_sars) & infections_sars>100, 1, infection),
infection = if_else(!is.na(infections_ebola) & infections_ebola>100, 1, infection)) %>%
# Switch to 3-letter country code
mutate(country = recode(country,
"South_Korea" = "South Korea",
"United_States_of_America" = "United States")) %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
dplyr::select(-country)
write.csv(masterDF,
file = "./saved/past_epidemics_df.csv",
row.names = FALSE)
rm(masterDF)
Data sourced from the Climatic Research Unit of the University of East Anglia.
#### TEMPERATURE DATA ####
# Measured in degrees Celsius
# Extract links from website, so as to be able to use them for data reading
page <- read_html("https://crudata.uea.ac.uk/cru/data/hrg/cru_ts_4.04/crucy.2004161557.v4.04/countries/tmp/")
linksVEC <- page %>%
html_nodes("a") %>%
html_attr("href")
linksVEC <- linksVEC[7:length(linksVEC)]
rm(page)
# Give full path for each link
linksVEC <- paste0("https://crudata.uea.ac.uk/cru/data/hrg/cru_ts_4.04/crucy.2004161557.v4.04/countries/tmp/",
linksVEC)
# Extract names of locations
namesVEC <- regmatches(linksVEC,
regexpr("2019.\\s*\\K.*?(?=\\s*.tmp)", linksVEC, perl=TRUE))
# Custom function to compute a rolling mean without the current
# observation
.roll_without <- function(x, n) {
# x is the vector to be transformed, while n is the count of
# rows that need averaged
require(zoo)
xmean <- rollmeanr(x, n+1)
xmean <- c(rep(NA, times = (length(x) - length(xmean))),
xmean)
xmeanright <- (xmean*(n+1) - x)/n
return(xmeanright)
}
.extract_weather <- function(x) {
# Produce list in which data sets are to be saved
tempLIST <- list()
for(i in 1:length(x)) {
download.file(x[i],
"./raw/weatherTemp.per",
mode = "wb",
quiet = TRUE)
per <- fread(file = "./raw/weatherTemp.per")
per <- per %>%
dplyr::select(YEAR:DEC) %>%
pivot_longer(cols = JAN:DEC,
names_to = "month",
values_to = "temp") %>%
filter(YEAR %in% c(2017,2018,2019)) %>%
add_row(YEAR = rep(2020, times = 12),
month = c("JAN","FEB","MAR","APR","MAY","JUN",
"JUL","AUG","SEP","OCT","NOV","DEC"),
temp = rep(0, times = 12)) %>%
group_by(month) %>%
mutate(temp = as.numeric(temp),
temp = if_else(temp==-999, NA_real_, temp),
temp_mean = .roll_without(temp, 3)) %>%
ungroup() %>%
filter(!is.na(temp_mean)) %>%
dplyr::select(-temp) %>%
add_row(YEAR = rep(2021, times = 12),
month = c("JAN","FEB","MAR","APR","MAY","JUN",
"JUL","AUG","SEP","OCT","NOV","DEC"),
temp_mean = .$temp_mean) %>% # Duplicates temperatures here
mutate(country = namesVEC[i])
# Compute cumulative mean, modulo 12
per$temp_cumul <- cumsum(per$temp_mean)/(1:length(per$temp_mean))
tempLIST[[i]] <- dplyr::select(per, -temp_mean)
}
bind_rows(tempLIST)
}
temperatureDF <- .extract_weather(linksVEC)
# Cleanup
file.remove("./raw/weatherTemp.per")
#### PRECIPITATION DATA ####
# Measured in mm/month
# Extract links from website, so as to be able to use them for data reading
page <- read_html("https://crudata.uea.ac.uk/cru/data/hrg/cru_ts_4.04/crucy.2004161557.v4.04/countries/pre/")
linksVEC <- page %>%
html_nodes("a") %>%
html_attr("href")
linksVEC <- linksVEC[7:length(linksVEC)]
rm(page)
# Give full path for each link
linksVEC <- paste0("https://crudata.uea.ac.uk/cru/data/hrg/cru_ts_4.04/crucy.2004161557.v4.04/countries/pre/",
linksVEC)
# Extract names of locations
namesVEC <- regmatches(linksVEC,
regexpr("2019.\\s*\\K.*?(?=\\s*.pre)", linksVEC, perl=TRUE))
precipitationDF <- .extract_weather(linksVEC)
# Cleanup
file.remove("./raw/weatherTemp.per")
# Change name of variable
precipitationDF <- precipitationDF %>%
rename(precip_cumul = temp_cumul)
#### MERGE AND CLEAN UP ####
weatherDF <- left_join(temperatureDF,
precipitationDF,
by = c("country","YEAR","month"))
rm(temperatureDF, precipitationDF)
write.csv(weatherDF,
file = "./raw/weather_data_East_Anglia.csv",
row.names = FALSE)
rm(weatherDF)
weatherDF <- read.csv(file = "./raw/weather_data_East_Anglia.csv",
header = TRUE)
# Biggest challenge are country names
weatherDF <- weatherDF %>%
filter(!(country=="all")) %>%
mutate(country = recode(country,
"Bosnia-Herzegovinia" = "Bosnia_and_Herzegovina",
"Brunei" = "Brunei_Darussalam",
"Cape_Verde_Isl" = "Cape_Verde",
"Central_African_Rep" = "Central_African_Republic",
"Curacao_Isl" = "Curacao",
"DR_Congo" = "Democratic_Republic_of_the_Congo",
"East_Timor" = "Timor_Leste",
"Faeroes" = "Faroe_Islands",
"Falkland_Isl" = "Falkland_Islands_(Malvinas)",
"Guinea-Bissau" = "Guinea_Bissau",
"Macedonia" = "North_Macedonia",
"Puerto_Rica" = "Puerto_Rico",
"Sao_Tome_+_Principe" = "Sao_Tome_and_Principe",
"St_Kitts_and_Nevis" = "Saint_Kitts_and_Nevis",
"St_Lucia" = "Saint_Lucia",
"St_Vincent" = "Saint_Vincent_and_the_Grenadines",
"Swaziland" = "Eswatini",
"Tanzania" = "United_Republic_of_Tanzania",
"USA" = "United_States_of_America",
"Virgin_Isl" = "United_States_Virgin_Islands",
"Palau_Isl" = "Palau"))%>%
rename(year = YEAR) %>%
mutate(month = recode(month,
"DEC" = "12",
"JAN" = "01",
"FEB" = "02",
"MAR" = "03",
"APR" = "04",
"MAY" = "05",
"JUN" = "06",
"JUL" = "07",
"AUG" = "08",
"SEP" = "09",
"OCT" = "10",
"NOV" = "11"),
year = as.character(year))
# Switch to a 3-letter country code
weatherDF <- weatherDF %>%
mutate(country = str_replace_all(country, "_", " ")) %>%
filter(!(country=="Bassas da India")) %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
mutate(country_id = if_else(country=="Eswatini", "SWZ", country_id),
country_id = if_else(country=="Guadalupe", "GLP", country_id),
country_id = if_else(country=="Kosovo", "RKS", country_id),
country_id = if_else(country=="Vanatu", "VUT", country_id)) %>%
dplyr::select(-country)
write.csv(weatherDF,
file = "saved/weather_df.csv",
row.names = FALSE)
rm(weatherDF)
The V-Dem data is produced by the V-Dem Institute, part of the Department of Political Science, University of Gothenburg, Sweden. We source from the V-Dem data indicators related to the media landscape, equality of the health system, accountability, the rule of law, the extent of societal polarization, as well as the partisanship of the government.
# Read in V-Dem data
vdemDF <- vdem
# Subset of V-Dem data relevant for our analysis
# "v2meaccess" is no longer offered in the data as an indicator
# We force a cutoff at 2019 to ensure that if the V-Dem data is
# updated we don't use more recent observations to predict Covid-19
# mortality in 2019.
vdemDF <- vdemDF %>%
group_by(country_name) %>%
filter(year<=2019) %>%
filter(year==max(year, na.rm = TRUE)) %>%
ungroup() %>%
dplyr::select(country_name, v2mecrit, v2meharjrn,
v2pehealth, v2xcl_prpty, v2cltrnslw,
v2x_pubcorr, v2exl_legitideolcr_1,
v2cacamps) %>%
mutate(country_id = countrycode(country_name,
origin = "country.name",
destination = "iso3c")) %>%
mutate(country_id = if_else(country_name=="Eswatini", "SWZ", country_id),
country_id = if_else(country_name=="Kosovo", "RKS", country_id)) %>%
dplyr::select(-country_name) %>%
filter(!is.na(country_id)) %>%
rename(
media_critical = v2mecrit,
journal_harass = v2meharjrn,
health_equality = v2pehealth,
property_rights = v2xcl_prpty,
transparent_law = v2cltrnslw,
bureaucracy_corrupt = v2x_pubcorr,
pos_gov_lr = v2exl_legitideolcr_1,
polar_rile = v2cacamps)
# Palestinian territories appearing multiple times: (1) British Mandate;
# (2) Gaza Strip; (3) West Bank
# Somalia appearing only once, and was kept in the data set
vdemDF <- filter(vdemDF, !(country_id %in% c("PSE")))
# Write file in saved folder to merge
write.csv(vdemDF,
file = "./saved/vdem_indexes_df.csv",
row.names = FALSE)
rm(vdemDF)
Data is obtained from the Manifesto Project (MARPOR). For downloading data, please first set an API key in text format, which will allow you to download the data through the manifestoR
package. To get this API key, create a personal account at the project web page: https://manifesto-project.wzb.eu/. After logging in, on your Profile page, you can generate a personalized API with the click of a button. Save this API in a text folder called manifesto_apikey.txt
in your working directory. To activate the code chunk below and download the MARPOR data set eval
to TRUE
.
Update [February 2021]: Due to the unavailability of this indicator for the majority of countries in our sample, we replaced it with a proxy from the V-DEM data: v2cacamps
- the extent to which society is polarized into antagonistic, political camps (e.g. reluctant to engage in friendly interactions). High values on this variable denote a more polarized environment.
manifestoR::mp_setapikey("manifesto_apikey.txt")
CMP <- mp_maindataset(version = "MPDS2019b",
download_format = NULL,
apikey = NULL)
# Select latest year from each country
cmpDF <- CMP %>%
mutate(elect.yr = as.numeric(str_sub(edate, start = 1, end = 4))) %>%
group_by(countryname) %>%
filter(elect.yr==max(elect.yr, na.rm = TRUE)) %>%
filter(elect.yr >= 2015) %>%
ungroup() %>%
dplyr::select(countryname, elect.yr, partyname, party,
per503,per504,per505,per403,per404,per412,
per401,per414,per601,per602,per603,per604,
per605,per104,per201,per203,per305,per402,
per407,per606,per103,per105,per106,per107,
per406,per413,per506,per701,per702,per202,
per604,per704,pervote)
rm(CMP)
# Load the South American version of the data set, and try to extract the same
# variables from it.
cmp_SA <- mp_maindataset(version = "MPDSSA2019b",
download_format = NULL,
south_america = TRUE,
apikey = NULL)
cmpsaDF <- cmp_SA %>%
mutate(elect.yr = as.numeric(str_sub(edate, start = 1, end = 4))) %>%
group_by(countryname) %>%
filter(elect.yr==max(elect.yr, na.rm = TRUE)) %>%
filter(elect.yr >= 2015) %>%
ungroup() %>%
dplyr::select(countryname, elect.yr, partyname, party,
pervote,per503,per504,per505,per403,per404,
per412,per401,per414,per601_1,per601_2,
per602_1,per602_2,per603,per604,per605_1,
per104,per201_1,per201_2,per203,per305_1,
per305_2,per305_3,per305_4,per305_5,per305_6,
per402,per407,per606_1,per606_2,per103_1,
per103_2,per105,per106,per107,per406,
per413,per506,per701,per702,per202_1,
per202_3,per202_4,per604,per704) %>%
mutate(per601 = per601_1 + per601_2,
per602 = per602_1 + per602_2,
per201 = per201_1 + per201_2,
per305 = per305_1 + per305_2 + per305_3 +
per305_4 + per305_5 + per305_6,
per606 = per606_1 + per606_2,
per103 = per103_1 + per103_2,
per202 = per202_1 + per202_3 + per202_4) %>%
select(-c(per601_1, per601_2, per602_1, per602_2,
per201_1, per201_2, per305_1, per305_2,
per305_3, per305_4, per305_5, per305_6,
per606_1, per606_2, per103_1, per103_2,
per202_1, per202_3, per202_4)) %>%
rename(per605 = per605_1)
rm(cmp_SA)
# Arrange columns to match those of the main CMP data set
cmpsaDF <- cmpsaDF[ ,names(cmpDF)]
cmpDF <- rbind(cmpDF, cmpsaDF); rm(cmpsaDF)
cmpDF <- cmpDF %>%
mutate_at(vars(per503:pervote), as.numeric)
# Examine which rows have a lot of NAs
as.data.frame(cmpDF[rowSums(is.na(cmpDF)) >= 5, 1:10])
# Remove the 8 parties listed, as some of them have not had
# their manifestos coded by the MARPOR project yet. A curious case is,
# for example "The River" (34340), which has a NA observation, as well
# as a second one, with full data availability
cmpDF <- cmpDF %>%
filter(!(countryname=="Denmark" & party==13410 & elect.yr==2015)) %>%
filter(!(countryname=="Iceland" & party==15630 & elect.yr==2017)) %>%
filter(!(countryname=="Greece" & party==34213 & elect.yr==2015)) %>%
filter(!(countryname=="Greece" & party==34340 & elect.yr==2015 & is.na(per503))) %>%
filter(!(countryname=="Greece" & party==34720 & elect.yr==2015 & is.na(per503))) %>%
filter(!(countryname=="Greece" & party==34730 & elect.yr==2015 & is.na(per503))) %>%
filter(!(countryname=="Croatia" & party==81022 & elect.yr==2016)) %>%
filter(!(countryname=="Slovakia" & party==96423 & elect.yr==2016))
# This follows the recommendation in Lowe et al. (2011), and is the
# same procedure used in Barth et al. (2015)
cmpDF <- cmpDF %>%
mutate(rileemph = log(per104 + 0.5) + log(per201 + 0.5) + log(per203 + 0.5) +
log(per305 + 0.5) + log(per401 + 0.5) + log(per402 + 0.5) + log(per407 + 0.5) +
log(per414 + 0.5) + log(per505 + 0.5) + log(per601 + 0.5) + log(per603 + 0.5) +
log(per605 + 0.5) + log(per606 + 0.5) + log(per702 + 0.5) - log(per103 + 0.5) -
log(per105 + 0.5) - log(per106 + 0.5) - log(per107 + 0.5) - log(per403 + 0.5) -
log(per404 + 0.5) - log(per406 + 0.5) - log(per412 + 0.5) - log(per413 + 0.5) -
log(per504 + 0.5) - log(per506 + 0.5) - log(per701 + 0.5) - log(per202 + 0.5) -
log(per604 + 0.5)) %>%
dplyr::select(countryname, elect.yr, partyname, party, pervote,
rileemph) %>%
rename(country = countryname,
year = elect.yr)
# Add missing vote shares
# http://electionresources.org/br/president.php?election=2018
cmpDF$pervote[cmpDF$country=="Brazil" & cmpDF$party==180230] <- 29.28
cmpDF$pervote[cmpDF$country=="Brazil" & cmpDF$party==180251] <- 12.47
cmpDF$pervote[cmpDF$country=="Brazil" & cmpDF$party==180310] <- 4.76
cmpDF$pervote[cmpDF$country=="Brazil" & cmpDF$party==180620] <- 46.03
# http://electionresources.org/cl/deputies.php?election=2017
cmpDF$pervote[cmpDF$country=="Chile" & cmpDF$party==155980] <- 1.75
# http://www.dik.co.me/izbori 2016/Konacni rezultati.pdf (only through WebArchive)
cmpDF$pervote[cmpDF$country=="Montenegro" & cmpDF$party==91610] <- 0.40
# Can't identify the second party with a missing vote share in Montenegro
# Have to adjust the vote share for one of the parties in North Macedonia
cmpDF$pervote[cmpDF$country=="North Macedonia" & cmpDF$party==89221] <- 37.87
# The other Macedonian party can't be identified
# The Spanish party with a missing vote share can't be identified
polarDF <- cmpDF %>%
na.omit() %>%
group_by(country) %>%
summarise(polar_rile = sum(((rileemph - mean(rileemph, na.rm = TRUE))^2)*(pervote/100)))
rm(cmpDF)
# Correct country names (to be used in merging)
polarDF <- polarDF %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
dplyr::select(-country)
write.csv(polarDF,
file = "./saved/polarization_df.csv",
row.names = FALSE)
rm(polarDF)
Due to restrictions on redistributing the World Values Surveys (WVS) and AfroBarometer (AB) data, we supply here only the code needed to start from the raw WVS and AB (individual-level data) and produce country-level summaries of inter-personal trust. For users who would like to generate their own country-level summaries, please follow the following steps:
R-data
versions of these files in the ./raw
subfolder, using the file names provided in the read-wvs-ab-data
chunk below.At this point, the code in the prepare-wvs-data
chunk below should work.
# Read WVS waves 1-6 merged data
wvsDF <- readRDS(file = "./raw/WVS_1981_2016_r_v20180912.rds")
wvs16sumDF <- wvsDF %>%
dplyr::select(S003, A165, S017, S020) %>%
rename(country = S003,
trust = A165,
weight = S017,
year = S020) %>%
mutate(trust = case_when(trust %in% c(-5,-2,-1) ~ NA_real_,
trust==1 ~ 1,
trust==2 ~ 0)) %>%
na.omit() %>%
filter(year>=2009) %>%
group_by(country, year) %>%
summarise(trust_people = weighted.mean(trust, weight)*100) %>%
ungroup() %>%
mutate(country_id = countrycode(country,
origin = "wvs",
destination = "iso3c")) %>%
dplyr::select(-c(country)) %>%
dplyr::select(trust_people, country_id, year)
rm(wvsDF)
# Read AfroBarometer wave 5 merged data
abDF <- read.spss(file = "./raw/merged_r5_data_Jul2015.sav",
to.data.frame = TRUE,
use.value.lables = TRUE)
absumDF <- abDF %>%
dplyr::select(COUNTRY, Q87, withinwt, DATEINTR) %>%
mutate(COUNTRY = as.character(COUNTRY),
Q87 = as.character(Q87),
COUNTRY = if_else(COUNTRY=="Cote d’Ivoire", "Cote d'Ivoire", COUNTRY)) %>%
mutate(fieldwork = as.POSIXct(DATEINTR, origin="1582-10-14", tz="GMT"),
year = format(fieldwork, "%Y")) %>%
dplyr::select(-c(fieldwork, DATEINTR)) %>%
rename(country = COUNTRY,
trust = Q87,
weight = withinwt) %>%
mutate(trust = case_when(trust %in% c("Don't know","Missing") ~ NA_real_,
trust=="Must be very careful" ~ 0,
trust=="Most people can be trusted" ~ 1),
trust = as.numeric(trust)) %>%
na.omit() %>%
group_by(country) %>%
summarise(trust_people = weighted.mean(trust, weight)*100,
year = .mode(year)) %>%
ungroup() %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
dplyr::select(-country) %>%
dplyr::select(trust_people, country_id, year)
rm(abDF)
# Read WVS wave 7 merged data
load("./raw/WVS_Cross-National_Wave_7_R_v1_4.rdata")
wvs7sumDF <- `WVS_Cross-National_Wave_7_R_v1_4` %>%
dplyr::select(C_COW_NUM, Q57, W_WEIGHT, A_YEAR) %>%
rename(country = C_COW_NUM,
trust = Q57,
weight = W_WEIGHT,
year = A_YEAR) %>%
mutate(trust = case_when(trust %in% c(-5,-4,-2,-1) ~ NA_real_,
trust==1 ~ 1,
trust==2 ~ 0)) %>%
na.omit() %>%
group_by(country, year) %>%
summarise(trust_people = weighted.mean(trust, weight)*100) %>%
ungroup() %>%
mutate(country_id = countrycode(country,
origin = "cown",
destination = "iso3c")) %>%
mutate(country_id = if_else(country==6, "PRI", country_id),
country_id = if_else(country==446, "MCU", country_id),
country_id = if_else(country==714, "HKG", country_id),
country_id = if_else(country==345, "SRB", country_id)) %>%
dplyr::select(-c(country)) %>%
dplyr::select(trust_people, country_id, year)
rm(`WVS_Cross-National_Wave_7_R_v1_4`)
trust1DF <- rbind(wvs16sumDF, wvs7sumDF, absumDF)
rm(wvs16sumDF, wvs7sumDF, absumDF)
# For each country, select the most recent version of the data. Two
# surveys are available for Ghana in 2012. Because of this, we
# averaged the two values.
trust1DF <- trust1DF %>%
group_by(country_id) %>%
filter(year==max(year)) %>%
summarise(trust_people = mean(trust_people)) %>%
ungroup() %>%
as.data.frame()
# Add a few missing values from the "Our World in Data" data set
# https://ourworldindata.org/grapher/self-reported-trust-attitudes?tab=table&time=2014
trust1DF[nrow(trust1DF) + 1,] <- c("BHR", 33.50) # 2014
trust1DF[nrow(trust1DF) + 1,] <- c("BGR", 19.44) # 2009
trust1DF[nrow(trust1DF) + 1,] <- c("CAN", 41.11) # 2009
trust1DF[nrow(trust1DF) + 1,] <- c("FIN", 57.99) # 2009
trust1DF[nrow(trust1DF) + 1,] <- c("FRA", 18.66) # 2009
trust1DF[nrow(trust1DF) + 1,] <- c("ITA", 28.25) # 2009
trust1DF[nrow(trust1DF) + 1,] <- c("MDA", 17.59) # 2009
trust1DF[nrow(trust1DF) + 1,] <- c("NOR", 73.73) # 2009
trust1DF[nrow(trust1DF) + 1,] <- c("CHE", 49.43) # 2009
trust1DF[nrow(trust1DF) + 1,] <- c("GBR", 29.96) # 2009
write.csv(trust1DF,
file = "./saved/trust_people_df.csv",
row.names = FALSE)
rm(trust1DF)
Due to similar restrictions on redistributing the WVS data, we follow the same strategy as in the previous section. For this indicator, please also download the Latin Barometer merged 2018 wave, version from March 3, 2019, and place the .rds
version of the file in the ./raw
subfolder.
# Read WVS waves 1-6 merged data
wvsDF <- readRDS(file = "./raw/WVS_1981_2016_r_v20180912.rds")
# The item (E069_11) gives respondents a 4-point scale for self-rating their
# confidence in the government: 1=a great deal; 2=quite a lot; 3=not very
# much; 4=none at all.
# The variable we use in the models represents the share of respondents that
# reported trusting the government "a great deal" or "quite a lot"
wvs16sumDF <- wvsDF %>%
dplyr::select(S003, E069_11, S017, S020) %>%
rename(country = S003,
trust_gov = E069_11,
weight = S017,
year = S020) %>%
mutate(trust_gov = as.numeric(trust_gov),
trust_gov = case_when(trust_gov <= -1 ~ NA_real_,
trust_gov %in% c(3,4) ~ 0,
trust_gov %in% c(1,2) ~ 1)) %>%
na.omit() %>%
filter(year>=2009) %>%
group_by(country, year) %>%
summarise(trust_gov = weighted.mean(trust_gov, weight)*100) %>%
ungroup() %>%
mutate(country_id = countrycode(country,
origin = "wvs",
destination = "iso3c")) %>%
dplyr::select(-country) %>%
dplyr::select(trust_gov, country_id, year)
rm(wvsDF)
# Read WVS wave 7 merged data
load("./raw/WVS_Cross-National_Wave_7_R_v1_4.rdata")
wvs7sumDF <- `WVS_Cross-National_Wave_7_R_v1_4` %>%
dplyr::select(C_COW_NUM, Q71, W_WEIGHT, A_YEAR) %>%
rename(country = C_COW_NUM,
trust_gov = Q71,
weight = W_WEIGHT,
year = A_YEAR) %>%
mutate(trust_gov = as.numeric(trust_gov),
trust_gov = case_when(trust_gov <= -1 ~ NA_real_,
trust_gov %in% c(3,4) ~ 0,
trust_gov %in% c(1,2) ~ 1)) %>%
na.omit() %>%
group_by(country, year) %>%
summarise(trust_gov = weighted.mean(trust_gov, weight)*100) %>%
ungroup() %>%
mutate(country_id = countrycode(country,
origin = "cown",
destination = "iso3c")) %>%
mutate(country_id = if_else(country==6, "PRI", country_id),
country_id = if_else(country==446, "MCU", country_id),
country_id = if_else(country==714, "HKG", country_id),
country_id = if_else(country==345, "SRB", country_id)) %>%
dplyr::select(-c(country)) %>%
dplyr::select(trust_gov, country_id, year)
rm(`WVS_Cross-National_Wave_7_R_v1_4`)
# Add data from the most recent version of the Latin Barometer (2018)
latbaroDF <- readRDS(file = "./raw/Latinobarometro_2018_Esp_R_v20190303.rds")
lbsummDF <- latbaroDF %>%
dplyr::select(IDENPA, P15STGBSC.E, WT) %>%
rename(ccode = IDENPA,
trust_gov = P15STGBSC.E,
weight = WT) %>%
mutate(trust_gov = as.numeric(trust_gov),
trust_gov = case_when(trust_gov %in% c(-1,-2) ~ NA_real_,
trust_gov %in% c(3,4) ~ 0,
trust_gov %in% c(1,2) ~ 1)) %>%
na.omit() %>%
group_by(ccode) %>%
summarise(trust_gov = weighted.mean(trust_gov,
weight,
na.rm = TRUE)*100) %>%
mutate(year = 2018) %>%
ungroup() %>%
mutate(country_id = countrycode(ccode,
origin = "wvs",
destination = "iso3c")) %>%
dplyr::select(-ccode) %>%
dplyr::select(trust_gov, country_id, year)
rm(latbaroDF)
trust2DF <- rbind(wvs16sumDF, wvs7sumDF, lbsummDF)
rm(wvs16sumDF, wvs7sumDF, lbsummDF)
# For each country, select the most recent version of the data. Duplicate
# surveys can be found for Brazil, Chile, Colombia, Ecuador, Mexico, Peru.
# Because of this, we averaged the two values.
trust2DF <- trust2DF %>%
group_by(country_id) %>%
filter(year==max(year)) %>%
summarise(trust_gov = mean(trust_gov)) %>%
ungroup() %>%
as.data.frame()
write.csv(trust2DF,
"./saved/trust_gov_df.csv",
row.names = FALSE)
rm(trust2DF)
The data is produced by Jordan Kyle and Brett Meyer of the Tony Blair Institute for Global Change: https://institute.global/policy/high-tide-populism-power-1990-2020.
# Populism in power
# https://institute.global/policy/high-tide-populism-power-1990-2020
populism_df <- WDI::WDI_data$country %>%
data.frame() %>%
select(iso3c, country) %>%
mutate(electoral_pop = 1*(iso3c %in% c("USA", "MEX", "NIC", "VEN",
"BRA", "POL", "HUN", "CZE",
"ITA", "SRB", "BGR", "BLR",
"TUR", "ISR", "IND", "LKA",
"PHL"))) %>%
dplyr::select(iso3c, electoral_pop) %>%
rename(country_id = iso3c) %>%
mutate(country_id = if_else(country_id=="XKX", "RKS", country_id))
nrow(populism_df[populism_df$electoral_pop==1,]) == 17
[1] TRUE
write.csv(populism_df,
file = "./saved/populism_df.csv",
row.names = FALSE)
rm(populism_df)
Information is obtained from the Database of Political Institutions, compiled by Carlos Scartascini, Cesi Cruz, and Philip Keefer, and currently hosted by the Inter-American Development Bank: https://publications.iadb.org/en/database-political-institutions-2017-dpi2017.
download.file("https://publications.iadb.org/publications/english/document/The-Database-of-Political-Institutions-2017-(DPI2017).zip",
"./raw/The-Database-of-Political-Institutions-2017-(DPI2017).zip",
mode = "wb",
quiet = TRUE)
unzip(zipfile = "./raw/The-Database-of-Political-Institutions-2017-(DPI2017).zip",
files = "Database DPI2017/DPI2017.dta",
exdir="./raw")
file.remove("./raw/The-Database-of-Political-Institutions-2017-(DPI2017).zip")
dpiDF <- readstata13::read.dta13(file = "./raw/Database DPI2017/DPI2017.dta",
convert.factors = FALSE)
dpiDF <- dpiDF %>%
dplyr::select(countryname, year, auton, muni,
state, author, stconst, checks,
polariz, pr) %>%
rename(checksVeto = checks,
polarizVeto = polariz) %>%
filter(!is.na(year)) %>%
group_by(countryname) %>%
filter(year == max(year, na.rm = TRUE)) %>%
ungroup()
dpiDF <- dpiDF %>%
mutate_at(vars(auton:stconst),
funs(recode(., `0`=0, `1`=1, `2`=2, .default=NA_real_))) %>%
mutate(auton.std = .std_fun(auton),
muni.std = .std_fun(muni),
state.std = .std_fun(state),
author.std = .std_fun(author),
senate.std = .std_fun(stconst)) %>%
dplyr::select(-c(auton, muni, state, author, stconst))
# The last 2 items have 114 and 115 missing values out of 181 countries.
# Select only countries which have 3 valid observations for the measurements
dpiDF$missno <- apply(dpiDF[ ,6:10], 1, function(x) sum(is.na(x)))
fedDF <- dpiDF %>%
filter(missno <= 2) %>%
mutate(federal_ind = rowMeans(dplyr::select(.,ends_with(".std")),
na.rm = TRUE)) %>%
filter(year > 2015) %>%
dplyr::select(-c(year, pr:missno))
# Generate country code and clean country names
fedDF <- fedDF %>%
filter(!(countryname %in% c("Turk Cyprus","GDR","Yemen (PDR)",
"Yugoslavia","Soviet Union","Yemen (AR)"))) %>%
rename(country = countryname) %>%
mutate(country = recode(country,
"Cent. Af. Rep." = "Central African Republic",
"PRC" = "China",
"Dom. Rep." = "Dominican Republic",
"ROK" = "Republic of Korea",
"PRK" = "North Korea",
"S. Africa" = "South Africa",
"P. N. Guinea" = "Papua New Guinea")) %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
dplyr::select(-c(country, checksVeto, polarizVeto))
write.csv(fedDF,
file = "./saved/federalism_df.csv",
row.names = FALSE)
rm(fedDF)
# Follow the same procedure for checks and balances indicators
checksDF <- dpiDF %>%
dplyr::select(countryname, checksVeto, polarizVeto, year) %>%
filter(year > 2015) %>%
dplyr::select(-year)
# Generate country code and clean country names
checksDF <- checksDF %>%
filter(!(countryname %in% c("Turk Cyprus","GDR","Yemen (PDR)",
"Yugoslavia","Soviet Union","Yemen (AR)"))) %>%
rename(country = countryname) %>%
mutate(country = recode(country,
"Cent. Af. Rep." = "Central African Republic",
"PRC" = "China",
"Dom. Rep." = "Dominican Republic",
"ROK" = "Republic of Korea",
"PRK" = "North Korea",
"S. Africa" = "South Africa",
"P. N. Guinea" = "Papua New Guinea")) %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
dplyr::select(-c(country)) %>%
rename(checks_veto = checksVeto,
polariz_veto = polarizVeto) %>%
mutate(checks_veto = ifelse(checks_veto == -999, NA, checks_veto),
polariz_veto = ifelse(polariz_veto == -999, NA, checks_veto))
write.csv(checksDF,
file = "./saved/dpi_checks_df.csv",
row.names = FALSE)
rm(checksDF)
prDF <- dpiDF %>%
dplyr::select(countryname, pr)
# Generate country code and clean country names
prDF <- prDF %>%
filter(!(countryname %in% c("Turk Cyprus","GDR","Yemen (PDR)",
"Yugoslavia","Soviet Union","Yemen (AR)"))) %>%
rename(country = countryname) %>%
mutate(country = recode(country,
"Cent. Af. Rep." = "Central African Republic",
"PRC" = "China",
"Dom. Rep." = "Dominican Republic",
"ROK" = "Republic of Korea",
"PRK" = "North Korea",
"S. Africa" = "South Africa",
"P. N. Guinea" = "Papua New Guinea")) %>%
mutate(pr = as.numeric(pr),
pr = if_else(pr==-999, NA_real_, pr)) %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
dplyr::select(-c(country))
write.csv(prDF,
file = "./saved/pr_df.csv",
row.names = FALSE)
rm(prDF)
# Clean up DPI 2017 data
unlink("./raw//Database DPI2017", recursive = TRUE)
# Loading Wikipedia URL containing worldwide upcoming election data
url <- "https://en.wikipedia.org/wiki/List_of_next_general_elections"
# Scrape continental election tables from Wikipedia and create single
# world election data frame
# Define scraping function
.scrape_table = function(tableindex, url){
tbls_ls <- url %>%
read_html() %>%
html_nodes("table") %>%
.[tableindex] %>%
html_table(fill = TRUE)
continent <- tbls_ls[[1]]
rm(tbls_ls)
colnames(continent) <- c("country","parlTerm","parlLast","parlNext",
"presTerm","presLast","presNext","fairness",
"inPower")
continent <- continent %>%
slice(-1) %>%
dplyr::select(country, parlLast, parlNext,
presLast, presNext)
return(continent)
}
## Call scraping function for Africa, Americas, Asia and Oceania
africaDF = .scrape_table(2, url)
americaDF = .scrape_table(3, url)
asiaDF = .scrape_table(4, url)
oceaniaDF = .scrape_table(6, url)
## Europe
tbls_ls <- url %>%
read_html() %>%
html_nodes("table") %>%
.[5] %>%
html_table(fill = TRUE)
europeDF <- tbls_ls[[1]]
rm(tbls_ls)
colnames(europeDF) <- c("country","parlTerm","parlLast","parlNext",
"presTerm","presLast","presNext","fairness",
"inPower", "status")
europeDF <- europeDF %>%
slice(-1) %>%
dplyr::select(country, parlLast, parlNext,
presLast, presNext)
# Create single world election data frame
worldDF <- rbind(africaDF, americaDF, asiaDF, europeDF, oceaniaDF)
# Cleanup
rm(africaDF, americaDF, asiaDF, europeDF, oceaniaDF)
# Create country codes and deal with missing values
worldDF <- worldDF %>%
filter(!(country=="European Union")) %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
mutate(country_id = if_else(country=="Kosovo", "RKS", country_id)) %>%
mutate_at(vars(parlLast:presNext), as.character) %>%
mutate(parlLast = if_else(parlLast %in% c("","N/A"), NA_character_, parlLast),
parlNext = if_else(parlNext %in% c("","N/A"), NA_character_, parlNext),
presLast = if_else(presLast %in% c("","N/A"), NA_character_, presLast),
presNext = if_else(presNext %in% c("","N/A"), NA_character_, presNext)) %>%
mutate(presNext = if_else(presNext=="2020[6][7]", "2020", presNext))
# Define function to split day, month and year for next and last
# parliament elections to clean dates
.split_dates = function(datecolumn){
resultSplitted = as.data.frame(matrix(-1, nrow= length(datecolumn), ncol = 4))
splitted = sapply(datecolumn, str_split, pattern = " ")
for(x in 1:length(datecolumn)){
if(length(splitted[[x]]) == 1){
resultSplitted[x, 3] = splitted[[x]][1]
}
if(length(splitted[[x]]) == 2){
resultSplitted[x, 2]= splitted[[x]][1]
resultSplitted[x, 3] = splitted[[x]][2]
}
if(length(splitted[[x]]) == 3){
resultSplitted[x, 1] = splitted[[x]][1]
resultSplitted[x, 2]= splitted[[x]][2]
resultSplitted[x, 3] = splitted[[x]][3]
}
resultSplitted[x, 4] = worldDF$country_id[x]
}
return(resultSplitted)
}
# Call function to split day, month and year for next and last parliament
# elections to clean dates
# PARLIMENTARY
# Create data frame for split next PARLIMENTARY election date
resultsplittedNext = .split_dates(worldDF$parlNext)
colnames(resultsplittedNext) <- c("DayNext","MonthNext",
"YearNext","country_id")
# Create data frame for split last PARLIMENTARY election date
resultsplittedLast = .split_dates(worldDF$parlLast)
colnames(resultsplittedLast) <- c("DayLast","MonthLast",
"YearLast","country_id")
# Merge data frames for split next and last PARLIMENTARY election
# date to prepare cleaning
wikiParlDF <- left_join(resultsplittedNext,
resultsplittedLast,
by = "country_id")
rm(resultsplittedLast,resultsplittedNext)
# PRESIDENTIAL
# Create data frame for split next PRESIDENTIAL election date
resultsplittedNext = .split_dates(worldDF$presNext)
colnames(resultsplittedNext) <- c("DayNext","MonthNext",
"YearNext","country_id")
# Create data frame for split last PRESIDENTIAL election date
resultsplittedLast = .split_dates(worldDF$presLast)
colnames(resultsplittedLast) <- c("DayLast","MonthLast",
"YearLast","country_id")
# Merge data frames for split next and last PRESIDENTIAL election
# date to prepare cleaning
wikiPresDF <- left_join(resultsplittedNext,
resultsplittedLast,
by = "country_id")
rm(resultsplittedLast, resultsplittedNext, worldDF)
# Deal with dates:
# (1) take day and month of last election date if both of these
# are missing
# (2) take day of last election if only day is missing
# (3) set day to the 15th if there is a mismatch between last
# election month and future election month (e.g. parliamentary
# elections in CAF)
# Solve missing month and day
# Correct manually a few cases (like Libya) where election timeline
# is not clear at all.
wikiParlDF <- wikiParlDF %>%
mutate(MonthNext = if_else(MonthNext==-1, MonthLast, MonthNext),
DayNext = if_else(DayNext==-1, DayLast, DayNext),
DayNext = if_else(MonthNext != MonthLast, "15", DayNext)) %>%
mutate(DayNext = if_else(is.na(YearNext), NA_character_, DayNext),
MonthNext = if_else(is.na(YearNext), NA_character_, MonthNext))
# Convert to dates
wikiParlDF <- wikiParlDF %>%
unite("nextWIKIparl", DayNext:YearNext, sep = "-") %>%
mutate(nextWIKIparl = if_else(nextWIKIparl=="NA-NA-NA",
NA_character_, nextWIKIparl),
nextWIKIparl = as.Date(nextWIKIparl, "%d-%B-%Y"))
# Follow the same procedure for presidential elections
wikiPresDF <- wikiPresDF %>%
mutate(MonthNext = if_else(MonthNext==-1, MonthLast, MonthNext),
DayNext = if_else(DayNext==-1, DayLast, DayNext),
DayNext = if_else(MonthNext != MonthLast, "15", DayNext)) %>%
mutate(DayNext = if_else(is.na(YearNext), NA_character_, DayNext),
MonthNext = if_else(is.na(YearNext), NA_character_, MonthNext))
# Convert to dates
wikiPresDF <- wikiPresDF %>%
unite("nextWIKIpres", DayNext:YearNext, sep = "-") %>%
mutate(nextWIKIpres = if_else(nextWIKIpres=="NA-NA-NA",
NA_character_, nextWIKIpres),
nextWIKIpres = as.Date(nextWIKIpres, "%d-%B-%Y"))
# Merge parliament and president election data frames into one wiki
# data frame
NextWikiElectionDF <- left_join(select(wikiPresDF, nextWIKIpres:country_id),
select(wikiParlDF, nextWIKIparl:country_id),
by = "country_id")
rm(wikiParlDF, wikiPresDF)
The data is produced by the International Foundation for Electoral Systems (IFES), made available on its website, and is shared here with its permission.
Disclaimer: While IFES strives to make the information on this website as timely and accurate as possible, IFES makes no claims nor guarantees about the accuracy and completeness of the data on this site beyond what is outlined in our verification process, and expressly disclaims liability for errors and omissions in the contents of this site.
# Read in IEFS data and construct contry codes
IEFS <- read.csv(file = "./raw/IEFS_election_dates_2020.csv")
IEFS <- IEFS %>%
mutate(country_id = countrycode(Country,
origin = "country.name",
destination = "iso3c"))
# Create separate columns for presidential, parliament and senate elections
# for confirmed election dates only
# Restrict analysis to confirmed election dates for data quality reasons and
# create separate df for presidential, parliamentary and senate elections
# Drop referendums.
# Drop Mali's 2nd parliamentary round, under the assumption that the period
# between the two rounds is too short for governments to be able to hope
# for a change in popularity.
IEFS <- IEFS %>%
filter(Status=="Confirmed") %>%
filter(Body != "Referendum") %>%
dplyr::select(-c(Status, Month)) %>%
mutate(Body = as.character(Body),
Body = if_else(Body=="President Round 2", "President", Body),
Body = if_else(Body=="Governor ", "President", Body)) %>%
filter(Body != "Parliament Round 2") %>%
filter(!(country_id=="KIR" & Date=="4/15/2020")) %>%
filter(Body != "NVD") %>%
pivot_wider(id_cols = c(Country, country_id),
names_from = Body,
values_from = Date)
# Bring dates into dates format
IEFS <- IEFS %>%
mutate(Parliament = as.Date(Parliament, "%m/%d/%Y"),
President = as.Date(President, "%m/%d/%Y"),
Senate = as.Date(Senate, "%m/%d/%Y"))
# The following filter is applied because the electoral pressure measure
# is constructed as the distance of the next election from the WHO
# "pandemic" declaration on March 11, 2020. Negative distance do not make
# sense and are excluded by only taking in days after March 11, 2020
parlDF <- IEFS %>%
filter(Parliament > "2020-03-11") %>%
dplyr::select(country_id, Parliament)
presDF <- IEFS %>%
filter(President > "2020-03-11") %>%
dplyr::select(country_id, President)
senateDF <- IEFS %>%
filter(Senate > "2020-03-11") %>%
dplyr::select(country_id, Senate)
# A tricky merge, since some elections are captured in the presidential
# data set, but not in the parliamentary one, and vice-versa
mergedNextElectionDF <- full_join(parlDF,
presDF,
by = "country_id")
# For second one it's fine to do "left_join()"
mergedNextElectionDF <- left_join(mergedNextElectionDF,
senateDF,
by = "country_id")
# Bring in Wikipedia data as well
mergedNextElectionDF <- full_join(mergedNextElectionDF,
NextWikiElectionDF,
by = "country_id")
rm(IEFS, parlDF, presDF, senateDF, NextWikiElectionDF)
The data is produced by the Inter-Parliamentary Union (IPU). Based on the terms of use, the data can be shared with third parties as long as this is done free of charge and for purposes of non-commercial research.
# Read in IEFS data and construct country codes
IPU <- read.csv(file = "./raw/elections-export.csv", skip = 10) %>%
mutate(country_id = countrycode(sourcevar = Country,
origin = "country.name",
destination = "iso3c"))
# Create data frame for upper chambers with election (include: direct
# and indirect elections)
IPUsena <- IPU %>%
dplyr::select(-c(ISO.Code, Structure.of.parliament, Country)) %>%
rename(type = 2,
designation = 3,
pastIPUsena = 4,
nextIPUsena = 5) %>%
filter(type=="Upper chamber") %>%
filter(designation %in% c("Directly elected","Indirectly elected"))
IPUparl <- IPU %>%
dplyr::select(-c(ISO.Code, Structure.of.parliament, Country)) %>%
rename(type = 2,
designation = 3,
pastIPUparl = 4,
nextIPUparl = 5) %>%
filter(type=="Lower chamber") %>%
filter(designation %in% c("Directly elected","Indirectly elected"))
# Producing a date column out of these
IPUparl <- IPUparl %>%
mutate(nextIPUparl = if_else(nextIPUparl=="-", NA_character_, nextIPUparl)) %>%
mutate(nextIPUparl = as.Date(nextIPUparl, "%d %b %Y")) %>%
dplyr::select(country_id, nextIPUparl)
IPUsena <- IPUsena %>%
mutate(nextIPUsena = if_else(nextIPUsena=="-", NA_character_, nextIPUsena)) %>%
mutate(nextIPUsena = as.Date(nextIPUsena, "%d %b %Y")) %>%
dplyr::select(country_id, nextIPUsena)
## Full merge selected variables into main data frame to maintain all countries
## of both data sets
mergedNextElectionDF <- full_join(mergedNextElectionDF,
IPUparl,
by = "country_id")
mergedNextElectionDF <- full_join(mergedNextElectionDF,
IPUsena,
by = "country_id")
rm(IPUsena, IPUparl)
## Deal with other selection procedures for members of the upper chambers
## than election: in particular appointments
IPUnosena <- IPU %>%
dplyr::select(-c(ISO.Code, Structure.of.parliament, Country)) %>%
rename(type = 2,
designation = 3,
pastAppointIPUsena = 4,
nextAppointIPUsena = 5) %>%
filter(type=="Upper chamber") %>%
filter(designation %in% c("Appointed","Other")) %>%
dplyr::select(country_id) %>%
mutate(noelectionIPUsena = 1)
IPUnoparl <- IPU %>%
dplyr::select(-c(ISO.Code, Structure.of.parliament, Country)) %>%
rename(type = 2,
designation = 3,
pastAppointIPUparl = 4,
nextAppointIPUparl = 5) %>%
filter(!(type=="Upper chamber")) %>%
filter(designation %in% c("Appointed","Other")) %>%
dplyr::select(country_id) %>%
mutate(noelectionIPUparl = 1)
mergedNextElectionDF <- full_join(mergedNextElectionDF,
IPUnoparl,
by = "country_id")
mergedNextElectionDF <- full_join(mergedNextElectionDF,
IPUnosena,
by = "country_id")
rm(IPUnoparl,IPUnosena, IPU)
# Create single next election date for parliament, senate and president using
# IPU, WIKI and IEFS data taking into account data quality (IEFS > IPU > WIKI)
mergedNextElectionDF <- mergedNextElectionDF %>%
rename(nextparl = Parliament,
nextpres = President,
nextsena = Senate) %>%
mutate(nextparl = if_else(is.na(nextparl), nextIPUparl, nextparl),
nextparl = if_else(is.na(nextparl), nextWIKIparl, nextparl),
nextpres = if_else(is.na(nextpres), nextWIKIpres, nextpres),
nextsena = if_else(is.na(nextsena), nextIPUsena, nextsena)) %>%
dplyr::select(-c(nextIPUparl, nextWIKIparl, nextWIKIpres, nextIPUsena))
# Create raw distance measures for next election date for parliament, senate
# and president
# This is constructed by using the distance of the next election from the
# WHO "pandemic" declaration on March 11, 2020
# We exclude negative distance for past elections (elections prior to
# March 11,2020).
# We set days to next election to 2000 for countries with no election
# (for distance measure)
mergedNextElectionDF <- mergedNextElectionDF %>%
mutate(dist_parlm = as.numeric(nextparl - as.Date("2020-03-11")),
dist_presid = as.numeric(nextpres - as.Date("2020-03-11")),
dist_senate = as.numeric(nextsena - as.Date("2020-03-11"))) %>%
mutate(dist_parlm = if_else(dist_parlm < 0, NA_real_, dist_parlm),
dist_presid = if_else(dist_presid < 0, NA_real_, dist_presid),
dist_senate = if_else(dist_senate < 0, NA_real_, dist_senate)) %>%
mutate(distancenoelection = if_else(is.na(dist_parlm) &
is.na(dist_presid) &
is.na(dist_senate), 2000, NA_real_))
### Measure Creation ###
# 1. Create single distance measure for next election without
# discriminating between president, senate, parliament elections
mergedNextElectionDF$dist_anyelec <- apply(mergedNextElectionDF[,c(7:10)],
1,
min, na.rm = TRUE)
# 2. Create categorical electoral cycle measure (factor) with
# three categories
mergedNextElectionDF$electoralcycle <- cut(mergedNextElectionDF$dist_anyelec,
c(0, 365, 730, 2000))
# Categories: 1 = next election within one year;
# 2 = next election within two years;
# 3 = next election not within two years time from March 11, 2020
# 3. Create electoral pressure measure with range [0,1] with 1 being
# high electoral pressure and zero being no electoral pressure
mergedNextElectionDF$elect_pressure <- 1/ (apply(mergedNextElectionDF[,c(7:9)],
1,
min, na.rm = TRUE))
# Note: Here we deal with the creation of infinity values due to the
# application of the min function with dividing 1 by the days to
# next election
# -> If a state has no upcoming election scheduled, infinity is created
# and through our treatment, electoral pressure is set to be 0
#Select columns of data frame relevant for the analysis
mergedNextElectionDF <- mergedNextElectionDF %>%
dplyr::select(dist_senate, dist_presid, dist_parlm,
dist_anyelec, elect_pressure, country_id)
write.csv(mergedNextElectionDF,
file = "./saved/electoral_pressure_df.csv",
row.names = FALSE)
rm(mergedNextElectionDF)
The data is produced by Google (https://www.google.com/covid19/mobility/).
mobilityDF <- read.csv(url("https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv"))
mobilityDF <- mobilityDF %>%
filter(sub_region_1 == "") %>%
rename(retail = retail_and_recreation_percent_change_from_baseline,
grocery = grocery_and_pharmacy_percent_change_from_baseline,
parks = parks_percent_change_from_baseline,
transit = transit_stations_percent_change_from_baseline,
work = workplaces_percent_change_from_baseline,
residential = residential_percent_change_from_baseline) %>%
mutate(country_id = countrycode(country_region_code,
origin = "iso2c",
destination = "iso3c"),
mobility_index = (retail + grocery + parks + transit + work) / 5 ) %>% # Creating mobility index
mutate(country_id = if_else(is.na(country_id), "NAM", country_id)) %>%
dplyr::select(-sub_region_1, -sub_region_2, -country_region_code,
-country_region) %>%
mutate(date = as.Date(date))
## Fill missing values down
mobilityDF <- mobilityDF %>%
arrange(country_id, date) %>%
group_by(country_id) %>%
fill(mobility_index, .direction = 'down')
## Save
write.csv(mobilityDF,
file = "./saved/mobility_level_df.csv",
row.names = FALSE)
rm(mobilityDF)
In the ParlGov source, every party on the spectrum is assigned a Left-Right ideological score. Parties that are in the governing coalition are also identified. With these 2 pieces of information we can construct an ideological placement for the cabinet. Though a few choices can be made, a good start is to construct the government placement as the weighted average of the positions of the parties in government, using their seat shares as weights.1
Higher scores on this variable denote a more rightward position of the government.
Update [February 2021]: Due to the unavailability of this indicator for the majority of countries in our sample, we replaced it with a proxy from the V-DEM data: v2exl_legitideolcr_1
- the extent to which the government in power promotes a socialist or communist ideology to justify itself to the population. High values on this variable denote a socialist/communist government.
parlgovDF <- read.csv(url("http://www.parlgov.org/static/data/development-cp1252/view_cabinet.csv"),
header = TRUE)
parlgovDF <- parlgovDF %>%
dplyr::select(country_name, election_date, start_date, cabinet_name,
cabinet_party, seats, party_name_english, left_right) %>%
rename(country = country_name,
election_date = election_date,
start.date = start_date,
cab.name = cabinet_name,
cabinet = cabinet_party,
p_name = party_name_english,
lr_pos = left_right) %>%
mutate(year = as.numeric(str_sub(election_date, start = 1, end = 4))) %>%
filter(year >= 2008) %>%
# Keep only the most recent election for each country; I confirmed that
# it's truly the most recent election from Adam Carr's Electoral Archive.
mutate(start.date = as.Date(start.date)) %>%
group_by(country) %>%
filter(start.date == max(start.date, na.rm = TRUE)) %>%
ungroup() %>%
dplyr::select(-c(start.date, cab.name, year)) %>%
filter(cabinet==1) %>%
dplyr::select(-cabinet) %>%
filter(country != "Italy") %>%
na.omit()
# Save this version of the data - we'll need it to construct 2 different
# indicators in the data set.
write.csv(parlgovDF,
file = "./raw/parlgov_cabinets_df.csv",
row.names = FALSE)
rm(parlgovDF)
parlgovDF <- read.csv(file = "./raw/parlgov_cabinets_df.csv",
header = TRUE)
# Compute the position of the government
positionDF <- parlgovDF %>%
group_by(country) %>%
summarise(pos_gov_lr = sum(seats*lr_pos)/sum(seats)) %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
dplyr::select(-country)
write.csv(positionDF,
file = "./saved/gov_position_df.csv",
row.names = FALSE)
rm(positionDF, parlgovDF)
Taken from https://en.wikipedia.org/wiki/List_of_elected_and_appointed_female_heads_of_state_and_government#Elected_or_appointed_female_chief_executives and gives 1 to a state with a woman head of government on 1 Jan 2020. This includes Austria where Brigitte Bierlein left office on 7 Jan and it excludes and Marshall islands where Hilda Heine left on 14 Jan.
woman_leader_df <- WDI::WDI_data$country %>%
data.frame() %>%
select(iso3c, country) %>%
mutate(woman_leader = 1*(iso3c %in% c("AUT", "BEL", "BGD",
"BOL", "BRB", "DEU",
"DNK", "FIN", "ISL",
"MHL", "MMR", "NAM",
"NOR", "NZL", "SRB",
"TWN"))) %>%
dplyr::select(iso3c, woman_leader) %>%
mutate(iso3c = if_else(iso3c=="XKX", "RKS", iso3c)) %>%
rename(country_id = iso3c)
nrow(woman_leader_df[woman_leader_df$woman_leader==1,]) == 16
[1] TRUE
write.csv(woman_leader_df,
"./saved/woman_leader_df.csv",
row.names = FALSE)
rm(woman_leader_df)
Data obtained from the Oxford COVID-19 Government Response Tracker (OxCGRT): https://www.bsg.ox.ac.uk/research/research-projects/coronavirus-government-response-tracker.
# Read in Oxford stringency measures, and filter to ensure only
# national-level data is used
stringencyDF <- read.csv(url("https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv")) %>%
filter(Jurisdiction=="NAT_TOTAL") %>%
select(CountryCode, Date, StringencyIndex, C1_School.closing,
C2_Workplace.closing, C3_Cancel.public.events,
C4_Restrictions.on.gatherings, C5_Close.public.transport,
C6_Stay.at.home.requirements, C7_Restrictions.on.internal.movement,
C8_International.travel.controls, H1_Public.information.campaigns) %>%
mutate(Date = as.Date(as.character(Date), "%Y%m%d")) %>%
rename(country_id = CountryCode,
date = Date)
## Fill missing values down
stringencyDF <- stringencyDF %>%
arrange(country_id, date) %>%
group_by(country_id) %>%
fill(StringencyIndex, .direction = 'down') %>%
rename(stringency = StringencyIndex)
write.csv(stringencyDF,
file = "./saved/stringency_df.csv",
row.names = FALSE)
rm(stringencyDF)
Data obtained from the Covid-19 excess mortality trackers maintained by The Economist, the New York Times, and the Financial Times
ed_economist <- read_csv('https://raw.githubusercontent.com/TheEconomist/covid-19-excess-deaths-tracker/master/output-data/excess-deaths/all_weekly_excess_deaths.csv') %>%
dplyr::select(-region_code)
ed_economist_monthly <- read_csv('https://raw.githubusercontent.com/TheEconomist/covid-19-excess-deaths-tracker/master/output-data/excess-deaths/all_monthly_excess_deaths.csv') %>%
filter(!country %in% ed_economist$country) %>%
dplyr::select(one_of(colnames(ed_economist)))
ed_economist_qtly <- read_csv('https://raw.githubusercontent.com/TheEconomist/covid-19-excess-deaths-tracker/master/output-data/excess-deaths/all_quarterly_excess_deaths.csv') %>%
filter(!country %in% c(ed_economist$country,
ed_economist_monthly$country)) %>%
dplyr::select(one_of(colnames(ed_economist)))
## Cleanup
ed_economist_full <- ed_economist %>%
bind_rows(ed_economist_monthly) %>%
bind_rows(ed_economist_qtly) %>%
filter(!country == 'Kosovo') %>%
filter(country == region) %>%
dplyr::select(country, end_date, excess_deaths) %>%
dplyr::rename(date = end_date,
excess_deaths_weekly = excess_deaths) %>%
mutate(country_id = countrycode(country,
origin = "country.name",
destination = "iso3c")) %>%
dplyr::select(-country)
## Gen var that says when the last observation occured
ed_economist_full <- ed_economist_full %>%
arrange(country_id, date) %>%
group_by(country_id) %>%
mutate(excess_deaths_last_obs = ifelse(row_number() == n(),
1, 0)) %>%
ungroup()
## Cum excess deaths
ed_economist_full <- ed_economist_full %>%
arrange(country_id, date) %>%
group_by(country_id) %>%
mutate(excess_deaths_cum = cumsum(excess_deaths_weekly),
excess_deaths_cum_log = log(1+excess_deaths_cum)) %>%
mutate(excess_deaths_cum_log = ifelse(is.nan(excess_deaths_cum_log),
NA,
excess_deaths_cum_log)) %>%
ungroup()
## Save
write.csv(ed_economist_full,
file = "./saved/excess_deaths.csv",
row.names = FALSE)
Merge data sets and rescale variables.
wdi_region <- WDI::WDI_data$country %>%
data.frame() %>%
select(iso3c, region) %>%
rename("country_id" = "iso3c")
df_wide <- read.csv("saved/corona_df.csv") %>%
dplyr::select(country_id, country) %>%
distinct(country_id, country) %>%
filter(!is.na(country)) %>%
arrange(country_id) %>%
left_join(wdi_region, by = "country_id")
to_join <- c("wdi_df.csv", "polity4_df.csv", "swiid_df.csv", "eps_df.csv",
"vdem_indexes_df.csv", "trust_people_df.csv","trust_gov_df.csv",
"populism_df.csv", "federalism_df.csv", "woman_leader_df.csv",
"dpi_checks_df.csv", "electoral_pressure_df.csv",
"past_epidemics_df.csv","qog_df.csv", "share_health_ins.csv",
"pandemic_prep_df.csv", "respiratory_disease_df.csv",
"ghsi_detect_df.csv", "ghsi_df.csv", "ghsi_health_df.csv",
"ghsi_respond_df.csv","state_fragility_df.csv","pr_df.csv")
print(c(nrow(df_wide)))
[1] 218
for(j in to_join){
df_wide <- left_join(df_wide,
read.csv(paste0("saved/", j)),
by = c("country_id"))
}
df_wide <-
df_wide %>%
mutate(infection = as.numeric(infection),
share_older = 100*(older_m + older_f)/pop_tot,
pop_tot = pop_tot/1000000,
pop_tot_log = log(pop_tot),
gdp_pc = gdp_pc/1000,
air_travel = log(1+air_travel),
pop_density_log = log(pop_density),
infection = if_else(is.na(infection), 0, infection)) %>%
dplyr::select(-region.y) %>%
rename(region = region.x)
write.csv(df_wide,
file = "docs/df_wide.csv",
row.names = FALSE)
print(dim(df_wide))
[1] 218 86
corona <-
read.csv("saved/corona_df.csv")%>%
mutate(date = as.Date(date))
mobility <-
read.csv("saved/mobility_level_df.csv") %>%
mutate(date = as.Date(date))
# Clean-up mobility cases that are causing issues
metro_areas <- unique(mobility$metro_area)[-1]
mobility <- mobility %>%
filter(!(metro_area %in% metro_areas))
rm(metro_areas)
weatherDF <-
read.csv("saved/weather_df.csv")
policyDF <-
read.csv("saved/acaps_df.csv") %>%
mutate(date = as.Date(date))
stringency <-
read.csv("saved/stringency_df.csv") %>%
mutate(date = as.Date(date))
## Excess deaths
# These use the epidemiological week, which starts on Sunday.
# The assumption is made that the excess deaths reported for 2020-01-12 are
# for the Jan 6 - Jan 11 week, and should therefore be merged with the
# cumulative deaths reported for the week that ends in Jan 12
ed <- read_csv('saved/excess_deaths.csv') %>%
mutate(date = as.Date(date),
date = date + 1)
df <- left_join(corona, select(df_wide, -country), by = c("country_id")) %>%
dplyr::select(-year.y) %>%
rename(year = year.x) %>%
left_join(policyDF, by = c("country_id","date")) %>%
left_join(mobility, by = c("country_id","date")) %>%
left_join(stringency, by = c("country_id","date")) %>%
left_join(weatherDF, by = c("country_id","month","year"))
## Excess deaths: only merge last day data is reported
ed_last_day <- ed %>%
filter(excess_deaths_last_obs == 1) %>%
mutate(weeknumber = paste0(lubridate::week(date),
'_', lubridate::year(date))) %>%
dplyr::select(-date)
## Get week and year
df <- df %>%
mutate(weeknumber = paste0(lubridate::week(date),
'_', lubridate::year(date)))
## Try merging now
df <- df %>%
left_join(ed_last_day, by = c("country_id","weeknumber")) %>%
mutate(deaths_cum_per_million = deaths_cum/pop_tot,
deaths_cum_per_million_log = log(1 + deaths_cum_per_million),
excess_deaths_cum_per_million = excess_deaths_cum / pop_tot,
excess_deaths_cum_per_million_log = log(1 + excess_deaths_cum_per_million)) %>%
rename(geoid2 = country_id)
## Check again
# ed %>% pull(country_id) %>% unique()
# df %>% filter(!is.na(excess_deaths_cum_per_million)) %>%
# pull(geoid2) %>% unique()
## Excess deaths need to be filled down, since original data is weekly
#
# df <- df %>%
# arrange(geoid2, date) %>%
# group_by(geoid2) %>%
# fill(excess_deaths_cum,
# excess_deaths_cum_log,
# excess_deaths_cum_per_million,
# excess_deaths_cum_per_million_log,
# .direction = 'down') %>%
# mutate(excess_deaths_cum_per_million_log = ifelse(is.nan(excess_deaths_cum_per_million_log),
# NA,
# excess_deaths_cum_per_million_log)) %>%
# mutate_at(vars(excess_deaths_cum, excess_deaths_cum_log,
# excess_deaths_cum_per_million, excess_deaths_cum_per_million_log),
# list(~ifelse(row_number() > which(excess_deaths_last_obs == 1), NA, .))) %>%
# ungroup()
We make a version of the data that records number of deaths per million \(T\) days after first hitting \(D\) cases. Note currently data sparse so \(T\) and \(D\) low, and start benchmarked to cases rather than deaths.
Create cumulative deaths variable lagged to days since \(D\) cases.
D <- 10
T <- 10
df <- df %>%
group_by(forcats::fct_explicit_na(geoid2)) %>%
mutate(relative_start =
ifelse(any(cases_cum >= D),
min(elapsed[cases_cum >= D]), NA),
elapsed_rel = elapsed - relative_start) %>%
ungroup() %>%
group_by(forcats::fct_explicit_na(geoid2)) %>%
mutate(
relative_start_d = ifelse(any(deaths_cum >= D),
min(elapsed[deaths_cum >= D]), NA),
elapsed_rel_d = elapsed - relative_start_d
) %>%
ungroup()
rm(D,T)
# Generate country name
df <- df %>%
mutate(country = countrycode(geoid2,
origin = "iso3c",
destination = "country.name")) %>%
mutate(country = if_else(geoid2=="RKS", "Kosovo", country))
df <- df %>%
arrange(geoid2, date) %>%
group_by(geoid2) %>%
fill(mobility_index, .direction = 'down')
df <- df %>%
dplyr::select(-starts_with("X."))
Check most recent data date:
data_date <- max(as.Date(df$date_rep), na.rm = TRUE)
if( (df %>% filter(date_rep == data_date) %>% nrow()) <
(df %>% filter(date_rep == data_date-1) %>% nrow())) data_date <- data_date - 1
data_date
[1] "2021-11-22"
# Full data for download
df_full <- df
write.csv(df_full, "docs/df_full.csv")
# Subset for analysis: Remove small population countries; China
df <- filter(df, pop_tot >=1 & geoid2 != "CHN")
write.csv(df, "saved/df.csv")
Implement Lasso analyses to select control variables
df <- read.csv("saved/df.csv") %>%
mutate(date = as.Date(date),
date_rep = as.Date(date_rep, "%d/%m/%Y"))
#### Lockdown and distancing: use last non-missing
df <- df %>%
group_by(geoid2) %>%
arrange(date) %>%
fill(lockdown_bin, distancing_bin, .direction = 'down') %>%
ungroup()
glmnet
results_lasso <- function(Y,
X,
data = df,
standardize = TRUE,
impute_mean = TRUE,
impute_extreme_outliers = F,
nfolds = 5,
test_prop = 0.2,
seed_train_test_split = NULL,
use_logit_if_outcome_binary = F,
use_most_recent_date = T,
freeze_date = "2020-05-11",
use_lambda_min = T) {
## Select date (days since Jan 1)
if (use_most_recent_date) {
data_date <- max(as.Date(df$date_rep), na.rm = TRUE)
if((df %>% filter(date_rep == data_date) %>% nrow()) <
(df %>% filter(date_rep == data_date-1) %>% nrow())) data_date <- data_date - 1
mod_df <- df %>% filter(date == data_date)
} else {
mod_df <- data %>% filter(date == as.Date(freeze_date))
}
## Imputation
if (impute_mean) {
mod_df <- mod_df %>%
mutate_at(vars(one_of(X)), list(~ifelse(is.na(.), mean(., na.rm = T),
.)))
}
## Non-missing obs
mod_df <- mod_df[complete.cases(mod_df[, c(Y, X)]), ] %>%
dplyr::select(country, geoid2, one_of(c(X, Y)))
## Impute zero for extreme outliers
if (impute_extreme_outliers) {
mod_df <- mod_df %>%
mutate_at(vars(one_of(X)),
list(~ifelse(abs(. /sd(., na.rm = T)) > 10, mean(., na.rm = T), .)))
}
## Scaling
if (standardize) {
mod_df <-
mod_df %>%
mutate_at(vars(one_of(X)), list(~(.-mean(.)) / sd(.)))
}
## Get test/ train split
## Optional seed
if (!is.null(seed_train_test_split)) {
if (!is.numeric(seed_train_test_split)) stop('Seed must be numeric')
set.seed(seed_train_test_split)
} else {
set.seed(123)
}
test_id <- sample(1:nrow(mod_df), floor(nrow(mod_df) * test_prop))
## Model
x_train = mod_df[-test_id, X] %>% as.matrix()
y_train = mod_df[-test_id, Y] %>% pull(!!Y) %>% as.numeric()
## Model
x_test = mod_df[test_id, X] %>% as.matrix()
y_test = mod_df[test_id, Y] %>% pull(!!Y) %>% as.numeric()
## If binary, do logistic instead
if ((length(unique(y_train)) == 2) & use_logit_if_outcome_binary) {
cv <- cv.glmnet(x = x_train, y = y_train,
nfolds = nfolds,
alpha = 1,
family = 'binomial',
type.measure = 'auc') # Fit lasso
##
if (use_lambda_min) {
lambda_use <- cv$lambda.min
} else {
lambda_use <- cv$lambda.1se
}
lasso_mod <- glmnet(x_train,
y_train,
alpha = 1,
lambda = lambda_use,
family = 'binomial') # Fit lasso model on training data
} else {
# Fit lasso
cv <- cv.glmnet(x = x_train, y = y_train, nfolds = nfolds, alpha = 1)
if (use_lambda_min) {
lambda_use <- cv$lambda.min
} else {
lambda_use <- cv$lambda.1se
}
# Fit lasso model on training data
lasso_mod <- glmnet(x_train, y_train, alpha = 1, lambda = lambda_use)
}
lasso_coef <- coef(lasso_mod) %>% as.matrix() %>%
data.frame(vars = row.names(.), coef = ., stringsAsFactors = F) %>%
rename(coef = 2)
y_hat_train = predict(lasso_mod, s = lambda_use,
newx = x_train) %>%
as.numeric() # Use best lambda to predict test data
y_hat_test = predict(lasso_mod, s = lambda_use,
newx = x_test) %>%
as.numeric() # Use best lambda to predict test data
## Return
pred_list <- data.frame(y = y_train, y_hat = as.numeric(y_hat_train),
stringsAsFactors = F) %>%
bind_cols(data.frame(country = as.character(mod_df$country[-test_id]),
stringsAsFactors = F))%>%
bind_cols(data.frame(geoid2 = as.character(mod_df$geoid2[-test_id]),
stringsAsFactors = F))
## Get RMSE for the full se and from CV
rmse_train <- sqrt(mean((y_hat_train - y_train)^2))
rmse_test <- sqrt(mean((y_hat_test - y_test)^2))
r2_train <- 1 - (sum((y_train - y_hat_train)^2) / sum((y_train - mean(y_train))^2))
r2_test <- 1 - (sum((y_test - y_hat_test)^2) / sum((y_test - mean(y_train))^2))
## RMSE from an empty model
## We use the mean of the y_train here twice, since the prediction from the training data is always the training mean
## this the case for in-sample and out sample prediction
rmse_empty_train <- sqrt(mean((y_train - mean(y_train, na.rm = T))^2))
rmse_empty_test <- sqrt(mean((y_test - mean(y_train, na.rm = T))^2))
r2_empty_train <- 0
r2_empty_test <- 0
## Add to lcoef
lasso_coef <- lasso_coef %>%
mutate(rmse_train = rmse_train,
rmse_test = rmse_test,
rmse_empty_train = rmse_empty_train,
rmse_empty_test = rmse_empty_test,
r2_train = r2_train,
r2_test = r2_test,
r2_empty_test = r2_empty_test,
r2_empty_train = r2_empty_train)
## Add to LASSO coef
out <- pred_list %>%
mutate(outcome = Y) %>%
mutate(n_pred = length(X)) %>%
mutate(n = n())
list('output' = out, 'coefs' = lasso_coef)
}
## Define outcomes
Y_list <- c('deaths_cum_log',
'deaths_cum_per_million_log')
Y_labels = c('Deaths total (logged)',
'Deaths/million (logged)')
## ##
bla <- df %>%
group_by(date) %>%
summarise_at(vars(Y_list), ~sum(!is.na(.)) / n()) %>%
mutate(date = as.Date(date)) %>%
pivot_longer(cols = -'date', names_to = 'what', values_to = 'value') %>%
data.frame(stringsAsFactors = F) %>%
ggplot(aes(date, value), group = 1) +
geom_line() +
ylab('Share valid (not missing) obs') +
facet_wrap(~what)
## bla
## Declare families
families <- c('econ_vars',
'epi_vars',
'phys_vars',
'health_sys_vars')
polfamilies <- c('state_cap_vars', 'social_vars', 'pol_account_vars')
## Political and social variables
measures_pol <- read_csv('measures.csv') %>%
filter(!vars %in% c('cases', 'deaths')) %>%
filter(family %in% polfamilies) %>%
filter(vars %in% colnames(df)) %>%
pull(vars) %>% unique() %>%
.[!str_detect(., 'sanitation')]
## Drop vars based on missing
miss_share_pol <- sapply(measures_pol, function(x) sum(is.na(df[, x])) / nrow(df))
measures_pol <- measures_pol[!miss_share_pol > missing_threshold] %>%
unique()
## Get the table
measures <- read_csv('measures.csv') %>%
filter(!vars %in% c('cases', 'deaths')) %>%
filter(family %in% families) %>%
filter(vars %in% colnames(df)) %>%
pull(vars) %>% unique() %>%
.[!str_detect(., 'sanitation')]
## Drop vars based on missing
miss_share <- sapply(measures, function(x) sum(is.na(df[, x])) / nrow(df))
measures <- measures[!miss_share > missing_threshold] %>%
unique()
## Save this
write_rds(measures, 'saved/lasso_control_set.rds')
This is used to select controls for the paper.
# Y = Y_list_full[1]
Y = Y_list[1]
lcoef <- lapply(Y_list, function(Y) {
print(Y)
## Define data set (no missings)
lasso_data <- df[complete.cases(df[, unique(c(Y, measures_pol, measures))]), ]
## The seed is for the train/test split
## T-T split happens inside the function
set.seed(123)
results_lasso(Y = Y, X = measures,
data = lasso_data, nfolds = nfolds,
use_most_recent_date = F, impute_extreme_outliers = T)[[2]] %>%
mutate(outcome = Y)
}) %>%
reduce(rbind) %>%
left_join(data.frame(Y_list, Y_labels, stringsAsFactors = FALSE),
by = c('outcome' = 'Y_list')) %>%
filter(!str_detect(vars, 'Intercept')) %>%
left_join(read_csv('measures.csv') %>%
dplyr::select(vars,labels) %>%
mutate(labels = str_remove(labels, '\\.'))) %>%
filter((!coef == 0) & (outcome %in% Y_list)) %>%
mutate(labels = str_remove(labels, '\\.'))
[1] "deaths_cum_log"
[1] "deaths_cum_per_million_log"
## Line breaks for labels
lcoef$labels <- sapply(lcoef$labels, add_linebreak, min_length = 10)
## Lasso for 1-7
set.seed(123)
lcoef_pol <- lapply(Y_list, function(Y) {
## Define data set (no missings)
lasso_data <- df[complete.cases(df[, unique(c(Y, measures_pol, measures))]), ]
## The seed is for the train/test split
## T-T split happens inside the function
set.seed(123)
results_lasso(Y = Y, X = unique(c(measures_pol, measures)),
data = lasso_data, nfolds = nfolds,
use_most_recent_date = F, impute_extreme_outliers = T)[[2]] %>%
mutate(outcome = Y)
}) %>% reduce(rbind) %>%
left_join(data.frame(Y_list, Y_labels, stringsAsFactors = F),
by = c('outcome' = 'Y_list')) %>%
filter(!str_detect(vars, 'Intercept')) %>%
left_join(read_csv('measures.csv') %>% dplyr::select(vars,labels) %>%
mutate(labels = str_remove(labels, '\\.'))) %>%
mutate(pol_soc = ifelse((vars %in% measures_pol) &
(!vars %in% measures), 'Yes', 'No')) %>%
filter((!coef == 0) & (outcome %in% Y_list)) %>%
mutate(labels = str_remove(labels, '\\.'))
## Line breaks for labels
lcoef_pol$labels <- sapply(lcoef_pol$labels, add_linebreak, min_length = 10)
write_rds(list(
'lcoef_controls' = lcoef %>% group_by(outcome) %>%
filter(!duplicated(vars)) %>% ungroup(),
'lcoef_pol' = lcoef_pol %>% group_by(outcome) %>%
filter(!duplicated(vars)) %>% ungroup()),
'saved/lasso_res.rds')
fun_repl <- function(seed_tt, ...) {
lcoef <- lapply(Y_list, function(Y) {
# print(Y)
## Define data set (no missings)
lasso_data <- df[complete.cases(df[, unique(c(Y, measures_pol, measures))]), ]
## The seed is for the train/test split
## T-T split happens inside the function
results_lasso(Y = Y, X = measures,
data = lasso_data, nfolds = nfolds,
use_most_recent_date = F, impute_extreme_outliers = T, seed_train_test_split = seed_tt)[[2]] %>%
mutate(outcome = Y) %>%
mutate(seed = seed_tt)
}) %>%
reduce(rbind) %>%
left_join(data.frame(Y_list, Y_labels, stringsAsFactors = FALSE),
by = c('outcome' = 'Y_list')) %>%
filter(!str_detect(vars, 'Intercept')) %>%
left_join(read_csv('measures.csv') %>%
dplyr::select(vars,labels) %>%
mutate(labels = str_remove(labels, '\\.'))) %>%
filter((!coef == 0) & (outcome %in% Y_list)) %>%
mutate(labels = str_remove(labels, '\\.'))
## Line breaks for labels
lcoef$labels <- sapply(lcoef$labels, add_linebreak, min_length = 10)
## Return
lcoef
}
out <- pblapply(1:500, function(i) suppressMessages(fun_repl(seed_tt = i, ))) %>% reduce(rbind)
## Per capita
out2_pc <- out %>% filter(outcome == "deaths_cum_per_million_log") %>%
group_by(seed) %>%
filter(n() %in% 4:6) %>%
arrange(vars) %>%
mutate(group = paste0(vars, collapse = ' ')) %>%
summarise(group = first(group)) %>%
ungroup() %>%
pull(group) %>% table() %>%
sort(decreasing = T) %>% .[1] %>% names() %>%
str_split(' ') %>% unlist()
## Total
out2_total <- out %>% filter(outcome == "deaths_cum_log") %>%
group_by(seed) %>%
filter(n() %in% 4:6) %>%
arrange(vars) %>%
mutate(group = paste0(vars, collapse = ' ')) %>%
summarise(group = first(group)) %>%
ungroup() %>%
pull(group) %>% table() %>%
sort(decreasing = T) %>% .[1] %>% names() %>%
str_split(' ') %>% unlist()
## Save this
lcoef_alt <- data.frame(vars = out2_pc, outcome = 'death_cum_per_million_log')
lcoef_alt <- lcoef_alt %>%
left_join(read_csv('measures.csv') %>% dplyr::select(vars, labels))
## Save this
lcoef_alt_total <- data.frame(vars = out2_total, outcome = 'death_cum_log')
lcoef_alt_total <- lcoef_alt_total %>%
left_join(read_csv('measures.csv') %>% dplyr::select(vars, labels))
## Save this
write_rds(lcoef_alt_total, "saved/lasso_controls_total.rds")
write_rds(lcoef_alt, "saved/lasso_controls_pc.rds")
Other alternatives exist. If we assume that a party withdrawing from the government coalition can bring the entire government down, then an unweighted average of party positions might be a better indication of the government’s policy orientation. We might also consider that the party which holds the portfolio of prime minister has an disproportionate influence on the orientation of the government, leading to a higher weight for this party. A similar logic could be applied for the party which holds the Economy or Finance portfolio.↩︎