1 Set up

This code takes cleaned and aggregated data as input. Cleaning and aggregation is done in 1_cleaning.Rmd.

1.1 Helper functions

Code used to prepare data (prepare suvery weights), run analyses and robustness checks.

# The helper renormalizes weights so that each study gets the 
# same total weight even if they are missing data

study_weighting <- function(data)
  data %>% 
    dplyr::group_by(country) %>% 
    dplyr::mutate(weight = weight/sum(weight)) %>% 
    dplyr::ungroup() 

lm_helper <- function(data,...) {
  data %>% 
    study_weighting() %>% 
    estimatr::lm_robust(data = .,...) %>% 
    {bind_cols( tidy(.), n = nobs(.) )}
}

# Leave-X-out helper that takes data and sample_var, 
# nests by sample_var performs LOO with loo_n observations out
# applying loo_fun to each sample

loo_helper <- 
  function(data, 
           sample_var, 
           loo_n = 1,
           loo_fun = 
             function(dat) lm_helper(data = dat, 
                                     formula = take_vaccine_num ~ 1, cluster = cluster,
                                     weight = weight, se_type = "stata")
  ) {
    
  .var <- data[[sample_var]]
  
  data %>% 
    {
      plyr::adply(.data = combn(unique(.var), loo_n), 
                  .margins = 2, 
                  .fun = function(x) loo_fun(.[!(.var %in% x), ]) )
    }
}

# Subgroup analysis : Function to apply analysis function over groups

grp_analysis <- function(df, y, x)
  
  df %>%
    dplyr::filter(if_all(c(all_of(x), all_of(y), cluster, weight), ~ !is.na(.))) %>%
    dplyr::nest_by(group, get(x)) %>%
    dplyr::summarize(
      lm_helper(data = data, 
                formula = as.formula(paste0(y, "~ 1")), cluster = cluster,
                weight = weight, se_type = "stata"), .groups = "drop") %>% 
    dplyr::rename(!!x := "get(x)")
  


# Reasons analysis: Function to apply analysis function over groups

reasons_together <- function(df, reason, num = "Yes")
  
  df %>%
    dplyr::filter(take_vaccine %in% num, 
                  if_all(c(all_of(reason), cluster, weight), ~ !is.na(.))) %>%
    dplyr::nest_by(group) %>%
    dplyr::summarize(
      lm_helper(data = data, 
                formula = as.formula(paste0(reason, "~ 1")), 
                cluster = cluster,
                weight = weight, se_type = "stata"), .groups = "drop")


reasons_together_subgroup <- function(df, reason, num = "Yes", 
                                      dem_group = NA, dem_subgroup = NA){
  
  if (dem_group == "gender")
    df <- filter(df, gender %in% dem_subgroup)
  
  df %>%
    dplyr::filter(take_vaccine %in% num,
                  !is.na(get(reason))) %>%
    dplyr::nest_by(group) %>%
    dplyr::summarize(
      lm_helper(data = data, 
                formula = as.formula(paste0(reason, "~ 1")), cluster = cluster,
                weight = weight, se_type = "stata"), .groups = "drop")
}

# Age analysis for reasons

age_analysis <- function(df, reason, num = "Yes", filter_by=NA){
  df %>%
    dplyr::filter({{filter_by}}==1)  %>%
    dplyr::filter(take_vaccine %in% num, 
                  if_all(c(all_of(reason), cluster, weight), ~ !is.na(.))) %>%
    dplyr::nest_by(group) %>%
    dplyr::summarize(
      lm_helper(data = data, 
                formula = as.formula(paste0(reason, "~ 1")), 
                cluster = cluster,
                weight = weight, se_type = "stata"), .groups = "drop")
}

1.2 Final cleaning and harmonization of weights

Groups variables into discrete categories and prepares survey weights.

# Call data created in 1_cleaning.Rmd
df <- readr::read_csv("3_rep_data/combined.csv", guess_max = 30000)

# If no cluster information given for a study then individuals are clusters 
# Ensure cluster ids are distinct across studies
df <- 
  df %>% 
  dplyr::group_by(study) %>% 
  dplyr::mutate(
    cluster = ifelse(is.na(cluster), paste(1:n()), cluster),
    cluster = paste0(gsub(" ", "_", tolower(country)), "_", cluster))


# Weights sum to 1 in each study and recode age and education into bins
df <- 
  df %>% 
  dplyr::group_by(study) %>% 
  dplyr::mutate(
         weight_replace = mean(weight, rm.na = TRUE),
         weight = if_else(is.na(weight), if_else(is.na(weight_replace), 1, weight_replace), weight),
         weight = weight/sum(weight)) %>% 
  dplyr::ungroup() %>%
  dplyr::mutate(
    age_groups = as.character(cut(x = age, breaks = c(-Inf, 18, 30, 45, 60, +Inf), right = F)),
    age_groups_binary = ifelse(age >= 55, "55+", NA),
    age_groups_binary = ifelse(age < 55, "<55", age_groups_binary),
    age_less24 = ifelse(age <= 24, 1, 0),
    age_25_54 = ifelse(age >= 25 & age <= 54, 1, 0),
    age_55_more = ifelse(age >= 55, 1, 0),
    age_groups_three = ifelse(age <= 24, "<25", NA),
    age_groups_three = ifelse(age >= 25 & age <= 54, "25-54", age_groups_three),
    age_groups_three = ifelse(age >= 55, "55+", age_groups_three),
    educ_binary = if_else(educ == "More than secondary", "> Secondary", "Up to Secondary")) 


# We create a new dataframe with countries and with "All" (only LMICs). Countries are clusters in "All" analysis
# USA and Russia excluded from "All" set

df2 <- 
  dplyr::bind_rows(
    mutate(df, group = country),
    mutate(filter(df, country != "USA" & country != "Russia"), group = "All")) %>% 
  mutate(
    cluster = if_else(group == "All", 
                      gsub(pattern = " ", replacement = "_", x = tolower(country)), 
                      cluster)) 

1.3 Data checks

Checks on data structure. Note n, missingness, presence of data on weights or clusters.

1.3.1 Data structure

df %>% group_by(country) %>%
  summarize(n = n(), 
            sd_wt = sd(weight, na.rm = TRUE)/mean(weight, na.rm = TRUE), 
            cl_size = n()/length(unique(cluster)), 
            take_1 = mean((take_vaccine == "Yes")[take_vaccine == "Yes" | take_vaccine == "No" ], na.rm = TRUE),
            take_2 = mean((take_vaccine == "Yes"), na.rm = TRUE),
            take_3 = mean(take_vaccine_num, na.rm = TRUE),
            take_dk = mean(take_vaccine == "DK"),
            .take = mean(is.na(take_vaccine_num)), .age = mean(is.na(age)),
            .gender = mean(is.na(gender)), .educ = mean(is.na(educ))) %>%
  kable(digits = 2, 
        caption = "Observations, missingness patterns, data structure. Column .var is the share missing for variable var.", booktabs = TRUE, linesep = "", format.args = list(big.mark = ",", 
  scientific = FALSE))
Observations, missingness patterns, data structure. Column .var is the share missing for variable var.
country n sd_wt cl_size take_1 take_2 take_3 take_dk .take .age .gender .educ
Burkina Faso 977 0.08 1.00 0.69 0.67 0.67 . 0 0.88 0.00 0.00
Colombia 1,012 0.16 1.00 0.79 0.79 0.75 . 0 0.32 0.00 0.00
India 1,680 0.62 11.83 0.83 0.83 0.84 0.00 0 0.00 0.00 0.80
Mozambique 862 0.00 5.29 0.91 0.91 0.89 . 0 0.00 0.00 0.04
Nepal 1,389 0.51 15.43 0.97 0.97 0.97 0.00 0 0.05 0.05 1.00
Nigeria 1,868 0.00 1.00 0.78 0.78 0.76 . 0 0.00 0.00 1.00
Pakistan 1 1,633 1.17 15.41 0.86 0.73 0.72 . 0 0.00 0.00 0.01
Pakistan 2 1,492 0.00 1.00 0.84 0.67 0.66 . 0 1.00 1.00 0.00
Russia 22,125 1.86 1.00 0.37 0.27 0.27 0.27 0 0.00 0.00 0.00
Rwanda 1,355 0.04 1.00 0.94 0.94 0.85 . 0 0.00 0.00 0.00
Sierra Leone 1 1,070 0.06 1.00 0.78 0.78 0.78 0.00 0 0.00 0.00 0.03
Sierra Leone 2 2,110 0.00 11.05 0.88 0.88 0.88 . 0 0.01 0.00 0.00
Uganda 1 3,362 0.00 6.75 0.91 0.91 0.86 . 0 0.05 0.00 0.19
Uganda 2 1,366 0.00 4.41 0.79 0.77 0.77 . 0 0.00 0.00 0.00
USA 1,959 0.77 1.00 0.74 0.74 0.67 . 0 0.00 0.00 0.00

1.3.2 Age distribution

df %>% 
  select(country, age) %>%
  gather(category, value, -country) %>%
  mutate(value = as.numeric(value)) %>%
  ggplot(aes(value)) + geom_density() + facet_wrap(~country, ncol = 3)

1.3.3 Gender, education distribution

df %>% 
  select(country, gender, educ) %>%
  gather(category, value, -country) %>%
    ggplot(aes(value)) +  geom_bar() + facet_grid(country ~ category, scales = "free")