POL346 Precept 10: Missing Data

Load new packages

# tidyverse
library(dplyr)
library(ggplot2)
library(stringr)

# summary stats
library(skimr)

# packages for missingness
library(simputation)
library(Amelia)
library(naniar)

Load riskfactors data

# Bring the riskfactors example data into the workspace, then run its
# column names through janitor's name cleaner.
data("riskfactors")
riskfactors <- janitor::clean_names(riskfactors)

Explore data

# Height vs. weight scatter drawn with naniar's geom_miss_point() layer,
# used here to look at missingness in the two variables.
ggplot(
  data    = riskfactors,
  mapping = aes(x = height_inch, y = weight_lbs)
) +
  geom_miss_point()

Review

View missingness with factors

Simple imputation

  1. Create a new column weight_imp that fills in the missing values of weight_lbs with naniar::impute_mean()
  1. Now create a column weight_num and a missingness dummy weight_NA
# Add two helper columns:
#   weight_num - weight_lbs coerced to numeric (the column the imputers fill)
#   weight_NA  - text flag recording whether weight_lbs was originally missing
riskfactors <- mutate(
  riskfactors,
  weight_num = as.numeric(weight_lbs),
  weight_NA  = ifelse(is.na(weight_lbs), "NA", "!NA")
)
  1. Try simputation::impute_median by a factor:
# Fill missing weight_num values using the formula weight_num ~ marital
# (median imputation by marital group).
risk_imp <- simputation::impute_median(riskfactors, weight_num ~ marital)

# Jitter plot of weight by marital status; colour separates observed from
# originally-missing weights.
ggplot(
  data    = risk_imp,
  mapping = aes(x = marital, y = weight_num, color = weight_NA)
) +
  geom_jitter()
  1. Try simputation::impute_lm:
  # Exercise template: replace `...` on the right-hand side of the formula with
  # the predictor(s) to use before running this chunk. As in the impute_median
  # step, the left-hand side (weight_num) is the column being imputed.
  risk_imp <- riskfactors %>% 
                simputation::impute_lm(weight_num ~ ...)
# Keep only the rows whose weight was originally missing and show the raw
# column (weight_lbs, still NA) next to the imputed value (weight_num).
# Note: the raw column is weight_lbs; selecting `weight_lb` fails because no
# such column exists.
risk_imp %>% 
  filter(weight_NA == "NA") %>% 
  select(weight_lbs, weight_num)
  1. Try to impute health_poor with impute_lm

Multiple imputation

  1. For now focus on height_inch, weight_lbs and health_poor. Subset the data to these three variables with something like:
# Reduce the data to the three variables of interest plus the weight_NA flag,
# convert it to a plain data.frame, and summarise the raw weight column.
risk2 <- as.data.frame(
  select(riskfactors, weight_lbs, height_inch, health_poor, weight_NA)
)
summary(risk2[["weight_lbs"]])
  1. Complete the bounds_matrix for height_inch and health_poor
# Which column position holds each variable?
# Look the names up in risk2, not riskfactors: risk2 is the data set handed to
# amelia(), and the first entry of every bounds_matrix row is a column number
# in that data set (weight_lbs is column 1 of risk2, matching the template).
str_which(names(risk2), "weight_lbs")
str_which(names(risk2), "height_inch")
str_which(names(risk2), "health_poor")
# Exercise template: build a 3-column matrix, filled row by row, in which each
# row reads c(column number, lower bound, upper bound).
# The blank slots below are intentional; fill in the column number and the
# lower/upper bounds for height_inch and health_poor before running, otherwise
# c() stops on the empty arguments.
# NOTE(review): the column number presumably refers to the position in the data
# set passed to amelia() (risk2); confirm against ?amelia.
bounds_matrix <- matrix(                 
    ncol  = 3, 
    byrow = TRUE,
    data  = c(1, 96, 410, # bound col 1 (weight_lbs), 96 to 410
               ,   ,    , # bound col X (height_inch)
               ,   ,      # bound col X (health_poor)
              )  
)

# check bounds_matrix
# (printing it confirms there is one row per bounded variable)
bounds_matrix
  1. Impute with Amelia
# Columns passed to amelia() as idvars (here only the weight_NA flag).
ignore_vars <- "weight_NA"

# Run the multiple imputation on risk2:
#   m      - number of imputed data sets (5 is customary; 1 is fine for pol346)
#   bounds - bounds matrix built in the previous step
#   idvars - columns listed in ignore_vars
risk_amelia_out <- amelia(
  risk2,
  m      = 5,
  bounds = bounds_matrix,
  idvars = ignore_vars
)

# The result stores one completed data set per imputation; pull out the first
# one and check its dimensions.
risk_imp1 <- risk_amelia_out[["imputations"]][["imp1"]]
dim(risk_imp1)
  1. Evaluate the multiple imputation

Additional resources