12: Missing data

Author

Thiago Cerqueira Silva

Incomplete solutions (only question 1.2 and 1.3)

Show the code
library(haven) 
# create table
library(gtsummary)
# data wrangling and other functions
library(tidyverse)
# if you want to have a output similar to stata
library(tidylog)
#imputation
library(mice)

# Limit significant digits to 4, remove scientific notation (most of the time, scipen=999 removes all time)
options(digits = 4, scipen = 9)

Q1.2

How many observations are there in total in the data set (use dis N or browse the data)?

Show the code
melb <- haven::read_dta("datasets/NCDS_melbourne.dta")

There is nrow(melb) observations

How many observations are used in the complete case analysis?

Show the code
no_obs <- melb |> 
  drop_na(noqual2,
          care,
          soch7,
          invbwt,
          mo_age) |> 
  nrow()
melb <- melb |> 
    mutate(
    care = as.factor(care),
    soch7 = as.factor(soch7),
    noqual2 = as.factor(noqual2),
    invbwt = as.numeric(invbwt)
  )
complete_case_model <- glm(noqual2~ care + soch7 +invbwt+ mo_age, family="binomial",
                           data=melb)
tbl_complete <- tbl_regression(complete_case_model,
               estimate_fun = purrr::partial(style_ratio, digits = 3)) |> 
  modify_column_unhide(column = std.error)
tbl_complete
Characteristic log(OR) SE 95% CI p-value
care



    0
    1 1.093 0.157 0.788, 1.402 <0.001
soch7



    0
    1 0.947 0.046 0.858, 1.037 <0.001
invbwt 123.9 14.2 96.23, 151.7 <0.001
mother's age at birth (centered around 28) -0.007 0.004 -0.015, 0.001 0.10
Abbreviations: CI = Confidence Interval, OR = Odds Ratio, SE = Standard Error

There is no_obs observations in complete case analysis

What percentage of records are incomplete?

There is no_obs/nrow(melb) observations are incomplete

Under what assumption would inference from the complete records be valid? Do you believe this assumption?

The probability of a complete record is not associated with qualification status, given the covariates in the model. This may be plausible, since qualification status is observed at age 23. However, it is also possible that having data at age 23 may be related to educational qualifications, for example people with educational qualifications may be more likely to move away from home and drop out of the study.

Use observed data to investigate this assumption. Does the assumption hold?

Show the code
# Select the variables you want to impute/missing info (algo including the outcome even if there is no missing in the outcome)
data.to.impute<- melb |> 
  select(
    noqual2,
          care,
          soch7,
          invbwt,
          mo_age
  )
md.pattern(data.to.impute, plot = T, rotate.names = T)

      mo_age invbwt care soch7 noqual2      
10279      1      1    1     1       1     0
3324       1      1    1     1       0     1
59         1      1    1     0       1     1
64         1      1    1     0       0     2
3          1      1    0     1       1     1
1          1      1    0     1       0     2
1153       1      1    0     0       1     2
1886       1      1    0     0       0     3
349        1      0    1     1       1     1
116        1      0    1     1       0     2
3          1      0    1     0       1     2
3          1      0    1     0       0     3
1          1      0    0     1       1     2
37         1      0    0     0       1     3
124        1      0    0     0       0     4
6          0      1    1     1       1     1
7          0      1    1     1       0     2
1          0      1    0     0       0     4
109        0      0    1     1       1     2
37         0      0    1     1       0     3
1          0      0    1     0       1     3
3          0      0    1     0       0     4
44         0      0    0     0       1     4
21         0      0    0     0       0     5
         229    848 3271  3399    5587 13334
Show the code
dt_indicator <- data.to.impute |> 
  mutate(
    complete_case = if_else(
      !is.na(noqual2) &
        !is.na(soch7) &
      !is.na(invbwt) &
        !is.na(mo_age)&
      !is.na(care),
      0,
      1
    )
  )
mod_ind <- glm(complete_case ~ noqual2 + invbwt+mo_age, family = binomial, data=dt_indicator)

tbl_regression(mod_ind)
Characteristic log(OR) 95% CI p-value
noqual2


    0
    1 0.17 0.04, 0.30 0.008
invbwt 32 -4.5, 68 0.081
mother's age at birth (centered around 28) -0.02 -0.03, -0.01 0.002
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

Can we determine the missingness mechanism from the dataset?

No, we have found evidence that missingness is associated with a number of the other variables in the dataset. We therefore have evidence against Missing Completely at Random (MCAR). However, we cannot from the observed data determine whether the values are Missing at Random (MAR) or Missing Not at Random (MNAR).

Take note this regression is only taking into account the observations with non-missing in the variables included

Q 1.3

We will now use R’s multiple imputation In this example, we are going to impute missing values in more than one partially observed variable. Using the library mice (R has many other options to do MI). mice stands for Multivariate Imputation by Chained Equations

Show the code
glimpse(data.to.impute) # check the class of the variables ### Important
Rows: 17,631
Columns: 5
$ noqual2 <fct> NA, 1, 1, 0, 0, 1, 0, NA, NA, NA, 0, 1, 1, 1, 0, NA, 0, NA, 0,…
$ care    <fct> NA, NA, 0, 0, NA, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, NA, NA, NA…
$ soch7   <fct> NA, NA, 1, 1, NA, 1, 0, 1, NA, 1, 0, 0, 0, 0, 0, 0, NA, NA, NA…
$ invbwt  <dbl> 0.008475, 0.008403, 0.008065, 0.010000, 0.007937, 0.010417, 0.…
$ mo_age  <dbl> -7, -7, -6, -8, -7, -11, -6, -10, -8, -9, -6, -6, -8, -8, -6, …
Show the code
# Run a dry imp (no iteration) to get a predction matrix
imp <- mice(data.to.impute, maxit=0)

# Extract predictorMatrix and methods of imputation 

predM <- imp$predictorMatrix
meth <- imp$method

# you can exclude a variable from the imputation model using this
#predM[, c("invbwt")] <- 0
# you can change the imputation method for each variable like this
meth["invbwt"] <- "norm" #linear regression, the default value would be pmn -Predictive mean matching
meth["mo_age"] <- "norm" #linear regression
#  solution run 10 iterations and 15 datasets - usually you can get good results with 5 and 10 datasets and iterations (but it depends!)
# you can reduce it to save computation time (in your practical)
imp2 <- mice(data.to.impute, maxit = 10,  # equivalent to "burn"
             m= 15, #number of datasets
             predictorMatrix = predM, 
             method = meth, print =  F,
             seed = 123)

You can see there as lot of logged events all related to invbwt. All of them saying that the variable was removed from the imputation. Lets see the impact it has

Show the code
#Then fit the model to each imputed data set:
fit.ncds<-with(data=imp2,exp=glm(noqual2~ care + soch7 +invbwt+ mo_age, family="binomial"))
#summary(pool(fit.ncds))
tbl_imp <- tbl_regression(fit.ncds, estimate_fun = purrr::partial(style_ratio, digits = 3))|> 
  modify_column_unhide(column = std.error)

tbl_merge(
  list(tbl_complete, tbl_imp),
  tab_spanner = c("**Complete Case**","**Imputation**")
)
Characteristic
Complete Case
Imputation
log(OR) SE 95% CI p-value log(OR) SE 95% CI p-value
care







    0

    1 1.093 0.157 0.788, 1.402 <0.001 1.199 0.153 0.891, 1.506 <0.001
soch7







    0

    1 0.947 0.046 0.858, 1.037 <0.001 0.948 0.052 0.843, 1.053 <0.001
invbwt 123.9 14.2 96.23, 151.7 <0.001 26.27 7.58 11.22, 41.33 <0.001
mother's age at birth (centered around 28) -0.007 0.004 -0.015, 0.001 0.10 -0.007 0.004 -0.014, 0.001 0.078
Abbreviations: CI = Confidence Interval, OR = Odds Ratio, SE = Standard Error

As we can see, the INVBWT coefficient doesnt resemble the one from the complete case analysis, that is because the relationship between invbwt and the other variables was lost (because it was dropped)

Solution

The probably reason of it not running is because the low variability of that variable, we can keep it but lets multiple by 100.

Show the code
data.to.impute<- melb |> 
  select(
    noqual2,
          care,
          soch7,
          invbwt,
          mo_age
  ) |> 
  mutate(
    invbwt_100 = 100*invbwt
  ) |> 
  select(-invbwt) #remove original because collinear

# Run a dry imp (no iteration) to get a predction matrix
imp <- mice(data.to.impute, maxit=0)

# Extract predictorMatrix and methods of imputation 

predM <- imp$predictorMatrix
meth <- imp$method

# We will need to say to only use invbwt_100 in the imputation

meth["mo_age"] <- "norm" #linear regression
meth["invbwt_100"] <- "norm" #linear regression
#  solution run 10 iterations and 15 datasets - usually you can get good results with 5 and 10 datasets and iterations (but it depends!)
# you can reduce it to save computation time (in your practical)
imp2 <- mice(data.to.impute, maxit = 10,  # equivalent to "burn"
             m= 15, #number of datasets
             predictorMatrix = predM, 
             method = meth, print =  F,
             seed = 123)
Show the code
#Then fit the model to each imputed data set:
complete_case_model <- glm(noqual2~ care + soch7 +invbwt_100+ mo_age, family="binomial",
                           data=data.to.impute)
tbl_complete <- tbl_regression(complete_case_model,
               estimate_fun = purrr::partial(style_ratio, digits = 3)) |> 
  modify_column_unhide(column = std.error)

fit.ncds<-with(data=imp2,exp=glm(noqual2~ care + soch7 +invbwt_100+ mo_age, family="binomial"))
#summary(pool(fit.ncds))
tbl_imp <- tbl_regression(fit.ncds, estimate_fun = purrr::partial(style_ratio, digits = 3))|> 
  modify_column_unhide(column = std.error)

tbl_merge(
  list(tbl_complete, tbl_imp),
  tab_spanner = c("**Complete Case**","**Imputation**")
)
Characteristic
Complete Case
Imputation
log(OR) SE 95% CI p-value log(OR) SE 95% CI p-value
care







    0

    1 1.093 0.157 0.788, 1.402 <0.001 1.092 0.140 0.813, 1.371 <0.001
soch7







    0

    1 0.947 0.046 0.858, 1.037 <0.001 0.947 0.045 0.857, 1.037 <0.001
invbwt_100 1.239 0.142 0.962, 1.517 <0.001 1.173 0.127 0.916, 1.429 <0.001
mother's age at birth (centered around 28) -0.007 0.004 -0.015, 0.001 0.10 -0.007 0.004 -0.015, 0.001 0.075
Abbreviations: CI = Confidence Interval, OR = Odds Ratio, SE = Standard Error