Additional Topics

Author

Thiago Cerqueira Silva

This document cover some additional data wrangling steps need to do something in R, when compared to STATA

library(haven) 
# create table
library(gtsummary)
# data wrangling and other functions
library(tidyverse)
# if you want to have a output similar to stata
library(tidylog)
#cox
library(survival)
# plots survival
library(ggsurvfit)
# Epi - additional splitting
library(Epi)
# Limit significant digits to 2, remove scientific notation
options(digits = 3, scipen = 9)
papua_raw <- read_dta("datasets/pngnew.dta") |> 
  mutate(across(where(is.labelled),as_factor))

Splitting in multiple time scales

In R, it is very easy to split data in one time scale using survival package. However, to do a split in multiple time scales using survival is more complicated. It has a simple way using Epi package.

Epi

Let’s see how to do it using Epi (easy way)

# Lets define a object with all time variables as numeric
papua_epi <- papua_raw |> 
  mutate(
    timein_calyear = cal.yr(timein, format="%Y-%m-%d"),
    timeout_calyear = cal.yr(timeout, format ="%Y-%m-%d"),
    dob_calyear = cal.yr(dob, format = "%Y-%m-%d")
  )
# Define as Lexis object with timescales calendar time and age
lexis_exp <- Lexis(entry = list( period=timein_calyear ),
               exit = list( period=timeout_calyear, age=timeout_calyear-dob_calyear),
               exit.status = any,
               data = papua_epi)
NOTE: entry.status has been set to 0 for all.
# example of splitting
# now you can split by different cutoffs in each timescale
# Split time along two time-axes
lexis_split_per <- splitLexis( lexis_exp, breaks = c(1983), time.scale="period")
lexis_split_per_age<- splitLexis( lexis_split_per, breaks = c(3,6), time.scale="age" )
lexis_split_per_age |> as_tibble() |> head()
lex.id period age lex.dur lex.Cst lex.Xst id sex vacc any anyprev timein timeout dob datevac timein_calyear timeout_calyear dob_calyear
1 1982 5.84 0.157 0 0 1077 1 2 0 0 1982-02-17 1983-06-13 1976-04-15 1982-02-17 1982 1983 1976
1 1982 6.00 0.714 0 0 1077 1 2 0 0 1982-02-17 1983-06-13 1976-04-15 1982-02-17 1982 1983 1976
1 1983 6.71 0.446 0 0 1077 1 2 0 0 1982-02-17 1983-06-13 1976-04-15 1982-02-17 1982 1983 1976
2 1982 5.13 0.867 0 0 2228 2 2 0 0 1981-10-27 1983-01-14 1976-09-08 1981-10-27 1982 1983 1977
2 1983 6.00 0.314 0 0 2228 2 2 0 0 1981-10-27 1983-01-14 1976-09-08 1981-10-27 1982 1983 1977
2 1983 6.31 0.035 0 0 2228 2 2 0 0 1981-10-27 1983-01-14 1976-09-08 1981-10-27 1982 1983 1977
Coding explanation

This new dataset has the columns period, age, lex.dur, lex.cst and lex.xst - these are the variables that are updated through the process.

period is the calendar period

age is the age

lex.dur is the pyears in that period

lex.Cst is the status (enter) of the individual in that period

lex.Xst is the status (exit) of the individual in that period

survival

Let’s see how to do it using survival

# Lets define a object with only one entry per person
z_int <- papua_raw |> 
  group_by(id) |> 
  summarise(entry = (min(timein)),
            exit = (max(timeout)),
            dob = unique(dob),
            timestudy = exit-entry)
z <- tmerge(z_int,z_int,id=id, tstart=entry,
            tstop=exit)

z <- tmerge(z,
            papua_raw,
            id = id,
            event_end = event(timeout, any))
# now you need to break by calendar
breaks_cal <- c(as.Date("1983-01-01"))

for (i in 1:length(breaks_cal) ){
  t <- breaks_cal[i]
  z_df <- z_int  |>  mutate(period_end_date = t)
  z <- tmerge(z,z_df, id=id, per1=tdc(period_end_date))
  names(z)[names(z)=="per1"] <- paste0("p_",i)
}


# now you need to break by calendar
breaks_age <- c(3*365.25,6*365.25)

for (i in 1:length(breaks_age) ){
  t <- breaks_age[i]
  z_df <- z_int  |>  mutate(age_end_date = dob + t)
  z <- tmerge(z,z_df, id=id, per1=tdc(age_end_date))
  names(z)[names(z)=="per1"] <- paste0("age_",i)
}
z |> dplyr::select(id,tstart, tstop,dob, event_end) |>
  mutate(
    age = as.numeric(tstart - dob)/365.25,
    pyrs = as.numeric(tstop - tstart)/365.25) |> 
  as_tibble() |> head()
id tstart tstop dob event_end age pyrs
1077 1982-02-17 1982-04-15 1976-04-15 0 5.84 0.157
1077 1982-04-15 1983-01-01 1976-04-15 0 6.00 0.713
1077 1983-01-01 1983-06-13 1976-04-15 0 6.71 0.446
2228 1981-10-27 1982-09-08 1976-09-08 0 5.13 0.867
2228 1982-09-08 1983-01-01 1976-09-08 0 6.00 0.313
2228 1983-01-01 1983-01-14 1976-09-08 0 6.31 0.036

You can see it match the output from Epi.

Calculate Overall Incidence Rate

Using cluster SE

Let’s see how to calculate an overall incidence rate when the data is clustered.

papua_inc <- papua_raw |> 
  mutate(pyears = as.numeric((timeout-timein))/365.25) #using in years to match the common pattern in the stset scale=365.25

We can calculate this using a Poisson Model (valid point estimate) and using a cluster standard error (it’s different than “robust standard error” alone)

model_inc <- glm(any ~ 1+offset(log(pyears)),
             family = poisson,
             data = papua_inc)
parameters::model_parameters(model_inc,  
                             vcov = "vcovCL",
                             vcov_args = list(cluster = papua_inc$id),
                             exponentiate=T)
Parameter Coefficient SE CI CI_low CI_high z df_error p
(Intercept) 0.944 0.029 0.95 0.889 1 -1.9 Inf 0.057

Check the difference without cluster SE

parameters::model_parameters(model_inc,
                             exponentiate=T)
Parameter Coefficient SE CI CI_low CI_high z df_error p
(Intercept) 0.944 0.02 0.95 0.906 0.984 -2.74 Inf 0.006

and now robust SE

parameters::model_parameters(model_inc,  
                             vcov = "HC0",
                             exponentiate=T)
Parameter Coefficient SE CI CI_low CI_high z df_error p
(Intercept) 0.944 0.023 0.95 0.9 0.989 -2.41 Inf 0.016

Using GEE

Using geepack package (or gee). Using a independence correlation matrix (same answer from the Poisson with cluster SE).

library(geepack)
# make sure the dataset is ordered by id
papua_inc <- papua_inc |> 
  arrange(id)
  gee_inc <- geepack::geeglm(any ~ 1+offset(log(pyears)),
                       id = id,
                       corstr = "independence",
             family = poisson,
             data = papua_inc)

  parameters::model_parameters(gee_inc, exponentiate=T) 
Parameter Coefficient SE CI CI_low CI_high Chi2 df_error p
(Intercept) 0.944 0.029 0.95 0.889 1 3.62 1 0.057

Warning - Random Effects model and Intercept

Remember from the class about Analysis of correlated outcome that a RE model is a “conditional” model. So the simple model with a intercept + random intercept will have a “conditional” interpretation, i.e., the fixed effect intercept depends on the RE intercept.

library(lme4)
re_inc <- glmer(any ~ 1+offset(log(pyears)) + (1|id),
                    data = papua_inc,
                    family = "poisson")
parameters::model_parameters(re_inc, exponentiate=T) 
Parameter Coefficient SE CI CI_low CI_high z df_error p Effects Group
(Intercept) 0.720 0.027 0.95 0.669 0.774 -8.89 Inf 0 fixed
SD (Intercept) 0.822 NA 0.95 NA NA NA NA NA random id
# 1a. Fixed‐intercept estimate (conditional on random effect = 0)
beta0 <- fixef(re_inc)[1]

# 1b. Random‐intercept variance
sigma2 <- as.data.frame(VarCorr(re_inc))$vcov[1]

# 2. Incidence‐rate estimates
# • Cluster‐specific (conditional) IR  = exp(beta0)
IR_cond   <- exp(beta0)
# • Marginal (population‐average) IR ≈ exp(beta0 + σ2/2)
IR_marg   <- exp(beta0 + sigma2/2)

If we accounted for the values from the Random effects intercept, we can get a incidence rate of 1.009