library(haven)
# create table
library(gtsummary)
# data wrangling and other functions
library(tidyverse)
# if you want to have a output similar to stata
library(tidylog)
#cox
library(survival)
# plots survival
library(ggsurvfit)
# Epi - additional splitting
library(Epi)
# Limit significant digits to 2, remove scientific notation
options(digits = 3, scipen = 9)
papua_raw <- read_dta("datasets/pngnew.dta") |>
mutate(across(where(is.labelled),as_factor))

Additional Topics
This document covers some additional data wrangling steps needed to do certain things in R, compared to Stata.
Splitting in multiple time scales
In R, it is very easy to split data on one time scale using the survival package. However, splitting on multiple time scales with survival is more complicated; the Epi package provides a simpler way.
Epi
Let’s see how to do it using Epi (easy way)
# Lets define a object with all time variables as numeric
# Convert the three date columns to decimal calendar years (numeric),
# as required by Epi's Lexis machinery. across() with .names appends
# the new *_calyear columns instead of overwriting the originals.
papua_epi <- papua_raw |>
  mutate(across(
    c(timein, timeout, dob),
    \(d) cal.yr(d, format = "%Y-%m-%d"),
    .names = "{.col}_calyear"
  ))
# Define as Lexis object with timescales calendar time and age
lexis_exp <- Lexis(entry = list( period=timein_calyear ),
exit = list( period=timeout_calyear, age=timeout_calyear-dob_calyear),
exit.status = any,
data = papua_epi)

NOTE: entry.status has been set to 0 for all.
# Example of splitting: with a Lexis object you can cut the follow-up
# by different breakpoints on each timescale, one splitLexis() call
# per time-axis.
# First cut calendar time (period) at 1983 ...
lexis_split_per <- splitLexis(lexis_exp, breaks = c(1983), time.scale = "period")
# ... then cut age at 3 and 6 years
lexis_split_per_age <- splitLexis(lexis_split_per, breaks = c(3, 6), time.scale = "age")
lexis_split_per_age |> as_tibble() |> head()

| lex.id | period | age | lex.dur | lex.Cst | lex.Xst | id | sex | vacc | any | anyprev | timein | timeout | dob | datevac | timein_calyear | timeout_calyear | dob_calyear |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1982 | 5.84 | 0.157 | 0 | 0 | 1077 | 1 | 2 | 0 | 0 | 1982-02-17 | 1983-06-13 | 1976-04-15 | 1982-02-17 | 1982 | 1983 | 1976 |
| 1 | 1982 | 6.00 | 0.714 | 0 | 0 | 1077 | 1 | 2 | 0 | 0 | 1982-02-17 | 1983-06-13 | 1976-04-15 | 1982-02-17 | 1982 | 1983 | 1976 |
| 1 | 1983 | 6.71 | 0.446 | 0 | 0 | 1077 | 1 | 2 | 0 | 0 | 1982-02-17 | 1983-06-13 | 1976-04-15 | 1982-02-17 | 1982 | 1983 | 1976 |
| 2 | 1982 | 5.13 | 0.867 | 0 | 0 | 2228 | 2 | 2 | 0 | 0 | 1981-10-27 | 1983-01-14 | 1976-09-08 | 1981-10-27 | 1982 | 1983 | 1977 |
| 2 | 1983 | 6.00 | 0.314 | 0 | 0 | 2228 | 2 | 2 | 0 | 0 | 1981-10-27 | 1983-01-14 | 1976-09-08 | 1981-10-27 | 1982 | 1983 | 1977 |
| 2 | 1983 | 6.31 | 0.035 | 0 | 0 | 2228 | 2 | 2 | 0 | 0 | 1981-10-27 | 1983-01-14 | 1976-09-08 | 1981-10-27 | 1982 | 1983 | 1977 |
This new dataset has the columns period, age, lex.dur, lex.Cst and lex.Xst - these are the variables that are updated through the splitting process.
period is the calendar period
age is the age
lex.dur is the pyears in that period
lex.Cst is the status (enter) of the individual in that period
lex.Xst is the status (exit) of the individual in that period
survival
Let’s see how to do it using survival
# Collapse to one record per person: overall entry/exit dates,
# date of birth, and total time under observation
z_int <- papua_raw |>
  group_by(id) |>
  summarise(
    entry = min(timein),
    exit = max(timeout),
    dob = unique(dob),
    timestudy = exit - entry
  )
# First tmerge call establishes each person's follow-up window
# (tstart/tstop); the second adds the outcome events from the
# full (one-row-per-episode) dataset
z <- tmerge(z_int, z_int, id = id, tstart = entry, tstop = exit)
z <- tmerge(z, papua_raw, id = id, event_end = event(timeout, any))
# Break follow-up at calendar-time cutpoints: each break adds a
# time-dependent covariate p_i that switches from 0 to 1 at the break.
breaks_cal <- c(as.Date("1983-01-01"))
for (i in seq_along(breaks_cal)) {
  # every person shares the same calendar break date
  z_df <- z_int |> mutate(period_end_date = breaks_cal[i])
  z <- tmerge(z, z_df, id = id, per1 = tdc(period_end_date))
  # give the indicator a unique name so the next iteration can reuse per1
  names(z)[names(z) == "per1"] <- paste0("p_", i)
}
# Break follow-up at age cutpoints (3 and 6 years, expressed in days):
# each break adds a time-dependent covariate age_i.
# (Original comment said "calendar" — this loop splits by age.)
breaks_age <- c(3 * 365.25, 6 * 365.25)
for (i in seq_along(breaks_age)) {
  # the break date differs per person: date of birth plus the age offset
  z_df <- z_int |> mutate(age_end_date = dob + breaks_age[i])
  z <- tmerge(z, z_df, id = id, per1 = tdc(age_end_date))
  # rename so the next iteration can reuse the temporary per1 column
  names(z)[names(z) == "per1"] <- paste0("age_", i)
}
z |> dplyr::select(id,tstart, tstop,dob, event_end) |>
mutate(
age = as.numeric(tstart - dob)/365.25,
pyrs = as.numeric(tstop - tstart)/365.25) |>
as_tibble() |> head()

| id | tstart | tstop | dob | event_end | age | pyrs |
|---|---|---|---|---|---|---|
| 1077 | 1982-02-17 | 1982-04-15 | 1976-04-15 | 0 | 5.84 | 0.157 |
| 1077 | 1982-04-15 | 1983-01-01 | 1976-04-15 | 0 | 6.00 | 0.713 |
| 1077 | 1983-01-01 | 1983-06-13 | 1976-04-15 | 0 | 6.71 | 0.446 |
| 2228 | 1981-10-27 | 1982-09-08 | 1976-09-08 | 0 | 5.13 | 0.867 |
| 2228 | 1982-09-08 | 1983-01-01 | 1976-09-08 | 0 | 6.00 | 0.313 |
| 2228 | 1983-01-01 | 1983-01-14 | 1976-09-08 | 0 | 6.31 | 0.036 |
You can see it matches the output from Epi.
Calculate Overall Incidence Rate
Using cluster SE
Let’s see how to calculate an overall incidence rate when the data is clustered.
papua_inc <- papua_raw |>
mutate(pyears = as.numeric((timeout-timein))/365.25) # using years to match the common pattern in Stata's stset scale=365.25

We can calculate this using a Poisson model (valid point estimate) and a cluster standard error (which is different from a "robust standard error" alone).
# Intercept-only Poisson model with a log person-years offset;
# exp(intercept) is the overall incidence rate per person-year
model_inc <- glm(
  any ~ 1 + offset(log(pyears)),
  family = poisson,
  data = papua_inc
)
parameters::model_parameters(model_inc,
vcov = "vcovCL",
vcov_args = list(cluster = papua_inc$id),
exponentiate=T)

| Parameter | Coefficient | SE | CI | CI_low | CI_high | z | df_error | p |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.944 | 0.029 | 0.95 | 0.889 | 1 | -1.9 | Inf | 0.057 |
Check the difference without cluster SE
parameters::model_parameters(model_inc,
exponentiate=T)

| Parameter | Coefficient | SE | CI | CI_low | CI_high | z | df_error | p |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.944 | 0.02 | 0.95 | 0.906 | 0.984 | -2.74 | Inf | 0.006 |
and now robust SE
parameters::model_parameters(model_inc,
vcov = "HC0",
exponentiate=T)

| Parameter | Coefficient | SE | CI | CI_low | CI_high | z | df_error | p |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.944 | 0.023 | 0.95 | 0.9 | 0.989 | -2.41 | Inf | 0.016 |
Using GEE
Using the geepack package (or gee) with an independence working correlation matrix (this gives the same answer as the Poisson model with cluster SE).
library(geepack)
# geeglm requires the data to be sorted by the clustering variable
papua_inc <- papua_inc |>
  arrange(id)
# GEE Poisson model with an independence working correlation:
# the point estimate and SE match the Poisson model with cluster-robust SE
gee_inc <- geepack::geeglm(
  any ~ 1 + offset(log(pyears)),
  id = id,
  corstr = "independence",
  family = poisson,
  data = papua_inc
)
parameters::model_parameters(gee_inc, exponentiate=T)

| Parameter | Coefficient | SE | CI | CI_low | CI_high | Chi2 | df_error | p |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.944 | 0.029 | 0.95 | 0.889 | 1 | 3.62 | 1 | 0.057 |
Warning - Random Effects model and Intercept
Remember from the class about analysis of correlated outcomes that an RE model is a "conditional" model. So the simple model with an intercept + random intercept will have a "conditional" interpretation, i.e., the fixed-effect intercept depends on the RE intercept.
library(lme4)
# Poisson model with a random intercept per id cluster;
# the fixed intercept is conditional on the random effect
re_inc <- glmer(
  any ~ 1 + offset(log(pyears)) + (1 | id),
  data = papua_inc,
  family = "poisson"
)
parameters::model_parameters(re_inc, exponentiate=T)

| Parameter | Coefficient | SE | CI | CI_low | CI_high | z | df_error | p | Effects | Group |
|---|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.720 | 0.027 | 0.95 | 0.669 | 0.774 | -8.89 | Inf | 0 | fixed | |
| SD (Intercept) | 0.822 | NA | 0.95 | NA | NA | NA | NA | NA | random | id |
# 1a. Fixed-effect intercept: log incidence rate conditional on a
#     random effect of zero
beta0 <- fixef(re_inc)[1]
# 1b. Variance of the random intercept
sigma2 <- as.data.frame(VarCorr(re_inc))$vcov[1]
# 2. Incidence-rate estimates:
#    cluster-specific (conditional) IR = exp(beta0)
IR_cond <- exp(beta0)
#    marginal (population-average) IR is approximated below as
#    exp(beta0 + sigma2 / 2)
IR_marg <- exp(beta0 + sigma2/2)

If we account for the random-intercept variance, we get a marginal incidence rate of 1.009.