library(haven)
# create table
library(gtsummary)
# data wrangling and other functions
library(tidyverse)
# if you want to have a output similar to stata
library(tidylog)
#cox
library(survival)
# plots survival
library(ggsurvfit)
# Epi - additional splitting
library(Epi)
# Limit significant digits to 2, remove scientific notation
options(digits = 3, scipen = 9)
papua_raw <- read_dta("datasets/pngnew.dta") |>
mutate(across(where(is.labelled),as_factor))

Additional Topics
This document covers some additional data wrangling steps needed to do certain things in R, compared to Stata.
Splitting in multiple time scales
In R, it is very easy to split data on one time scale using the survival package. However, splitting on multiple time scales with survival is more complicated; the Epi package provides a simpler way.
Epi
Let’s see how to do it using Epi (easy way)
# Lets define a object with all time variables as numeric
# Convert the three date columns to decimal calendar years (numeric),
# as required by Epi's Lexis machinery. across() with .names appends
# the new *_calyear columns instead of overwriting the originals.
papua_epi <- papua_raw |>
  mutate(across(
    c(timein, timeout, dob),
    \(d) cal.yr(d, format = "%Y-%m-%d"),
    .names = "{.col}_calyear"
  ))
# Define as Lexis object with timescales calendar time and age
lexis_exp <- Lexis(entry = list( period=timein_calyear ),
exit = list( period=timeout_calyear, age=timeout_calyear-dob_calyear),
exit.status = any,
data = papua_epi)

NOTE: entry.status has been set to 0 for all.
# Example of splitting: with a Lexis object you can cut the follow-up
# by different breakpoints on each timescale, one splitLexis() call
# per time-axis.
# First cut calendar time (period) at 1983 ...
lexis_split_per <- splitLexis(lexis_exp, breaks = c(1983), time.scale = "period")
# ... then cut age at 3 and 6 years
lexis_split_per_age <- splitLexis(lexis_split_per, breaks = c(3, 6), time.scale = "age")
lexis_split_per_age |> as_tibble() |> head()

| lex.id | period | age | lex.dur | lex.Cst | lex.Xst | id | sex | vacc | any | anyprev | timein | timeout | dob | datevac | timein_calyear | timeout_calyear | dob_calyear |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1982 | 5.84 | 0.157 | 0 | 0 | 1077 | 1 | 2 | 0 | 0 | 1982-02-17 | 1983-06-13 | 1976-04-15 | 1982-02-17 | 1982 | 1983 | 1976 |
| 1 | 1982 | 6.00 | 0.714 | 0 | 0 | 1077 | 1 | 2 | 0 | 0 | 1982-02-17 | 1983-06-13 | 1976-04-15 | 1982-02-17 | 1982 | 1983 | 1976 |
| 1 | 1983 | 6.71 | 0.446 | 0 | 0 | 1077 | 1 | 2 | 0 | 0 | 1982-02-17 | 1983-06-13 | 1976-04-15 | 1982-02-17 | 1982 | 1983 | 1976 |
| 2 | 1982 | 5.13 | 0.867 | 0 | 0 | 2228 | 2 | 2 | 0 | 0 | 1981-10-27 | 1983-01-14 | 1976-09-08 | 1981-10-27 | 1982 | 1983 | 1977 |
| 2 | 1983 | 6.00 | 0.314 | 0 | 0 | 2228 | 2 | 2 | 0 | 0 | 1981-10-27 | 1983-01-14 | 1976-09-08 | 1981-10-27 | 1982 | 1983 | 1977 |
| 2 | 1983 | 6.31 | 0.035 | 0 | 0 | 2228 | 2 | 2 | 0 | 0 | 1981-10-27 | 1983-01-14 | 1976-09-08 | 1981-10-27 | 1982 | 1983 | 1977 |
This new dataset has the columns period, age, lex.dur, lex.Cst and lex.Xst - these are the variables that are updated through the splitting process.
period is the calendar period
age is the age
lex.dur is the pyears in that period
lex.Cst is the status (enter) of the individual in that period
lex.Xst is the status (exit) of the individual in that period
survival
Let’s see how to do it using survival
# Collapse to one record per person: overall entry/exit dates,
# date of birth, and total time under observation
z_int <- papua_raw |>
  group_by(id) |>
  summarise(
    entry = min(timein),
    exit = max(timeout),
    dob = unique(dob),
    timestudy = exit - entry
  )
# First tmerge call establishes each person's follow-up window
# (tstart/tstop); the second adds the outcome events from the
# full (one-row-per-episode) dataset
z <- tmerge(z_int, z_int, id = id, tstart = entry, tstop = exit)
z <- tmerge(z, papua_raw, id = id, event_end = event(timeout, any))
# Break follow-up at calendar-time cutpoints: each break adds a
# time-dependent covariate p_i that switches from 0 to 1 at the break.
breaks_cal <- c(as.Date("1983-01-01"))
for (i in seq_along(breaks_cal)) {
  # every person shares the same calendar break date
  z_df <- z_int |> mutate(period_end_date = breaks_cal[i])
  z <- tmerge(z, z_df, id = id, per1 = tdc(period_end_date))
  # give the indicator a unique name so the next iteration can reuse per1
  names(z)[names(z) == "per1"] <- paste0("p_", i)
}
# Break follow-up at age cutpoints (3 and 6 years, expressed in days):
# each break adds a time-dependent covariate age_i.
# (Original comment said "calendar" — this loop splits by age.)
breaks_age <- c(3 * 365.25, 6 * 365.25)
for (i in seq_along(breaks_age)) {
  # the break date differs per person: date of birth plus the age offset
  z_df <- z_int |> mutate(age_end_date = dob + breaks_age[i])
  z <- tmerge(z, z_df, id = id, per1 = tdc(age_end_date))
  # rename so the next iteration can reuse the temporary per1 column
  names(z)[names(z) == "per1"] <- paste0("age_", i)
}
z |> dplyr::select(id,tstart, tstop,dob, event_end) |>
mutate(
age = as.numeric(tstart - dob)/365.25,
pyrs = as.numeric(tstop - tstart)/365.25) |>
as_tibble() |> head()

| id | tstart | tstop | dob | event_end | age | pyrs |
|---|---|---|---|---|---|---|
| 1077 | 1982-02-17 | 1982-04-15 | 1976-04-15 | 0 | 5.84 | 0.157 |
| 1077 | 1982-04-15 | 1983-01-01 | 1976-04-15 | 0 | 6.00 | 0.713 |
| 1077 | 1983-01-01 | 1983-06-13 | 1976-04-15 | 0 | 6.71 | 0.446 |
| 2228 | 1981-10-27 | 1982-09-08 | 1976-09-08 | 0 | 5.13 | 0.867 |
| 2228 | 1982-09-08 | 1983-01-01 | 1976-09-08 | 0 | 6.00 | 0.313 |
| 2228 | 1983-01-01 | 1983-01-14 | 1976-09-08 | 0 | 6.31 | 0.036 |
You can see it matches the output from Epi.
Calculate Overall Incidence Rate
Using cluster SE
Let’s see how to calculate an overall incidence rate when the data is clustered.
papua_inc <- papua_raw |>
mutate(pyears = as.numeric((timeout-timein))/365.25) # using years to match the common pattern in Stata's stset scale=365.25

We can calculate this using a Poisson model (valid point estimate) and a cluster standard error (which is different from a "robust standard error" alone).
# Intercept-only Poisson model with a log person-years offset;
# exp(intercept) is the overall incidence rate per person-year
model_inc <- glm(
  any ~ 1 + offset(log(pyears)),
  family = poisson,
  data = papua_inc
)
parameters::model_parameters(model_inc,
vcov = "vcovCL",
vcov_args = list(cluster = papua_inc$id),
exponentiate=T)

| Parameter | Coefficient | SE | CI | CI_low | CI_high | z | df_error | p |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.944 | 0.029 | 0.95 | 0.889 | 1 | -1.9 | Inf | 0.057 |
Check the difference without cluster SE
parameters::model_parameters(model_inc,
exponentiate=T)

| Parameter | Coefficient | SE | CI | CI_low | CI_high | z | df_error | p |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.944 | 0.02 | 0.95 | 0.906 | 0.984 | -2.74 | Inf | 0.006 |
and now robust SE
parameters::model_parameters(model_inc,
vcov = "HC0",
exponentiate=T)

| Parameter | Coefficient | SE | CI | CI_low | CI_high | z | df_error | p |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.944 | 0.023 | 0.95 | 0.9 | 0.989 | -2.41 | Inf | 0.016 |
Using GEE
Using the geepack package (or gee) with an independence working correlation matrix (this gives the same answer as the Poisson model with cluster SE).
library(geepack)
# geeglm requires the data to be sorted by the clustering variable
papua_inc <- papua_inc |>
  arrange(id)
# GEE Poisson model with an independence working correlation:
# the point estimate and SE match the Poisson model with cluster-robust SE
gee_inc <- geepack::geeglm(
  any ~ 1 + offset(log(pyears)),
  id = id,
  corstr = "independence",
  family = poisson,
  data = papua_inc
)
parameters::model_parameters(gee_inc, exponentiate=T)

| Parameter | Coefficient | SE | CI | CI_low | CI_high | Chi2 | df_error | p |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.944 | 0.029 | 0.95 | 0.889 | 1 | 3.62 | 1 | 0.057 |
Warning - Random Effects model and Intercept
Remember from the class about analysis of correlated outcomes that an RE model is a "conditional" model. So the simple model with an intercept + random intercept will have a "conditional" interpretation, i.e., the fixed-effect intercept depends on the RE intercept.
library(lme4)
# Poisson model with a random intercept per id cluster;
# the fixed intercept is conditional on the random effect
re_inc <- glmer(
  any ~ 1 + offset(log(pyears)) + (1 | id),
  data = papua_inc,
  family = "poisson"
)
parameters::model_parameters(re_inc, exponentiate=T)

| Parameter | Coefficient | SE | CI | CI_low | CI_high | z | df_error | p | Effects | Group |
|---|---|---|---|---|---|---|---|---|---|---|
| (Intercept) | 0.720 | 0.027 | 0.95 | 0.669 | 0.774 | -8.89 | Inf | 0 | fixed | |
| SD (Intercept) | 0.822 | NA | 0.95 | NA | NA | NA | NA | NA | random | id |
# 1a. Fixed-effect intercept: log incidence rate conditional on a
#     random effect of zero
beta0 <- fixef(re_inc)[1]
# 1b. Variance of the random intercept
sigma2 <- as.data.frame(VarCorr(re_inc))$vcov[1]
# 2. Incidence-rate estimates:
#    cluster-specific (conditional) IR = exp(beta0)
IR_cond <- exp(beta0)
#    marginal (population-average) IR is approximated below as
#    exp(beta0 + sigma2 / 2)
IR_marg <- exp(beta0 + sigma2/2)

If we account for the random-intercept variance, we get a marginal incidence rate of 1.009.