Lab review from Day 2
We first set up some libraries and define an analysis dataset:
library(biostat3)
library(dplyr) # for data manipulation
## Create a new dataset with only localised cancer
melanoma.l <- filter(biostat3::melanoma, stage=="Localised") |>
mutate(death_cancer = as.numeric(status=="Dead: cancer") )
We then estimate the Kaplan-Meier survival curves and describe the smoothed hazard rates:
## Create a fitted object for our subcohort
## using survfit
sfit7a1 <- survfit(Surv(surv_mm/12, event=death_cancer) ~ year8594,
data = melanoma.l )
par(mfrow=c(1,2)) ## Two graphs in the same window
plot(sfit7a1, # Kaplan-Meier survival curves
mark.time=F,
xlab="Years since diagnosis",
ylab="S(t)",
col=c("blue","red"),
lty=c("solid","dashed"))
plot(muhaz2(Surv(surv_mm/12, death_cancer) ~ year8594, data=melanoma.l),
xlab="Years since diagnosis", col=c("blue","red"),lty=c("solid","dashed"))
However, if we calculate average rates for the two periods, we see that the average rates across follow-up is higher in the later calendar period – which is inconsistent with the survival and hazard curves:
## average rates _over the observed follow-up_
survRate(Surv(surv_mm, death_cancer) ~ year8594, data=melanoma.l)
year8594 tstop event rate lower year8594=Diagnosed 75-84 Diagnosed 75-84 271953.5 572 0.002103301 0.001934444 year8594=Diagnosed 85-94 Diagnosed 85-94 191565.5 441 0.002302085 0.002092214 upper year8594=Diagnosed 75-84 0.002282950 year8594=Diagnosed 85-94 0.002527306
This is an example of confounding by time. If we restrict the analysis to the first ten years of follow-up:
melanoma.l2 <- mutate(melanoma.l,
## Update the death indicator (only count deaths within 120 months)
death_cancer = ifelse(surv_mm<=120 & death_cancer,1,0),
## Create a new time variable
surv_mm = ifelse(surv_mm<=120, surv_mm, 120)
)
## Calculate the rates on the truncated data
survRate(Surv(surv_mm, death_cancer) ~ year8594, data=melanoma.l2)
year8594 tstop event rate lower year8594=Diagnosed 75-84 Diagnosed 75-84 198012.5 519 0.002621047 0.002400371 year8594=Diagnosed 85-94 Diagnosed 85-94 190507.5 441 0.002314869 0.002103833 upper year8594=Diagnosed 75-84 0.002856555 year8594=Diagnosed 85-94 0.002541342
- This is now more consistent with the hazard curves.
- We can modelling these follow-up restricted rates using Poisson regression:
poisson7c <- glm( death_cancer ~ year8594 + offset( log( surv_mm/12) ), family=poisson, data=melanoma.l2 )
summary( poisson7c )
## IRR
eform(poisson7c)
Call: glm(formula = death_cancer ~ year8594 + offset(log(surv_mm/12)), family = poisson, data = melanoma.l2) Deviance Residuals: Min 1Q Median 3Q Max -0.7931 -0.7568 -0.5632 -0.3081 3.0550 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -3.45927 0.04389 -78.812 <2e-16 *** year8594Diagnosed 85-94 -0.12422 0.06476 -1.918 0.0551 . --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for poisson family taken to be 1) Null deviance: 4814.3 on 5317 degrees of freedom Residual deviance: 4810.6 on 5316 degrees of freedom AIC: 6734.6 Number of Fisher Scoring iterations: 6 exp(beta) 2.5 % 97.5 % (Intercept) 0.03145256 0.02885988 0.03427816 year8594Diagnosed 85-94 0.88318515 0.77790501 1.00271370
The regression model can be represented by:
$$ \begin{align*} E(\text{rate} | \text{year8594}) &= \exp(\beta_0 + \beta_1 I(\text{year8594}=\text{"Diagnosed 85-94"})) \end{align*} $$where \(\beta_0\) is the intercept term, which is the log rate in the first calendar period, and \(beta_1\) is the log rate ratio comparing the second calendar period with the first calendar period.
Equivalently, we can include the person-time (pt
) or an offset of the log person-time in the regression model for the expected counts:
## Add a new variable for year
melanoma.l2 <- mutate( melanoma.l2, surv_yy1 = surv_mm/12)
## Split follow up by year
melanoma.spl <- survSplit(melanoma.l2, cut=0:9, end="surv_yy1", start="start",
event="death_cancer") |>
## Calculate persontime and
## recode start time as a factor
mutate(pt = surv_yy1 - start,
fu = as.factor(start) )
## @knitr 7.e
## Calculate the incidence rate by observation year
yearly_rates <- survRate(Surv(pt/1000,death_cancer)~fu, data=melanoma.spl)
par(mfrow=c(1,2))
with(yearly_rates, matplot(as.numeric(as.character(fu))+0.5,
cbind(rate, lower,
upper),
ylim=c(0,max(upper)),
lty=c("solid","dashed","dashed"),
col=c("black","gray","gray"),
type="l",
ylab="Mortality rate per 1000 person-years",
xlab="Years since diagnosis") )
hazfit7f <- muhaz2(Surv(surv_mm/12, status == "Dead: cancer") ~ 1, data = melanoma.l)
## scale hazard by 1000
plot(hazfit7f, xlab="Years since diagnosis",
ylab="Hazard per 1000 person-years",col="blue",lty="solid", haz.scale=1000, xlim=c(0,10))
summary(poisson7h <- glm( death_cancer ~ fu + year8594 + offset( log(pt) ),
family = poisson,
data = melanoma.spl ))
## IRR
eform(poisson7h)
Call: glm(formula = death_cancer ~ fu + year8594 + offset(log(pt)), family = poisson, data = melanoma.spl) Deviance Residuals: Min 1Q Median 3Q Max -0.3297 -0.2900 -0.2207 -0.1761 3.6707 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -4.16612 0.12380 -33.651 < 2e-16 *** fu1 1.24352 0.13589 9.151 < 2e-16 *** fu2 1.25370 0.13797 9.087 < 2e-16 *** fu3 0.99738 0.14610 6.827 8.68e-12 *** fu4 0.79438 0.15532 5.115 3.14e-07 *** fu5 0.69247 0.16329 4.241 2.23e-05 *** fu6 0.45104 0.17911 2.518 0.011796 * fu7 0.10844 0.20710 0.524 0.600559 fu8 0.21049 0.20954 1.004 0.315139 fu9 -0.39239 0.27780 -1.413 0.157802 year8594Diagnosed 85-94 -0.24444 0.06579 -3.715 0.000203 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for poisson family taken to be 1) Null deviance: 8651.5 on 34308 degrees of freedom Residual deviance: 8432.6 on 34298 degrees of freedom AIC: 10375 Number of Fisher Scoring iterations: 6 exp(beta) 2.5 % 97.5 % (Intercept) 0.01551228 0.01217011 0.01977229 fu1 3.46780081 2.65693069 4.52614081 fu2 3.50326901 2.67320128 4.59108479 fu3 2.71116147 2.03608968 3.61005538 fu4 2.21306295 1.63225462 3.00054142 fu5 1.99864158 1.45125272 2.75249659 fu6 1.56993640 1.10515372 2.23018778 fu7 1.11453748 0.74268832 1.67256405 fu8 1.23427730 0.81855780 1.86112752 fu9 0.67543732 0.39185040 1.16425955 year8594Diagnosed 85-94 0.78314061 0.68839639 0.89092450
The regression model can be represented by:
\begin{align*} E(\text{rate} | \text{year8594}, \text{fu}) &= \exp(\beta_0 + \beta_1 I(\text{fu}=1) + ... \beta_9 I(\text{fu}=9) +\beta_{10} I(\text{year8594}=\text{"Diagnosed 85-94"})) \end{align*}We could add an interaction with follow-up time and calendar period of diagnosis:
melanoma.spl2 <- mutate(melanoma.spl, interaction = (start>=5 & year8594=="Diagnosed 85-94"))
summary(poisson7h2 <- glm(death_cancer ~ fu + year8594 + interaction +
offset(log(pt)), family=poisson, data=melanoma.spl2))
Call: glm(formula = death_cancer ~ fu + year8594 + interaction + offset(log(pt)), family = poisson, data = melanoma.spl2) Deviance Residuals: Min 1Q Median 3Q Max -0.3302 -0.2898 -0.2221 -0.1765 3.6716 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -4.16253 0.12504 -33.289 < 2e-16 *** fu1 1.24346 0.13590 9.150 < 2e-16 *** fu2 1.25348 0.13798 9.085 < 2e-16 *** fu3 0.99695 0.14611 6.823 8.90e-12 *** fu4 0.79371 0.15535 5.109 3.24e-07 *** fu5 0.67839 0.17771 3.817 0.000135 *** fu6 0.43812 0.19032 2.302 0.021334 * fu7 0.09704 0.21473 0.452 0.651322 fu8 0.20096 0.21483 0.935 0.349578 fu9 -0.39949 0.28002 -1.427 0.153683 year8594Diagnosed 85-94 -0.25115 0.07372 -3.407 0.000658 *** interactionTRUE 0.03284 0.16284 0.202 0.840175 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for poisson family taken to be 1) Null deviance: 8651.5 on 34308 degrees of freedom Residual deviance: 8432.6 on 34297 degrees of freedom AIC: 10377 Number of Fisher Scoring iterations: 6
The regression model can be represented by:
\begin{align*} E(\text{rate} | \text{year8594}, \text{fu}) &= \exp(\beta_0 + \beta_1 I(\text{fu}=1) + ... \beta_9 I(\text{fu}=9) +\beta_{10} I(\text{year8594}=\text{"Diagnosed 85-94"}) +\beta_{11} I(\text{year8594}=\text{"Diagnosed 85-94"} \&\ \text{start}\geq 5)) \end{align*}To calculate the rate ratio for the second calendar period from five-years of follow-up, we can write:
\begin{align*} & \frac{E(\text{rate} | \text{year8594}=\text{"Diagnosed 85-94"}, \text{fu}>5)}{E(\text{rate} | \text{year8594}=\text{"Diagnosed 75-84"}, \text{fu}>5)} \\ &= \frac{\exp(\beta_0 + \beta_i I(\text{fu}=i) + \beta_{10} +\beta_{11})}{\exp(\beta_0 + \beta_i I(\text{fu}=i))} \\ &= \exp(\beta_{10} +\beta_{11}) \end{align*} We can calculate the rate ratio and its 95% confidence interval using the lincom
function:
library(car)
lincom(poisson7h2,c("year8594Diagnosed 85-94 + interactionTRUE"))
Estimate 2.5 % 97.5 % year8594Diagnosed 85-94 + interactionTRUE -0.2183106 -0.5028869 0.06626558 Chisq Pr(>Chisq) year8594Diagnosed 85-94 + interactionTRUE 2.26073 0.1326915
Alternatively, we could re-parameterise the model:
melanoma.spl2 <- mutate(melanoma.spl,
effect1 = (start<5 & I(year8594)=="Diagnosed 85-94"),
effect2 = (start>=5 & I(year8594)=="Diagnosed 85-94"))
summary(glm(death_cancer ~ fu + effect1 + effect2 +
offset(log(pt)), family=poisson, data=melanoma.spl2))
Call: glm(formula = death_cancer ~ fu + effect1 + effect2 + offset(log(pt)), family = poisson, data = melanoma.spl2) Deviance Residuals: Min 1Q Median 3Q Max -0.3302 -0.2898 -0.2221 -0.1765 3.6716 Coefficients: Estimate Std. Error z value Pr(>|z|) (Intercept) -4.16253 0.12504 -33.289 < 2e-16 *** fu1 1.24346 0.13590 9.150 < 2e-16 *** fu2 1.25348 0.13798 9.085 < 2e-16 *** fu3 0.99695 0.14611 6.823 8.90e-12 *** fu4 0.79371 0.15535 5.109 3.24e-07 *** fu5 0.67839 0.17771 3.817 0.000135 *** fu6 0.43812 0.19032 2.302 0.021334 * fu7 0.09704 0.21473 0.452 0.651322 fu8 0.20096 0.21483 0.935 0.349578 fu9 -0.39949 0.28002 -1.427 0.153683 effect1TRUE -0.25115 0.07372 -3.407 0.000658 *** effect2TRUE -0.21831 0.14519 -1.504 0.132691 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 (Dispersion parameter for poisson family taken to be 1) Null deviance: 8651.5 on 34308 degrees of freedom Residual deviance: 8432.6 on 34297 degrees of freedom AIC: 10377 Number of Fisher Scoring iterations: 6
We could also use a spline to model for time. Here we split more finely, fit a regression model, and plot the rates for the later calendar period:
library(splines)
time.cut <- seq(0,10,by=1/12)
melanoma.spl <- survSplit(Surv(surv_mm/12,status=="Dead: cancer")~.,
data=biostat3::melanoma,
cut=time.cut,
subset=stage=="Localised") |>
mutate(mid=(tstop+tstart)/2, risk_time=tstop-tstart)
poisson7m <- glm(event ~ ns(mid,df=4) + year8594 +
offset(log(risk_time)),
family=poisson,
data=melanoma.spl)
df <- data.frame(year8594="Diagnosed 85-94",
mid=time.cut[-1], risk_time=1000)
## plot the rate at the baseline values
plot(df$mid, predict(poisson7m, newdata=df, type="response"),
type="l", ylab="Rate per 1000 person-years",
xlab="Time since diagnosis (years)",
ylim=c(0,50))