Lab review from Day 2

We first set up some libraries and define an analysis dataset:

library(biostat3)
library(dplyr)    # for data manipulation
## Create a new dataset with only localised cancer
melanoma.l <- filter(biostat3::melanoma, stage=="Localised") |>
    mutate(death_cancer = as.numeric(status=="Dead: cancer") )

We then estimate the Kaplan-Meier survival curves and describe the smoothed hazard rates:

## Create a fitted object for our subcohort
## using survfit
sfit7a1 <- survfit(Surv(surv_mm/12, event=death_cancer) ~ year8594,
                   data = melanoma.l )
par(mfrow=c(1,2)) ## Two graphs in the same window
plot(sfit7a1, # Kaplan-Meier survival curves
     mark.time=F,
     xlab="Years since diagnosis",
     ylab="S(t)",
     col=c("blue","red"),
     lty=c("solid","dashed"))
plot(muhaz2(Surv(surv_mm/12, death_cancer) ~ year8594, data=melanoma.l),
     xlab="Years since diagnosis", col=c("blue","red"),lty=c("solid","dashed"))

day2_plot1.png

However, if we calculate average rates for the two periods, we see that the average rates across follow-up is higher in the later calendar period – which is inconsistent with the survival and hazard curves:

## average rates _over the observed follow-up_
survRate(Surv(surv_mm, death_cancer) ~ year8594, data=melanoma.l)
                                year8594    tstop event        rate       lower
year8594=Diagnosed 75-84 Diagnosed 75-84 271953.5   572 0.002103301 0.001934444
year8594=Diagnosed 85-94 Diagnosed 85-94 191565.5   441 0.002302085 0.002092214
                               upper
year8594=Diagnosed 75-84 0.002282950
year8594=Diagnosed 85-94 0.002527306

This is an example of confounding by time. If we restrict the analysis to the first ten years of follow-up:

melanoma.l2 <- mutate(melanoma.l,
                      ## Update the death indicator (only count deaths within 120 months)
                      death_cancer = ifelse(surv_mm<=120 & death_cancer,1,0),
                      ## Create a new time variable
                      surv_mm = ifelse(surv_mm<=120, surv_mm, 120)
                      )

## Calculate the rates on the truncated data
survRate(Surv(surv_mm, death_cancer) ~ year8594, data=melanoma.l2)
                                year8594    tstop event        rate       lower
year8594=Diagnosed 75-84 Diagnosed 75-84 198012.5   519 0.002621047 0.002400371
year8594=Diagnosed 85-94 Diagnosed 85-94 190507.5   441 0.002314869 0.002103833
                               upper
year8594=Diagnosed 75-84 0.002856555
year8594=Diagnosed 85-94 0.002541342

poisson7c <- glm( death_cancer ~ year8594 + offset( log( surv_mm/12) ), family=poisson, data=melanoma.l2 )
summary( poisson7c )

## IRR
eform(poisson7c)

Call:
glm(formula = death_cancer ~ year8594 + offset(log(surv_mm/12)), 
    family = poisson, data = melanoma.l2)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.7931  -0.7568  -0.5632  -0.3081   3.0550  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)             -3.45927    0.04389 -78.812   <2e-16 ***
year8594Diagnosed 85-94 -0.12422    0.06476  -1.918   0.0551 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 4814.3  on 5317  degrees of freedom
Residual deviance: 4810.6  on 5316  degrees of freedom
AIC: 6734.6

Number of Fisher Scoring iterations: 6
                         exp(beta)      2.5 %     97.5 %
(Intercept)             0.03145256 0.02885988 0.03427816
year8594Diagnosed 85-94 0.88318515 0.77790501 1.00271370

The regression model can be represented by:

$$ \begin{align*} E(\text{rate} | \text{year8594}) &= \exp(\beta_0 + \beta_1 I(\text{year8594}=\text{"Diagnosed 85-94"})) \end{align*} $$

where \(\beta_0\) is the intercept term, which is the log rate in the first calendar period, and \(beta_1\) is the log rate ratio comparing the second calendar period with the first calendar period.

Equivalently, we can include the person-time (pt) or an offset of the log person-time in the regression model for the expected counts:

\begin{align*} E(\text{count} | \text{year8594}, \text{pt}) &= \exp(\beta_0 + \beta_1 I(\text{year8594}=\text{"Diagnosed 85-94"}))\times \text{pt} \\ &= \exp(\beta_0 + \beta_1 I(\text{year8594}=\text{"Diagnosed 85-94"}) + \log(\text{pt})) \end{align*}
## Add a new variable for year
melanoma.l2 <- mutate( melanoma.l2, surv_yy1 = surv_mm/12)

## Split follow up by year
melanoma.spl <- survSplit(melanoma.l2, cut=0:9, end="surv_yy1", start="start",
                           event="death_cancer") |>
    ## Calculate persontime and
    ## recode start time as a factor
    mutate(pt = surv_yy1 - start,
           fu = as.factor(start) )

## @knitr 7.e

## Calculate the incidence rate by observation year
yearly_rates <- survRate(Surv(pt/1000,death_cancer)~fu, data=melanoma.spl)

par(mfrow=c(1,2))
with(yearly_rates, matplot(as.numeric(as.character(fu))+0.5,
                           cbind(rate, lower,
                                 upper),
                           ylim=c(0,max(upper)),
                           lty=c("solid","dashed","dashed"),
                           col=c("black","gray","gray"),
                           type="l",
                           ylab="Mortality rate per 1000 person-years",
                           xlab="Years since diagnosis") )
hazfit7f <- muhaz2(Surv(surv_mm/12, status == "Dead: cancer") ~ 1, data = melanoma.l)
## scale hazard by 1000
plot(hazfit7f, xlab="Years since diagnosis",
     ylab="Hazard per 1000 person-years",col="blue",lty="solid", haz.scale=1000, xlim=c(0,10))

day2_fig2.png

summary(poisson7h <- glm( death_cancer ~ fu + year8594 + offset( log(pt) ),
                         family = poisson,
                         data = melanoma.spl ))
## IRR
eform(poisson7h)

Call:
glm(formula = death_cancer ~ fu + year8594 + offset(log(pt)), 
    family = poisson, data = melanoma.spl)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.3297  -0.2900  -0.2207  -0.1761   3.6707  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)             -4.16612    0.12380 -33.651  < 2e-16 ***
fu1                      1.24352    0.13589   9.151  < 2e-16 ***
fu2                      1.25370    0.13797   9.087  < 2e-16 ***
fu3                      0.99738    0.14610   6.827 8.68e-12 ***
fu4                      0.79438    0.15532   5.115 3.14e-07 ***
fu5                      0.69247    0.16329   4.241 2.23e-05 ***
fu6                      0.45104    0.17911   2.518 0.011796 *  
fu7                      0.10844    0.20710   0.524 0.600559    
fu8                      0.21049    0.20954   1.004 0.315139    
fu9                     -0.39239    0.27780  -1.413 0.157802    
year8594Diagnosed 85-94 -0.24444    0.06579  -3.715 0.000203 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 8651.5  on 34308  degrees of freedom
Residual deviance: 8432.6  on 34298  degrees of freedom
AIC: 10375

Number of Fisher Scoring iterations: 6
                         exp(beta)      2.5 %     97.5 %
(Intercept)             0.01551228 0.01217011 0.01977229
fu1                     3.46780081 2.65693069 4.52614081
fu2                     3.50326901 2.67320128 4.59108479
fu3                     2.71116147 2.03608968 3.61005538
fu4                     2.21306295 1.63225462 3.00054142
fu5                     1.99864158 1.45125272 2.75249659
fu6                     1.56993640 1.10515372 2.23018778
fu7                     1.11453748 0.74268832 1.67256405
fu8                     1.23427730 0.81855780 1.86112752
fu9                     0.67543732 0.39185040 1.16425955
year8594Diagnosed 85-94 0.78314061 0.68839639 0.89092450

The regression model can be represented by:

\begin{align*} E(\text{rate} | \text{year8594}, \text{fu}) &= \exp(\beta_0 + \beta_1 I(\text{fu}=1) + ... \beta_9 I(\text{fu}=9) +\beta_{10} I(\text{year8594}=\text{"Diagnosed 85-94"})) \end{align*}

We could add an interaction with follow-up time and calendar period of diagnosis:

melanoma.spl2 <- mutate(melanoma.spl, interaction = (start>=5 & year8594=="Diagnosed 85-94"))
summary(poisson7h2 <- glm(death_cancer ~ fu + year8594 + interaction +
                              offset(log(pt)), family=poisson, data=melanoma.spl2))

Call:
glm(formula = death_cancer ~ fu + year8594 + interaction + offset(log(pt)), 
    family = poisson, data = melanoma.spl2)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.3302  -0.2898  -0.2221  -0.1765   3.6716  

Coefficients:
                        Estimate Std. Error z value Pr(>|z|)    
(Intercept)             -4.16253    0.12504 -33.289  < 2e-16 ***
fu1                      1.24346    0.13590   9.150  < 2e-16 ***
fu2                      1.25348    0.13798   9.085  < 2e-16 ***
fu3                      0.99695    0.14611   6.823 8.90e-12 ***
fu4                      0.79371    0.15535   5.109 3.24e-07 ***
fu5                      0.67839    0.17771   3.817 0.000135 ***
fu6                      0.43812    0.19032   2.302 0.021334 *  
fu7                      0.09704    0.21473   0.452 0.651322    
fu8                      0.20096    0.21483   0.935 0.349578    
fu9                     -0.39949    0.28002  -1.427 0.153683    
year8594Diagnosed 85-94 -0.25115    0.07372  -3.407 0.000658 ***
interactionTRUE          0.03284    0.16284   0.202 0.840175    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 8651.5  on 34308  degrees of freedom
Residual deviance: 8432.6  on 34297  degrees of freedom
AIC: 10377

Number of Fisher Scoring iterations: 6

The regression model can be represented by:

\begin{align*} E(\text{rate} | \text{year8594}, \text{fu}) &= \exp(\beta_0 + \beta_1 I(\text{fu}=1) + ... \beta_9 I(\text{fu}=9) +\beta_{10} I(\text{year8594}=\text{"Diagnosed 85-94"}) +\beta_{11} I(\text{year8594}=\text{"Diagnosed 85-94"} \&\ \text{start}\geq 5)) \end{align*}

To calculate the rate ratio for the second calendar period from five-years of follow-up, we can write:

\begin{align*} & \frac{E(\text{rate} | \text{year8594}=\text{"Diagnosed 85-94"}, \text{fu}>5)}{E(\text{rate} | \text{year8594}=\text{"Diagnosed 75-84"}, \text{fu}>5)} \\ &= \frac{\exp(\beta_0 + \beta_i I(\text{fu}=i) + \beta_{10} +\beta_{11})}{\exp(\beta_0 + \beta_i I(\text{fu}=i))} \\ &= \exp(\beta_{10} +\beta_{11}) \end{align*}

We can calculate the rate ratio and its 95% confidence interval using the lincom function:

library(car)
lincom(poisson7h2,c("year8594Diagnosed 85-94 + interactionTRUE"))
                                            Estimate      2.5 %     97.5 %
year8594Diagnosed 85-94 + interactionTRUE -0.2183106 -0.5028869 0.06626558
                                            Chisq Pr(>Chisq)
year8594Diagnosed 85-94 + interactionTRUE 2.26073  0.1326915

Alternatively, we could re-parameterise the model:

melanoma.spl2 <- mutate(melanoma.spl,
                        effect1 = (start<5 & I(year8594)=="Diagnosed 85-94"),
                        effect2 = (start>=5 & I(year8594)=="Diagnosed 85-94"))
summary(glm(death_cancer ~ fu + effect1 + effect2 +
            offset(log(pt)), family=poisson, data=melanoma.spl2))

Call:
glm(formula = death_cancer ~ fu + effect1 + effect2 + offset(log(pt)), 
    family = poisson, data = melanoma.spl2)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-0.3302  -0.2898  -0.2221  -0.1765   3.6716  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -4.16253    0.12504 -33.289  < 2e-16 ***
fu1          1.24346    0.13590   9.150  < 2e-16 ***
fu2          1.25348    0.13798   9.085  < 2e-16 ***
fu3          0.99695    0.14611   6.823 8.90e-12 ***
fu4          0.79371    0.15535   5.109 3.24e-07 ***
fu5          0.67839    0.17771   3.817 0.000135 ***
fu6          0.43812    0.19032   2.302 0.021334 *  
fu7          0.09704    0.21473   0.452 0.651322    
fu8          0.20096    0.21483   0.935 0.349578    
fu9         -0.39949    0.28002  -1.427 0.153683    
effect1TRUE -0.25115    0.07372  -3.407 0.000658 ***
effect2TRUE -0.21831    0.14519  -1.504 0.132691    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 8651.5  on 34308  degrees of freedom
Residual deviance: 8432.6  on 34297  degrees of freedom
AIC: 10377

Number of Fisher Scoring iterations: 6

We could also use a spline to model for time. Here we split more finely, fit a regression model, and plot the rates for the later calendar period:

library(splines)
time.cut <- seq(0,10,by=1/12)
melanoma.spl <- survSplit(Surv(surv_mm/12,status=="Dead: cancer")~.,
                          data=biostat3::melanoma,
                          cut=time.cut,
                          subset=stage=="Localised") |>
    mutate(mid=(tstop+tstart)/2, risk_time=tstop-tstart)
poisson7m <- glm(event ~ ns(mid,df=4) + year8594 +
                     offset(log(risk_time)),
                 family=poisson,
                 data=melanoma.spl)
df <- data.frame(year8594="Diagnosed 85-94",
                 mid=time.cut[-1], risk_time=1000)
## plot the rate at the baseline values
plot(df$mid, predict(poisson7m, newdata=df, type="response"),
     type="l", ylab="Rate per 1000 person-years",
     xlab="Time since diagnosis (years)",
     ylim=c(0,50))

day2_fig3.png

Author: Mark Clements

Created: 2022-11-10 Thu 17:11

Validate