Load the diet data using time-on-study as the timescale.
You may have to install the required packages the first time you use them. You can install a package by install.packages(“package_of_interest”) for each package you require.
library(biostat3)
library(dplyr) ## for data manipulation
library(car) ## for car::linearHypothesis -> biostat3::lincom
Load the melanoma data and explore it.
# Read melanoma data
# Create a new dataset with only localised cancer
# We keep the variables we need for the proceeding analysis
melanoma.l <- filter(biostat3::melanoma,
stage=="Localised") %>%
select(id, sex, agegrp, year8594, surv_mm, status)
head(melanoma.l)
## id sex agegrp year8594 surv_mm status
## 1 1 Female 75+ Diagnosed 75-84 26.5 Dead: other
## 2 2 Female 75+ Diagnosed 75-84 55.5 Dead: other
## 3 3 Female 75+ Diagnosed 75-84 177.5 Dead: other
## 4 6 Female 75+ Diagnosed 75-84 19.5 Dead: cancer
## 5 7 Female 75+ Diagnosed 75-84 65.5 Dead: other
## 6 8 Female 75+ Diagnosed 75-84 50.5 Dead: other
summary(melanoma.l)
## id sex agegrp year8594
## Min. : 1 Male :2405 0-44 :1463 Diagnosed 75-84:2145
## 1st Qu.:1876 Female:2913 45-59:1575 Diagnosed 85-94:3173
## Median :3756 60-74:1536
## Mean :3807 75+ : 744
## 3rd Qu.:5734
## Max. :7774
## surv_mm status
## Min. : 0.50 Alive :3517
## 1st Qu.: 35.50 Dead: cancer :1013
## Median : 73.50 Dead: other : 782
## Mean : 87.16 Lost to follow-up: 6
## 3rd Qu.:127.50
## Max. :251.50
Plot Kaplan-Meier estimates of cause-specific survival as a function of calendar period of diagnosis.
# 1)Create a new event indicator
melanoma.l <- mutate(melanoma.l,
death_cancer = as.numeric(status=="Dead: cancer"))
# 2) Create a fitted object for our subcohort using survfit
sfit7ai <- survfit(Surv(surv_mm, event = death_cancer) ~ year8594,
data = melanoma.l)
# 3) If you want to obtain values for specific time-points, such as 1-, 3-,
# 5-, 10-year cause-specific survival
summary(sfit7ai, times = c(1, 3, 5, 10)*12)
## Call: survfit(formula = Surv(surv_mm, event = death_cancer) ~ year8594,
## data = melanoma.l)
##
## year8594=Diagnosed 75-84
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 12 2081 33 0.985 0.00268 0.979 0.990
## 36 1774 228 0.875 0.00727 0.860 0.889
## 60 1600 110 0.819 0.00850 0.803 0.836
## 120 1302 148 0.740 0.00986 0.721 0.760
##
## year8594=Diagnosed 85-94
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 12 3085 38 0.988 0.00195 0.984 0.992
## 36 2181 202 0.915 0.00526 0.905 0.926
## 60 1461 128 0.854 0.00719 0.840 0.868
## 120 162 73 0.790 0.01084 0.769 0.812
# 4) Plot the cause-speficic survival curves estimated with KM
# Useful to show the plot with risk table at the bottom
# Easier to use the package survminer with ggsurvplot function
# install.packages("survminer")
library(survminer)
plot_sfit7ai <- ggsurvplot(fit = sfit7ai,
conf.int = TRUE,
censor = FALSE, ## do not show censored observations
break.time.by = 12*2, xscale = "m_y",
break.y.by = 0.1, surv.scale = "percent",
title = "Kaplan-Meier estimated cause-specific survival curves",
xlab = "Years since diagnosis", ylab = "S(t)",
palette = c("#E69F00", "#56B4E9"),
legend = "top", legend.title = "",
legend.labs = c("Diagnosed 75-84", "Diagnosed 85-94"),
risk.table = TRUE, fontsize = 3, tables.y.text = FALSE,
ggtheme = theme_minimal() + theme(panel.grid.minor = element_blank())
)
plot_sfit7ai
The cause-specific survival is higher for patients diagnosed in the recent period (1985-1994).
Now plot the estimated hazard function (cause-specific mortality rate) as a function of calendar period of diagnosis. During which calendar period (the early or the latter) is mortality the lowest?
haz_fit7aii <- muhaz2(Surv(surv_mm/12, status == "Dead: cancer") ~ year8594, data = melanoma.l)
df <- as.data.frame(haz_fit7aii)
# to get a fancy plot we can use ggplot
# install.packages("ggplot2")
library(ggplot2)
plot_haz_fit7aii <- ggplot(aes(x = x, y = y), data = df) +
geom_line(aes(colour = year8594), linewidth = 1) +
labs(title = "Kernel smoothed hazards (cause-speficic mortality rates)",
x = "Years since diagnosis",
y = "Hazard") +
scale_color_manual(name = "", values = c("#E69F00", "#56B4E9")) +
scale_x_continuous(breaks = 0:20) +
theme_minimal() +
theme(legend.text = element_text(size = 10),
legend.position = c(0.7, 0.9),
panel.grid.minor = element_blank())
plot_haz_fit7aii
As expected, the cause-specific mortality is lower for patients diagnosed in the latter period.
Is the interpretation (with respect to how prognosis depends on period) based on the hazard consistent with the interpretation of the survival plot?
## Compare with Kaplan-Meier plot
ggarrange(plot_sfit7ai$plot +
labs(title = "Kaplan-Meier cause-specific\nsurvival curves") +
theme(legend.position="none"),
plot_haz_fit7aii +
scale_x_continuous(breaks = seq(0, 20, by = 2)) +
labs(title = "Kernel smoothed hazards\n(cause-speficic mortality rates)"))
The two graphs both show that prognosis is better for patients diagnosed during the latter period as they have lower cause-specific mortality and higher cause-specific survival.
Estimate the cause-specific mortality rate for each calendar period. During which calendar period (the early or the latter) is mortality the lowest? Is this consistent with what you found earlier? If not, why the inconsistency?
rates_year8594 <- survRate(Surv(surv_mm, death_cancer) ~ year8594, data = melanoma.l)
print.data.frame(rates_year8594, row.names = F)
## year8594 tstop event rate lower upper
## Diagnosed 75-84 271953.5 572 0.002103301 0.001934444 0.002282950
## Diagnosed 85-94 191565.5 441 0.002302085 0.002092214 0.002527306
The estimated cause-specific mortality rate is higher for patients diagnosed in the latter period. This is not consistent with what we saw earlier, as we have not controlled for time since diagnosis, which confounds the effect of exposure. Without controlling for the follow-up time, we see that the overall average hazard for patients diagnosed in the early period is drawn down by the low mortality experienced by patients 10 years subsequent to diagnosis.
Two approaches for controlling for confounding are ‘restriction’ and ‘statistical adjustment’. We will first use restriction to control for confounding. We will restrict the potential follow-up time to a maximum of 120 months. Individuals who survive more than 120 months are censored at 120 months.
Estimate the cause-specific mortality rate for each calendar period. During which calendar period (the early of the latter) is mortality the lowest? Is this consistent with what you found in part 7a?
## Calculate the incidence rate by time of diagnosis
## but with new variables
melanoma.l2 <- mutate(melanoma.l,
## Update the death indicator (only count deaths within 120 months)
death_cancer = ifelse(surv_mm<=120 & status == "Dead: cancer", 1, 0),
## Create a new time variable
surv_mm = ifelse(surv_mm<=120, surv_mm, 120)
)
## Calculate the rates on the truncated data
rates_by_diag_yr2 <- survRate(Surv(surv_mm, death_cancer) ~ year8594, data = melanoma.l2)
print.data.frame(rates_by_diag_yr2, row.names = F)
## year8594 tstop event rate lower upper
## Diagnosed 75-84 198012.5 519 0.002621047 0.002400371 0.002856555
## Diagnosed 85-94 190507.5 441 0.002314869 0.002103833 0.002541342
After restricting everyone’s follow-up time to be maximum 120 months (10 years), we observe that the average cause-specific mortality rates are lower for patients diagnosed in the latter period. This is consistent with 7a.
Calculate by hand the ratio (85–94/75–84) of the two mortality rates (i.e., a mortality rate ratio) and interpret the estimate (i.e., during which period is mortality higher/lower and by how much).
# calculate MRR and confidence intervals
mutate(rates_by_diag_yr2,
MRR = c(1, rate[2]/rate[1]),
CI_Low = c(NA, poisson.test(event[c(2, 1)], tstop[c(2, 1)])$conf.int[1]),
CI_Up = c(NA, poisson.test(event[c(2, 1)], tstop[c(2, 1)])$conf.int[2]))
## year8594 tstop event rate lower
## year8594=Diagnosed 75-84 Diagnosed 75-84 198012.5 519 0.002621047 0.002400371
## year8594=Diagnosed 85-94 Diagnosed 85-94 190507.5 441 0.002314869 0.002103833
## upper MRR CI_Low CI_Up
## year8594=Diagnosed 75-84 0.002856555 1.0000000 NA NA
## year8594=Diagnosed 85-94 0.002541342 0.8831852 0.7761294 1.004662
The crude estimated cause-specific mortality rate ratio is 0.88 with 95% CI (0.78, 1), which means that the cause-specific mortality for patients diagnosed during the latter period is approximately 12% lower relative to patients diagnosed during the early period.
Now use Poisson regression to estimate the same mortality rate ratio.
## Use glm to estimate the rate ratios
## we scale the offset term to 1000 person-years
poisson7c <- glm(death_cancer ~ year8594 + offset(log(surv_mm/12/1000)), family = poisson, data = melanoma.l2)
summary(poisson7c)
##
## Call:
## glm(formula = death_cancer ~ year8594 + offset(log(surv_mm/12/1000)),
## family = poisson, data = melanoma.l2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.44848 0.04389 78.566 <2e-16 ***
## year8594Diagnosed 85-94 -0.12422 0.06476 -1.918 0.0551 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 4814.3 on 5317 degrees of freedom
## Residual deviance: 4810.6 on 5316 degrees of freedom
## AIC: 6734.6
##
## Number of Fisher Scoring iterations: 6
## MRR
eform(poisson7c)
## exp(beta) 2.5 % 97.5 %
## (Intercept) 31.4525600 28.859880 34.278158
## year8594Diagnosed 85-94 0.8831851 0.777905 1.002714
## Note that the scaling of the offset term only has an impact on the intercept
eform(glm(death_cancer ~ year8594 + offset(log(surv_mm)),
family = poisson, data = melanoma.l2))
## exp(beta) 2.5 % 97.5 %
## (Intercept) 0.002621047 0.00240499 0.002856513
## year8594Diagnosed 85-94 0.883185150 0.77790501 1.002713701
The estimated crude MRR from the Poisson model is identical to the result in part (ii). The following is the equation for our model:
where is the expected number of deaths due to cancer. Recall that we have restricted the follow-up for all patients to have a maximum of 10 years, but we have not adjusted our model for follow-up time yet. So the produced MRR is only a crude estimate and doesn’t take into account the dependency on time since diagnosis.
In order to adjust for time since diagnosis (i.e., adjust for the fact that we expect mortality to depend on time since diagnosis) we need to split the data by this timescale. We continue with the restricted time up to 10 years following diagnosis.
## Add a new variable for year
melanoma.l2 <- mutate(melanoma.l2,
surv_yy1 = surv_mm/12)
# Let's take a look at patients with ID 1 and 14
melanoma.l2 %>%
filter(id %in% c(1, 14))
## id sex agegrp year8594 surv_mm status death_cancer surv_yy1
## 1 1 Female 75+ Diagnosed 75-84 26.5 Dead: other 0 2.208333
## 2 14 Female 75+ Diagnosed 75-84 29.5 Dead: cancer 1 2.458333
## Split follow up by year
melanoma.l2.spl <- survSplit(melanoma.l2, cut = 1:10, end = "surv_yy1", start = "start",
event = "death_cancer", episode = "fu")
# Look at patients with ID 1 and 14
melanoma.l2.spl %>%
filter(id %in% c(1, 14)) %>%
select(id, year8594, start, surv_yy1, fu, death_cancer)
## id year8594 start surv_yy1 fu death_cancer
## 1 1 Diagnosed 75-84 0 1.000000 1 0
## 2 1 Diagnosed 75-84 1 2.000000 2 0
## 3 1 Diagnosed 75-84 2 2.208333 3 0
## 4 14 Diagnosed 75-84 0 1.000000 1 0
## 5 14 Diagnosed 75-84 1 2.000000 2 0
## 6 14 Diagnosed 75-84 2 2.458333 3 1
## Calculate persontime and recode fu as a factor
melanoma.l2.spl <- mutate(melanoma.l2.spl,
pt = surv_yy1 - start,
fu = as.factor(fu))
Now tabulate (and produce a graph of) the rates by follow-up time. Mortality appears to be quite low during the first year of follow-up. Does this seem reasonable considering the disease with which these patients have been diagnosed?
## Calculate the incidence rate by observation year
yearly_rates <- survRate(Surv(pt/1000, death_cancer) ~ fu, data = melanoma.l2.spl)
print.data.frame(yearly_rates, row.names = F)
## fu tstop event rate lower upper
## 1 5.257000 71 13.50580 10.548143 17.03573
## 2 4.857917 228 46.93370 41.038873 53.43754
## 3 4.235458 202 47.69260 41.342062 54.74259
## 4 3.711583 138 37.18090 31.236510 43.92727
## 5 3.265583 100 30.62240 24.915607 37.24504
## 6 2.864667 80 27.92646 22.143946 34.75690
## 7 2.524792 56 22.18005 16.754570 28.80264
## 8 2.190250 35 15.97991 11.130593 22.22419
## 9 1.886375 34 18.02399 12.482136 25.18672
## 10 1.583042 16 10.10712 5.777095 16.41334
## Plot rates by year
# library(ggplot2)
plot_yearly_rates <- ggplot(aes(x = as.numeric(fu), y = rate), data = yearly_rates) +
geom_errorbar(aes(ymin = lower, ymax = upper), colour = "grey60", linewidth = 1, width = 0.2) +
geom_point(size = 2) +
scale_x_continuous(name = "Years since diagnosis",
breaks = 1:10) +
labs(title = "Cancer deaths by years since diagnosis",
y = "Mortality rate per 1000 person-years") +
theme_minimal() +
theme(panel.grid.minor.x = element_blank(),
axis.text = element_text(size = 12))
plot_yearly_rates
Recall that the selected patients were diagnosed with localised skin melanoma at the start. It seems reasonable that in the first year the mortality due to cancer is lower. However, for many the disease has progressed to metastases, and therefore, the cancer-specific mortality has peaked after the first year.
Compare the plot of the estimated rates to a plot of the hazard rate as a function of continuous time. Is the interpretation similar? Do you think it is sufficient to classify follow-up time into annual intervals or might it be preferable to use, for example, narrower intervals?
# kernel-smoothed hazards scaled by 1000
hazfit7f <- muhaz2(Surv(surv_mm/12/1000, status=="Dead: cancer") ~ 1, data = melanoma.l)
plot_hazfit7f <- ggplot(aes(x = x*1000, y = y), data = as.data.frame(hazfit7f)) +
geom_line(linewidth = 1) +
scale_x_continuous(name = "Years since diagnosis",
breaks = 1:10, limits = c(0, 10)) +
labs(title = "Continuous follow-up time", y = "") +
theme_minimal() +
theme(panel.grid.minor.x = element_blank(),
axis.text = element_text(size = 12))
ggarrange(plot_yearly_rates + labs(title = "Categorised follow-up time"),
plot_hazfit7f)
The shape of the estimated mortality rates on the left panel resembles that on the right panel. By splitting the time-scale into 1-year time-intervals, we aimed to approximate the functional form depicted on the right panel. The narrower the time-intervals are, the closer the shape will be to the form in the right panel but that can come at a cost of fewer events in the intervals, which may lead to larger confidence intervals.
Use Poisson regression to estimate rates as a function of follow-up time. Does the pattern of estimated rates mirror the pattern you observed in the plots? Write the linear predictor using pen and paper.
## Run Poisson regression without including intercept, then we can estimate log rates for each year
summary(poisson7g <- glm(death_cancer ~ fu + offset(log(pt/1000)) - 1,
family = poisson, data = melanoma.l2.spl))
##
## Call:
## glm(formula = death_cancer ~ fu + offset(log(pt/1000)) - 1, family = poisson,
## data = melanoma.l2.spl)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## fu1 2.60312 0.11866 21.937 <2e-16 ***
## fu2 3.84874 0.06623 58.115 <2e-16 ***
## fu3 3.86478 0.07036 54.930 <2e-16 ***
## fu4 3.61580 0.08512 42.477 <2e-16 ***
## fu5 3.42173 0.10000 34.218 <2e-16 ***
## fu6 3.32957 0.11180 29.782 <2e-16 ***
## fu7 3.09919 0.13362 23.194 <2e-16 ***
## fu8 2.77133 0.16900 16.398 <2e-16 ***
## fu9 2.89170 0.17148 16.863 <2e-16 ***
## fu10 2.31324 0.24978 9.261 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 13304.0 on 34309 degrees of freedom
## Residual deviance: 8446.4 on 34299 degrees of freedom
## AIC: 10386
##
## Number of Fisher Scoring iterations: 6
## rates
eform(poisson7g)
## exp(beta) 2.5 % 97.5 %
## fu1 13.50580 10.70318 17.04229
## fu2 46.93370 41.22048 53.43877
## fu3 47.69260 41.54909 54.74448
## fu4 37.18090 31.46747 43.93169
## fu5 30.62240 25.17221 37.25264
## fu6 27.92646 22.43119 34.76797
## fu7 22.18005 17.06955 28.82059
## fu8 15.97991 11.47416 22.25502
## fu9 18.02399 12.87902 25.22430
## fu10 10.10714 6.19462 16.49081
As above, we see identical values in the Poisson-estimated rates compared to part (e). The following is the equation for our model:
For example, the estimated death rates per 1000 person-years for the 2nd year of follow-up is with 95% CI (41.22, 53.44) (see output from eform(poisson7g)).
Now estimate the effect of calendar period of diagnosis while adjusting for time since diagnosis. Before fitting this model, predict what you expect the estimated effect to be (i.e., will it be higher, lower, or similar to the value we obtained in part c). Write the linear predictor using pen and paper and draw a graph of the fitted hazard rates.
Is the estimated effect of calendar period of diagnosis consistent with what you expected? Add an interaction between follow-up and calendar period of diagnosis and interpret the results.
summary(poisson7h <- glm(death_cancer ~ fu + year8594 + offset(log(pt/1000)) - 1,
family = poisson, data = melanoma.l2.spl))
##
## Call:
## glm(formula = death_cancer ~ fu + year8594 + offset(log(pt/1000)) -
## 1, family = poisson, data = melanoma.l2.spl)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## fu1 2.74163 0.12380 22.145 < 2e-16 ***
## fu2 3.98515 0.07478 53.293 < 2e-16 ***
## fu3 3.99533 0.07777 51.373 < 2e-16 ***
## fu4 3.73901 0.09065 41.248 < 2e-16 ***
## fu5 3.53601 0.10405 33.983 < 2e-16 ***
## fu6 3.43410 0.11482 29.908 < 2e-16 ***
## fu7 3.19267 0.13564 23.539 < 2e-16 ***
## fu8 2.85007 0.17012 16.754 < 2e-16 ***
## fu9 2.95212 0.17212 17.151 < 2e-16 ***
## fu10 2.34924 0.24992 9.400 < 2e-16 ***
## year8594Diagnosed 85-94 -0.24444 0.06579 -3.715 0.000203 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 13304.0 on 34309 degrees of freedom
## Residual deviance: 8432.6 on 34298 degrees of freedom
## AIC: 10375
##
## Number of Fisher Scoring iterations: 6
## MRR
eform(poisson7h)
## exp(beta) 2.5 % 97.5 %
## fu1 15.5122794 12.1701059 19.7722857
## fu2 53.7934949 46.4598966 62.2846864
## fu3 54.3436875 46.6606643 63.2917772
## fu4 42.0562941 35.2105415 50.2330212
## fu5 34.3296506 27.9962837 42.0957626
## fu6 31.0034865 24.7555832 38.8282581
## fu7 24.3532921 18.6682887 31.7695341
## fu8 17.2890167 12.3870353 24.1308829
## fu9 19.1464542 13.6640198 26.8286138
## fu10 10.4775724 6.4198751 17.0999468
## year8594Diagnosed 85-94 0.7831406 0.6883964 0.8909245
Recall from part (c), the estimated MRR for patients diagnosed in the latter period relative to the early period without adjusting for follow-up time was 0.8832. Now we see that by adjusting for follow-up time, the estimated MRR reduced further to 0.7831, thus showing that the mortality rate for patients diagnosed in the latter period was approximately 22% lower relative to patients diagnosed in the early period. However, we assume here that this number stays the same over the full 10-year period (proportional hazards assumption). We need to check whether we can allow for non-proportional hazards.
# Add interaction term
poisson7h2 <- glm(death_cancer ~ fu*year8594 + offset(log(pt)),
family = poisson, data = melanoma.l2.spl)
## MRR
eform(poisson7h2)
## exp(beta) 2.5 % 97.5 %
## (Intercept) 0.01555564 0.01105892 0.0218808
## fu2 3.59765631 2.44018328 5.3041634
## fu3 4.04160662 2.74548881 5.9496087
## fu4 2.26998933 1.48622497 3.4670737
## fu5 1.92140436 1.23576375 2.9874599
## fu6 1.94474961 1.24608035 3.0351582
## fu7 1.51429324 0.94118270 2.4363857
## fu8 1.03722422 0.60906442 1.7663716
## fu9 1.35444106 0.82243832 2.2305753
## fu10 0.67926283 0.36353299 1.2692053
## year8594Diagnosed 85-94 0.77907144 0.48869327 1.2419903
## fu2:year8594Diagnosed 85-94 0.93136483 0.54614148 1.5883072
## fu3:year8594Diagnosed 85-94 0.73453606 0.42659693 1.2647612
## fu4:year8594Diagnosed 85-94 1.41072301 0.79400185 2.5064670
## fu5:year8594Diagnosed 85-94 1.34685580 0.73234442 2.4770047
## fu6:year8594Diagnosed 85-94 1.06799324 0.56050340 2.0349735
## fu7:year8594Diagnosed 85-94 1.10140218 0.53919080 2.2498284
## fu8:year8594Diagnosed 85-94 1.24831405 0.53921836 2.8899016
## fu9:year8594Diagnosed 85-94 0.59732398 0.20746996 1.7197475
## fu10:year8594Diagnosed 85-94 0.94138428 0.19917538 4.4493671
After including the interaction term, we have cancer-specific MRR in the first year as 0.779, which is not substantially different from the model with proportional hazards above (no interaction with follow-up time).
Now control for age, sex, and calendar period. Write the linear predictor using pen and paper.
The following is the equation for our model:
summary(poisson7i <- glm(death_cancer ~ fu + agegrp + sex + year8594 + offset(log(pt)),
family = poisson, data = melanoma.l2.spl))
##
## Call:
## glm(formula = death_cancer ~ fu + agegrp + sex + year8594 + offset(log(pt)),
## family = poisson, data = melanoma.l2.spl)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.36681 0.14322 -30.490 < 2e-16 ***
## fu2 1.26827 0.13592 9.331 < 2e-16 ***
## fu3 1.30657 0.13806 9.464 < 2e-16 ***
## fu4 1.07575 0.14627 7.354 1.92e-13 ***
## fu5 0.89517 0.15559 5.753 8.75e-09 ***
## fu6 0.81370 0.16368 4.971 6.65e-07 ***
## fu7 0.58637 0.17957 3.265 0.00109 **
## fu8 0.25361 0.20758 1.222 0.22181
## fu9 0.36427 0.21006 1.734 0.08290 .
## fu10 -0.22796 0.27844 -0.819 0.41296
## agegrp45-59 0.28352 0.09417 3.011 0.00261 **
## agegrp60-74 0.62185 0.09088 6.843 7.76e-12 ***
## agegrp75+ 1.22386 0.10444 11.718 < 2e-16 ***
## sexFemale -0.53180 0.06545 -8.125 4.48e-16 ***
## year8594Diagnosed 85-94 -0.32516 0.06618 -4.913 8.97e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 8651.5 on 34308 degrees of freedom
## Residual deviance: 8233.4 on 34294 degrees of freedom
## AIC: 10183
##
## Number of Fisher Scoring iterations: 7
eform(poisson7i)
## exp(beta) 2.5 % 97.5 %
## (Intercept) 0.01269168 0.009585413 0.01680457
## fu2 3.55468470 2.723340923 4.63980959
## fu3 3.69349752 2.817870250 4.84121792
## fu4 2.93219656 2.201336558 3.90570748
## fu5 2.44775331 1.804376456 3.32053559
## fu6 2.25623262 1.637030417 3.10964634
## fu7 1.79745329 1.264170006 2.55569926
## fu8 1.28866663 0.857919555 1.93568462
## fu9 1.43945962 0.953660962 2.17272602
## fu10 0.79615726 0.461304916 1.37407245
## agegrp45-59 1.32779475 1.104004888 1.59694845
## agegrp60-74 1.86237635 1.558526802 2.22546423
## agegrp75+ 3.40028687 2.770846371 4.17271449
## sexFemale 0.58754651 0.516807578 0.66796796
## year8594Diagnosed 85-94 0.72241051 0.634523266 0.82247095
Interpret the estimated mortality rate ratio for the parameter labelled agegrp60-74, including a comment on statistical significance.
The estimated cause-specific MRR for agegrp60-74 is 1.8624, meaning that for patients of the same sex diagnosed in the same calendar period, the mortality rate due to skin melanoma for those diagnosed at 60-74 is approximately 86% higher than for patients diagnosed at ages 0-44 (the reference group), and it is statistically significant.
Is the effect of calendar period strongly confounded by age and sex? That is, does the inclusion of sex and age in the model change the estimate for the effect of calendar period?
The parameter estimate for period changes from 0.78 to 0.72 when age and sex are added to the model. Whether this is “strong confounding”, or even “confounding” is a matter of judgement, and it depends on the research question at hand.
Perform a Wald test of the overall effect of age and interpret the results.
## Test if the effect of age is significant using a likelihood ratio test
drop1(poisson7i, ~agegrp, test = "Chisq")
## Single term deletions
##
## Model:
## death_cancer ~ fu + agegrp + sex + year8594 + offset(log(pt))
## Df Deviance AIC LRT Pr(>Chi)
## <none> 8233.4 10183
## agegrp 3 8377.9 10322 144.59 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## For this we can also use the car package and a Wald test
car::linearHypothesis(poisson7i,c("agegrp45-59 = 0","agegrp60-74 = 0","agegrp75+ = 0"))
## Linear hypothesis test
##
## Hypothesis:
## agegrp45-59 = 0
## agegrp60-74 = 0
## agegrp75+ = 0
##
## Model 1: restricted model
## Model 2: death_cancer ~ fu + agegrp + sex + year8594 + offset(log(pt))
##
## Res.Df Df Chisq Pr(>Chisq)
## 1 34297
## 2 34294 3 155.82 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## ADVANCED:
## Alternative approach for the likelihood ratio test
# poisson7i_2 <- update(poisson7i,. ~ . - agegrp)
# anova(poisson7i_2, poisson7i, test = "Chisq")
Age (modelled as a categorical variable with 4 levels) is highly significant in the model.
Is the effect of sex modified by calendar period (whilst adjusting for age and follow-up)? Fit an appropriate interaction term to test this hypothesis. Write the linear predictor using pen and paper.
summary(poisson7j <- glm(death_cancer ~ fu + agegrp + year8594*sex + offset(log(pt)),
family = poisson, data = melanoma.l2.spl))
##
## Call:
## glm(formula = death_cancer ~ fu + agegrp + year8594 * sex + offset(log(pt)),
## family = poisson, data = melanoma.l2.spl)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.37900 0.14596 -30.001 < 2e-16 ***
## fu2 1.26830 0.13592 9.331 < 2e-16 ***
## fu3 1.30659 0.13806 9.464 < 2e-16 ***
## fu4 1.07569 0.14627 7.354 1.92e-13 ***
## fu5 0.89511 0.15559 5.753 8.77e-09 ***
## fu6 0.81360 0.16369 4.971 6.68e-07 ***
## fu7 0.58630 0.17958 3.265 0.001095 **
## fu8 0.25340 0.20759 1.221 0.222197
## fu9 0.36405 0.21007 1.733 0.083090 .
## fu10 -0.22829 0.27845 -0.820 0.412297
## agegrp45-59 0.28270 0.09419 3.001 0.002688 **
## agegrp60-74 0.62118 0.09089 6.835 8.23e-12 ***
## agegrp75+ 1.22364 0.10444 11.717 < 2e-16 ***
## year8594Diagnosed 85-94 -0.29917 0.08840 -3.384 0.000714 ***
## sexFemale -0.50562 0.08813 -5.737 9.64e-09 ***
## year8594Diagnosed 85-94:sexFemale -0.05792 0.13061 -0.443 0.657440
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 8651.5 on 34308 degrees of freedom
## Residual deviance: 8233.2 on 34293 degrees of freedom
## AIC: 10185
##
## Number of Fisher Scoring iterations: 7
eform(poisson7j)
## exp(beta) 2.5 % 97.5 %
## (Intercept) 0.01253791 0.009418543 0.01669039
## fu2 3.55479530 2.723425214 4.63995469
## fu3 3.69354707 2.817905687 4.84128693
## fu4 2.93201254 2.201195141 3.90546816
## fu5 2.44760423 1.804262241 3.32034133
## fu6 2.25601958 1.636868208 3.10936723
## fu7 1.79732528 1.264071466 2.55553445
## fu8 1.28840069 0.857735550 1.93530086
## fu9 1.43915154 0.953447823 2.17228159
## fu10 0.79589580 0.461149435 1.37363308
## agegrp45-59 1.32670920 1.103059120 1.59570531
## agegrp60-74 1.86113111 1.557443264 2.22403543
## agegrp75+ 3.39953913 2.770276830 4.17173697
## year8594Diagnosed 85-94 0.74143513 0.623488755 0.88169360
## sexFemale 0.60313385 0.507452579 0.71685602
## year8594Diagnosed 85-94:sexFemale 0.94372451 0.730577239 1.21905789
The interaction term is not statistically significant indicating that there is no evidence that the effect of sex is modified by period. The following is the equation for our model:
Based on the interaction model you fitted in exercise 7j, estimate the hazard ratio for the effect of sex (with 95% confidence interval) for each calendar period.
Using hand-calculation on the estimates from exercise 7j.
# hand calculations
hz7k <- exp(coef(poisson7j))
hz7k["sexFemale"]
## sexFemale
## 0.6031338
hz7k["sexFemale"]*hz7k["year8594Diagnosed 85-94:sexFemale"]
## sexFemale
## 0.5691922
For the same year of follow-up and the same age-group, the effect of sex for patients diagnosed in 1975–84 is =0.6031 and the effect of sex for patients diagnosed in 1985–94 is = 0.6031×0.9437=0.5692.
Using car::lincom()
function.
## You will need the "car" package to use lincom. If it is not already installed:
## install.packages("car")
lincom(poisson7j, c("sexFemale + year8594Diagnosed 85-94:sexFemale"), eform = TRUE) %>%
print.data.frame(row.names = FALSE)
## Estimate 2.5 % 97.5 % Chisq Pr(>Chisq)
## 0.5691922 0.4705541 0.6885069 33.68456 6.481293e-09
The advantage of lincom
is that we also get a confidence
interval (not easy to calculate by hand since the SE is a function of
variances and covariances).
Creating appropriate dummy variables that represent the effects of sex for each calendar period.
## Create dummies and Poisson regression
melanoma.l2.spl <- melanoma.l2.spl %>%
mutate(femaleEarly = as.numeric(sex=="Female" & year8594=="Diagnosed 75-84"),
femaleLate = as.numeric(sex=="Female" & year8594=="Diagnosed 85-94"))
poisson7k <- glm(death_cancer ~ fu + agegrp + year8594 + femaleEarly +
femaleLate + offset(log(pt)), family = poisson,
data = melanoma.l2.spl)
eform(poisson7k)
## exp(beta) 2.5 % 97.5 %
## (Intercept) 0.01253791 0.009418543 0.01669039
## fu2 3.55479530 2.723425214 4.63995469
## fu3 3.69354707 2.817905687 4.84128693
## fu4 2.93201254 2.201195141 3.90546816
## fu5 2.44760423 1.804262241 3.32034133
## fu6 2.25601958 1.636868208 3.10936723
## fu7 1.79732528 1.264071466 2.55553445
## fu8 1.28840069 0.857735550 1.93530086
## fu9 1.43915154 0.953447823 2.17228159
## fu10 0.79589580 0.461149435 1.37363308
## agegrp45-59 1.32670920 1.103059120 1.59570531
## agegrp60-74 1.86113111 1.557443264 2.22403543
## agegrp75+ 3.39953913 2.770276830 4.17173697
## year8594Diagnosed 85-94 0.74143513 0.623488755 0.88169360
## femaleEarly 0.60313385 0.507452579 0.71685602
## femaleLate 0.56919219 0.470554120 0.68850689
Using the formula to specify the interactions to repeat the previous model.
## Add interaction term
poisson7k2 <- glm(death_cancer ~ fu + agegrp + year8594 + year8594:sex +
offset(log(pt)), family = poisson,
data = melanoma.l2.spl)
eform(poisson7k2)
## exp(beta) 2.5 % 97.5 %
## (Intercept) 0.01253791 0.009418543 0.01669039
## fu2 3.55479530 2.723425214 4.63995469
## fu3 3.69354707 2.817905687 4.84128693
## fu4 2.93201254 2.201195141 3.90546816
## fu5 2.44760423 1.804262241 3.32034133
## fu6 2.25601958 1.636868208 3.10936723
## fu7 1.79732528 1.264071466 2.55553445
## fu8 1.28840069 0.857735550 1.93530086
## fu9 1.43915154 0.953447823 2.17228159
## fu10 0.79589580 0.461149435 1.37363308
## agegrp45-59 1.32670920 1.103059120 1.59570531
## agegrp60-74 1.86113111 1.557443264 2.22403543
## agegrp75+ 3.39953913 2.770276830 4.17173697
## year8594Diagnosed 85-94 0.74143513 0.623488755 0.88169360
## year8594Diagnosed 75-84:sexFemale 0.60313385 0.507452579 0.71685602
## year8594Diagnosed 85-94:sexFemale 0.56919219 0.470554120 0.68850689
Now fit a separate model for each calendar period in order to estimate the hazard ratio for the effect of sex (with 95% confidence interval) for each calendar period.
Why do the estimates differ from those you obtained in the previous part?
poisson7l.early <- glm(death_cancer ~ fu + agegrp + sex + offset(log(pt)),
family = poisson, data = melanoma.l2.spl,
subset = year8594 == "Diagnosed 75-84")
poisson7l.late <- glm(death_cancer ~ fu + agegrp + sex + offset(log(pt)),
family = poisson, data = melanoma.l2.spl,
subset = year8594 == "Diagnosed 85-94")
# parameters for poisson7l.early
est_7l_early <- cbind(Params = rownames(eform(poisson7l.early)),
data.frame(est_7l_early = eform(poisson7l.early)[, 1], row.names = NULL))
# parameters for poisson7l.early
est_7l_late <- cbind(Params = rownames(eform(poisson7l.late)),
data.frame(est_7l_late = eform(poisson7l.late)[, 1], row.names = NULL))
# parameters from part 7i
est_7i <- cbind(Params = rownames(eform(poisson7i)),
data.frame(est_7i = eform(poisson7i)[, 1], row.names = NULL))
# parameters from part 7j
est_7j <- cbind(Params = rownames(eform(poisson7j)),
data.frame(est_7j = eform(poisson7j)[, 1], row.names = NULL))
# put all the estimates together to compare
full_join(est_7i,
est_7j) %>%
full_join(est_7l_early) %>%
full_join(est_7l_late) %>%
mutate(across(est_7i:est_7l_late, ~round(., 3))) %>%
knitr::kable(align = "rccc")
Params | est_7i | est_7j | est_7l_early | est_7l_late |
---|---|---|---|---|
(Intercept) | 0.013 | 0.013 | 0.013 | 0.009 |
fu2 | 3.555 | 3.555 | 3.659 | 3.465 |
fu3 | 3.693 | 3.694 | 4.195 | 3.186 |
fu4 | 2.932 | 2.932 | 2.399 | 3.557 |
fu5 | 2.448 | 2.448 | 2.058 | 2.980 |
fu6 | 2.256 | 2.256 | 2.108 | 2.479 |
fu7 | 1.797 | 1.797 | 1.658 | 2.039 |
fu8 | 1.289 | 1.288 | 1.148 | 1.596 |
fu9 | 1.439 | 1.439 | 1.512 | 1.003 |
fu10 | 0.796 | 0.796 | 0.766 | 0.803 |
agegrp45-59 | 1.328 | 1.327 | 1.442 | 1.186 |
agegrp60-74 | 1.862 | 1.861 | 1.812 | 1.930 |
agegrp75+ | 3.400 | 3.400 | 2.782 | 3.992 |
sexFemale | 0.588 | 0.603 | 0.617 | 0.555 |
year8594Diagnosed 85-94 | 0.722 | 0.741 | NA | NA |
year8594Diagnosed 85-94:sexFemale | NA | 0.944 | NA | NA |
By fitting stratified models we get slightly different estimates (0.617 and 0.555) since the models stratified by calendar period imply that all estimates are modified by calendar period. That is, we are actually estimating the following model:
# Poisson-regression with effects specific for diagnosis period
poisson7l2 <- glm(death_cancer ~ fu + fu:year8594 + agegrp + agegrp:year8594
+ sex + sex:year8594 + offset(log(pt)),
family = poisson, data = melanoma.l2.spl)
eform(poisson7l2)
## exp(beta) 2.5 % 97.5 %
## (Intercept) 0.01290368 0.008761852 0.01900338
## fu2 3.65871657 2.481529705 5.39433679
## fu3 4.19499847 2.849324333 6.17620534
## fu4 2.39915022 1.570376752 3.66531265
## fu5 2.05775940 1.322929450 3.20075553
## fu6 2.10802431 1.349942800 3.29181835
## fu7 1.65830221 1.029983069 2.66991399
## fu8 1.14804717 0.673627936 1.95658794
## fu9 1.51184461 0.917092926 2.49230372
## fu10 0.76587284 0.409489206 1.43242168
## agegrp45-59 1.44229095 1.137497545 1.82875400
## agegrp60-74 1.81153338 1.433707489 2.28892799
## agegrp75+ 2.78152106 2.059922700 3.75589792
## sexFemale 0.61658148 0.518502522 0.73321286
## fu1:year8594Diagnosed 85-94 0.69349480 0.400391639 1.20116153
## fu2:year8594Diagnosed 85-94 0.65686156 0.445850496 0.96773945
## fu3:year8594Diagnosed 85-94 0.52676828 0.353558146 0.78483503
## fu4:year8594Diagnosed 85-94 1.02830166 0.663463181 1.59376485
## fu5:year8594Diagnosed 85-94 1.00420087 0.620801647 1.62438260
## fu6:year8594Diagnosed 85-94 0.81552432 0.482996343 1.37698747
## fu7:year8594Diagnosed 85-94 0.85250167 0.464550575 1.56443481
## fu8:year8594Diagnosed 85-94 0.96413334 0.455496911 2.04074511
## fu9:year8594Diagnosed 85-94 0.46016747 0.171415140 1.23532904
## fu10:year8594Diagnosed 85-94 0.72704039 0.161239682 3.27827322
## year8594Diagnosed 85-94:agegrp45-59 0.82239328 0.563992308 1.19918426
## year8594Diagnosed 85-94:agegrp60-74 1.06558347 0.742295370 1.52967158
## year8594Diagnosed 85-94:agegrp75+ 1.43517937 0.944316163 2.18119726
## year8594Diagnosed 85-94:sexFemale 0.90008164 0.695365474 1.16506642
Split by month and fit a model to smooth for time using natural splines, adjusting for age group and calendar period. Plot the baseline hazard.
## Split follow up by month using the dataset where we restricted follow-up to 10 years
melanoma.l2.spl.m <- survSplit(Surv(surv_mm, death_cancer) ~ ., data = melanoma.l2,
cut = 1:120, start = "tstart", end = "tstop")
# take middle of intervals to be used within splines
# note that in this time-split dataset the mid-point is always >0,
# e.g. for interval 0m-0.5m, the mid-point is 0.25m (this is )
melanoma.l2.spl.m <- transform(melanoma.l2.spl.m,
risk_time = tstop - tstart,
mid = (tstop + tstart)/2)
# look at two patients before time-split
melanoma.l2 %>%
filter(id %in% c(1926, 3073))
## id sex agegrp year8594 surv_mm status death_cancer
## 1 1926 Male 75+ Diagnosed 75-84 3.5 Dead: cancer 1
## 2 3073 Female 75+ Diagnosed 75-84 3.5 Dead: other 0
## surv_yy1
## 1 0.2916667
## 2 0.2916667
# look at two patients after time-split
melanoma.l2.spl.m %>%
filter(id %in% c(1926, 3073))
## id sex agegrp year8594 status surv_yy1 tstart tstop
## 1 1926 Male 75+ Diagnosed 75-84 Dead: cancer 0.2916667 0 1.0
## 2 1926 Male 75+ Diagnosed 75-84 Dead: cancer 0.2916667 1 2.0
## 3 1926 Male 75+ Diagnosed 75-84 Dead: cancer 0.2916667 2 3.0
## 4 1926 Male 75+ Diagnosed 75-84 Dead: cancer 0.2916667 3 3.5
## 5 3073 Female 75+ Diagnosed 75-84 Dead: other 0.2916667 0 1.0
## 6 3073 Female 75+ Diagnosed 75-84 Dead: other 0.2916667 1 2.0
## 7 3073 Female 75+ Diagnosed 75-84 Dead: other 0.2916667 2 3.0
## 8 3073 Female 75+ Diagnosed 75-84 Dead: other 0.2916667 3 3.5
## death_cancer risk_time mid
## 1 0 1.0 0.50
## 2 0 1.0 1.50
## 3 0 1.0 2.50
## 4 1 0.5 3.25
## 5 0 1.0 0.50
## 6 0 1.0 1.50
## 7 0 1.0 2.50
## 8 0 0.5 3.25
# fit a poisson model using splines for follow-up time (middle of intervals),
# while adjusting for age, period
library(splines)
poisson7m <- glm(death_cancer ~ ns(mid, df = 4) + agegrp + year8594 + offset(log(risk_time)),
family = poisson, data = melanoma.l2.spl.m)
# the estimated coefficients for splines are not interpretable on their own,
# but they are used for estimating the smooth hazard rates
summary(poisson7m)
##
## Call:
## glm(formula = death_cancer ~ ns(mid, df = 4) + agegrp + year8594 +
## offset(log(risk_time)), family = poisson, data = melanoma.l2.spl.m)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.79787 0.19362 -40.273 < 2e-16 ***
## ns(mid, df = 4)1 1.44147 0.18278 7.886 3.11e-15 ***
## ns(mid, df = 4)2 0.55711 0.21388 2.605 0.009195 **
## ns(mid, df = 4)3 2.96789 0.41709 7.116 1.11e-12 ***
## ns(mid, df = 4)4 -1.01241 0.24398 -4.150 3.33e-05 ***
## agegrp45-59 0.32006 0.09404 3.403 0.000665 ***
## agegrp60-74 0.63310 0.09083 6.970 3.17e-12 ***
## agegrp75+ 1.16187 0.10407 11.165 < 2e-16 ***
## year8594Diagnosed 85-94 -0.29637 0.06603 -4.489 7.17e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 12857 on 390446 degrees of freedom
## Residual deviance: 12495 on 390438 degrees of freedom
## AIC: 14433
##
## Number of Fisher Scoring iterations: 8
# create data frames with covariate patterns for which we want to estimate
# the hazard rates per 1000 person-years (risk_time = 1000*12)
df_cov_early <- data.frame(mid = 0:120, agegrp = "0-44", year8594 = "Diagnosed 75-84",
risk_time = 1000*12)
df_cov_later <- mutate(df_cov_early, year8594 = "Diagnosed 85-94")
# estimate the log baseline hazard rate by using predict function, and then taking exp()
hazpois7m_early <- as.data.frame(predict(poisson7m, newdata = df_cov_early, se.fit = T))
# estimate the log hazard rate by using predict function, and then taking exp()
hazpois7m_later <- as.data.frame(predict(poisson7m, newdata = df_cov_later, se.fit = T))
# create a dataframe with both hazard rates and confidence intervals
df_haz_all <- full_join(
cbind(df_cov_early, hazpois7m_early) %>%
mutate(which_haz = paste0("Age ", agegrp, ", ", year8594)) ,
cbind(df_cov_later, hazpois7m_later) %>%
mutate(which_haz = paste0("Age ", agegrp, ", ", year8594))
) %>%
mutate(mid_y = mid/12,
haz = exp(fit),
lower = exp(fit - 1.96*se.fit),
upper = exp(fit + 1.96*se.fit))
# plot the hazards rates
ggplot(aes(x = mid_y, y = haz, colour = which_haz, fill = which_haz), data = df_haz_all) +
geom_line(linewidth = 1) +
geom_ribbon(aes(ymin = lower, ymax = upper), colour = NA, alpha = 0.3) +
scale_x_continuous(name = "Years since diagnosis",
breaks = 0:10) +
labs(title = "Cancer deaths by years since diagnosis",
y = "Mortality rate per 1000 person-years",
colour = "",
fill = "") +
theme_minimal() +
theme(panel.grid.minor.x = element_blank(),
axis.text = element_text(size = 12),
legend.position = "top")
Now fit a model with time-varying hazard ratio for calendar period (i.e. non-proportional hazards). Plot the time-varying hazard ratio.
poisson7n <- glm(death_cancer ~ ns(mid, df = 4) + agegrp + year8594 +
ifelse(year8594=="Diagnosed 85-94", 1, 0):ns(mid, df = 3) +
offset(log(risk_time)), family = poisson,
data = melanoma.l2.spl.m)
# the estimated coefficients for splines are not interpretable on their own,
# but they are used for estimating the smooth hazard rates
summary(poisson7n)
##
## Call:
## glm(formula = death_cancer ~ ns(mid, df = 4) + agegrp + year8594 +
## ifelse(year8594 == "Diagnosed 85-94", 1, 0):ns(mid, df = 3) +
## offset(log(risk_time)), family = poisson, data = melanoma.l2.spl.m)
##
## Coefficients:
## Estimate
## (Intercept) -7.79533
## ns(mid, df = 4)1 1.33549
## ns(mid, df = 4)2 0.36834
## ns(mid, df = 4)3 3.12608
## ns(mid, df = 4)4 -0.93078
## agegrp45-59 0.32030
## agegrp60-74 0.63369
## agegrp75+ 1.16604
## year8594Diagnosed 85-94 -0.34911
## ifelse(year8594 == "Diagnosed 85-94", 1, 0):ns(mid, df = 3)1 0.89096
## ifelse(year8594 == "Diagnosed 85-94", 1, 0):ns(mid, df = 3)2 -0.37255
## ifelse(year8594 == "Diagnosed 85-94", 1, 0):ns(mid, df = 3)3 -0.69867
## Std. Error z value
## (Intercept) 0.24437 -31.900
## ns(mid, df = 4)1 0.25920 5.152
## ns(mid, df = 4)2 0.24150 1.525
## ns(mid, df = 4)3 0.52387 5.967
## ns(mid, df = 4)4 0.25938 -3.589
## agegrp45-59 0.09405 3.406
## agegrp60-74 0.09084 6.976
## agegrp75+ 0.10406 11.205
## year8594Diagnosed 85-94 0.27807 -1.255
## ifelse(year8594 == "Diagnosed 85-94", 1, 0):ns(mid, df = 3)1 0.33166 2.686
## ifelse(year8594 == "Diagnosed 85-94", 1, 0):ns(mid, df = 3)2 0.73263 -0.509
## ifelse(year8594 == "Diagnosed 85-94", 1, 0):ns(mid, df = 3)3 0.55496 -1.259
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## ns(mid, df = 4)1 2.57e-07 ***
## ns(mid, df = 4)2 0.127203
## ns(mid, df = 4)3 2.41e-09 ***
## ns(mid, df = 4)4 0.000333 ***
## agegrp45-59 0.000660 ***
## agegrp60-74 3.03e-12 ***
## agegrp75+ < 2e-16 ***
## year8594Diagnosed 85-94 0.209305
## ifelse(year8594 == "Diagnosed 85-94", 1, 0):ns(mid, df = 3)1 0.007224 **
## ifelse(year8594 == "Diagnosed 85-94", 1, 0):ns(mid, df = 3)2 0.611099
## ifelse(year8594 == "Diagnosed 85-94", 1, 0):ns(mid, df = 3)3 0.208043
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 12857 on 390446 degrees of freedom
## Residual deviance: 12488 on 390435 degrees of freedom
## AIC: 14432
##
## Number of Fisher Scoring iterations: 9
# estimate the log baseline hazard rate by using predict function, and then taking exp()
hazpois7n_early <- as.data.frame(predict(poisson7n, newdata = df_cov_early, se.fit = T))
# estimate the log hazard rate by using predict function, and then taking exp()
hazpois7n_later <- as.data.frame(predict(poisson7n, newdata = df_cov_later, se.fit = T))
# create a dataframe with both hazard rates and confidence intervals
df_haz_all <- full_join(
cbind(df_cov_early, hazpois7n_early) %>%
mutate(which_haz = paste0("Age ", agegrp, ", ", year8594)) ,
cbind(df_cov_later, hazpois7n_later) %>%
mutate(which_haz = paste0("Age ", agegrp, ", ", year8594))
) %>%
mutate(mid_y = mid/12,
haz = exp(fit),
lower = exp(fit - 1.96*se.fit),
upper = exp(fit + 1.96*se.fit))
# plot the hazards rates
ggplot(aes(x = mid_y, y = haz, colour = which_haz, fill = which_haz), data = df_haz_all) +
geom_line(linewidth = 1) +
geom_ribbon(aes(ymin = lower, ymax = upper), colour = NA, alpha = 0.3) +
scale_x_continuous(name = "Years since diagnosis",
breaks = 0:10) +
labs(title = "Cancer deaths by years since diagnosis",
y = "Mortality rate per 1000 person-years",
colour = "",
fill = "") +
theme_minimal() +
theme(panel.grid.minor.x = element_blank(),
axis.text = element_text(size = 12),
legend.position = "top")
## get log(RR) confidence interval using predictnl (delta method)
library(rstpm2)
pred_RR <- predictnl(poisson7n, function(object)
log(predict(object, newdata = df_cov_later, type = "response")/
predict(object, newdata = df_cov_early, type = "response")))
pred_RR2 <- transform(pred_RR,
time_m = 0:120,
rr = exp(fit),
ci = exp(confint(pred_RR)))
ggplot(pred_RR2, aes(x = time_m/12, y = rr)) +
geom_line(linewidth = 0.8) +
geom_ribbon(aes(ymin = ci.2.5.., ymax = ci.97.5..), alpha = 0.3) +
geom_hline(yintercept = 1, linetype = 2) +
xlab("Time since diagnosis (years)") +
ylab("Rate ratio") +
scale_x_continuous(name = "Years since diagnosis",
breaks = 0:10) +
labs(title = "Cancer deaths by years since diagnosis",
subtitle = "(later vs early period diagnosis)",
y = "Rate ratio") +
theme_minimal() +
theme(panel.grid.minor.x = element_blank(),
axis.text = element_text(size = 12),
legend.position = "top")
## Calculate the rate difference
pred_rd <- predictnl(poisson7n, function(object)
predict(object, newdata = df_cov_later, type = "response") -
predict(object, newdata = df_cov_early, type = "response"))
pred_rd2 <- pred2 <- transform(pred_rd,
time_m = 0:120,
rd = fit,
ci = confint(pred_rd))
ggplot(pred_rd2, aes(x = time_m/12, y = rd)) +
geom_line(linewidth = 0.8) +
geom_ribbon(aes(ymin = ci.2.5.., ymax = ci.97.5..), alpha = 0.3) +
geom_hline(yintercept = 0, linetype = 2) +
xlab("Time since diagnosis (years)") +
ylab("Rate ratio") +
scale_x_continuous(name = "Years since diagnosis",
breaks = 0:10) +
labs(y = "Rate difference") +
theme_minimal() +
theme(panel.grid.minor.x = element_blank(),
axis.text = element_text(size = 12),
legend.position = "top")