Exercise 25. Localised melanoma : generating and analysing a nested case-control (NCC) study

library(biostat3) # cox and conditional logistic analyses
library(Epi)      # sample a nested case-control study from a cohort

## Get the data for exercise 25 and have a look at it
mel <- subset(biostat3::melanoma, stage=="Localised") |>
    transform(dc = (status=="Dead: cancer" & surv_mm<120)+0,
              surv_10y = pmin(120, surv_mm))
with(mel,table(dc, status))

str(subset(mel, select=c(sex,year8594,agegrp))) # Check structure of risk factors/confounders

## 'data.frame':    5318 obs. of  3 variables:
##  $ sex     : Factor w/ 2 levels "Male","Female": 2 2 2 2 2 2 2 1 2 2 ...
##  $ year8594: Factor w/ 2 levels "Diagnosed 75-84",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ agegrp  : Factor w/ 4 levels "0-44","45-59",..: 4 4 4 4 4 4 4 4 4 4 ...

out_coh <- coxph(Surv(surv_10y,dc) ~ sex + year8594 + agegrp, data = mel)
summary(out_coh)

## Call:
## coxph(formula = Surv(surv_10y, dc) ~ sex + year8594 + agegrp, 
##     data = mel)
## 
##   n= 5318, number of events= 960 
## 
##                             coef exp(coef) se(coef)      z Pr(>|z|)    
## sexFemale               -0.53061   0.58825  0.06545 -8.107 5.19e-16 ***
## year8594Diagnosed 85-94 -0.33339   0.71649  0.06618 -5.037 4.72e-07 ***
## agegrp45-59              0.28283   1.32688  0.09417  3.003  0.00267 ** 
## agegrp60-74              0.62006   1.85904  0.09088  6.823 8.90e-12 ***
## agegrp75+                1.21801   3.38045  0.10443 11.663  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## sexFemale                  0.5882     1.7000    0.5174    0.6688
## year8594Diagnosed 85-94    0.7165     1.3957    0.6293    0.8157
## agegrp45-59                1.3269     0.7536    1.1032    1.5959
## agegrp60-74                1.8590     0.5379    1.5557    2.2215
## agegrp75+                  3.3804     0.2958    2.7547    4.1483
## 
## Concordance= 0.646  (se = 0.009 )
## Likelihood ratio test= 212.7  on 5 df,   p=<2e-16
## Wald test            = 217.9  on 5 df,   p=<2e-16
## Score (logrank) test = 226.8  on 5 df,   p=<2e-16

(a)

There are 5318 individuals in the study that we would need to collect data for if we were to use the complete cohort of patients.

print(n_ind <- length(mel$id))

## [1] 5318

(b)

960 cancer patients die from melanoma during the first 10 years of follow-up.

table(mel$dc, useNA="always")

## 
##    0    1 <NA> 
## 4358  960    0

print(ncase <-  table (mel$dc, useNA="always")[2])

##   1 
## 960

(c1)

set.seed(12345)
nccdata <- ccwc(entry=0, exit=surv_10y , fail=dc, origin=0, controls=1,
                include=list(sex,year8594,agegrp,dc,id), data=mel)

Sampling risk sets: ………………………………………………………………………………………………………

tail(nccdata, 8) |> kable("html")

	Set	Map	Time	Fail	sex	year8594	agegrp	dc	id
1913	114	2674	87.5	1	Female	Diagnosed 85-94	0-44	1	3784
1914	114	1566	87.5	0	Female	Diagnosed 75-84	0-44	0	2202
1915	115	4275	98.5	1	Male	Diagnosed 85-94	60-74	1	6168
1916	115	242	98.5	0	Male	Diagnosed 75-84	60-74	0	348
1917	116	5287	2.5	1	Male	Diagnosed 85-94	45-59	1	7713
1918	116	625	2.5	0	Male	Diagnosed 75-84	45-59	0	894
1919	117	5314	119.5	1	Male	Diagnosed 85-94	60-74	1	7753
1920	117	1364	119.5	0	Male	Diagnosed 75-84	0-44	0	1922

(c2)

out_ncc <- clogit(Fail ~ sex + year8594 + agegrp + strata(Set), data=nccdata)
summary(out_ncc)

## Call:
## coxph(formula = Surv(rep(1, 1920L), Fail) ~ sex + year8594 + 
##     agegrp + strata(Set), data = nccdata, method = "exact")
## 
##   n= 1920, number of events= 960 
## 
##                             coef exp(coef) se(coef)      z Pr(>|z|)    
## sexFemale               -0.58162   0.55899  0.09588 -6.066 1.31e-09 ***
## year8594Diagnosed 85-94 -0.23264   0.79244  0.09646 -2.412 0.015877 *  
## agegrp45-59              0.41896   1.52038  0.12593  3.327 0.000878 ***
## agegrp60-74              0.70976   2.03351  0.12527  5.666 1.46e-08 ***
## agegrp75+                1.35147   3.86310  0.16460  8.210  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## sexFemale                  0.5590     1.7889    0.4632    0.6746
## year8594Diagnosed 85-94    0.7924     1.2619    0.6559    0.9574
## agegrp45-59                1.5204     0.6577    1.1878    1.9460
## agegrp60-74                2.0335     0.4918    1.5908    2.5994
## agegrp75+                  3.8631     0.2589    2.7979    5.3339
## 
## Concordance= 0.653  (se = 0.015 )
## Likelihood ratio test= 109.1  on 5 df,   p=<2e-16
## Wald test            = 100.9  on 5 df,   p=<2e-16
## Score (logrank) test = 106.3  on 5 df,   p=<2e-16

(d)

print(n_uni <- length(unique(nccdata$id)))

## [1] 1752

n_uni

## [1] 1752

(e)

Note that, since every nested case-control study is different, the parameter estimates you obtain will not be identical to those above. However, the hazard ratios from the two models should be very similar. The standard errors are slightly larger for the nested case-control study since the estimates are based on a sample from the full cohort. Loss of precision is the trade-off we have to make when designing a nested case-control study. The precision can be improved by adding more controls to each case.

library(broom)
cat("Full cohort:\n")

## Full cohort:

print(tidy_coh <- tidy(out_coh, conf.int=TRUE, exponentiate=TRUE))

## # A tibble: 5 × 7
##   term                    estimate std.error statistic    p.value conf.low conf.high
##   <chr>                      <dbl>     <dbl>     <dbl>      <dbl>    <dbl>     <dbl>
## 1 sexFemale                0.58825  0.065450   -8.1070 5.1870e-16  0.51743   0.66876
## 2 year8594Diagnosed 85-94  0.71649  0.066183   -5.0375 4.7174e- 7  0.62932   0.81573
## 3 agegrp45-59              1.3269   0.094173    3.0033 2.6706e- 3  1.1032    1.5959 
## 4 agegrp60-74              1.8590   0.090876    6.8232 8.9042e-12  1.5557    2.2215 
## 5 agegrp75+                3.3804   0.10443    11.663  1.9694e-31  2.7547    4.1483

cat("Nested case-control study:\n")

## Nested case-control study:

print(tidy_ncc <- tidy(out_ncc, conf.int=TRUE, exponentiate=TRUE))

## # A tibble: 5 × 7
##   term                    estimate std.error statistic    p.value conf.low conf.high
##   <chr>                      <dbl>     <dbl>     <dbl>      <dbl>    <dbl>     <dbl>
## 1 sexFemale                0.55899  0.095884   -6.0659 1.3125e- 9  0.46322   0.67456
## 2 year8594Diagnosed 85-94  0.79244  0.096459   -2.4117 1.5877e- 2  0.65594   0.95736
## 3 agegrp45-59              1.5204   0.12593     3.3269 8.7804e- 4  1.1878    1.9460 
## 4 agegrp60-74              2.0335   0.12527     5.6660 1.4615e- 8  1.5908    2.5994 
## 5 agegrp75+                3.8631   0.16460     8.2104 2.2041e-16  2.7979    5.3339

tibble(term = tidy_coh$term, variance.ratio = (tidy_ncc$std.error/tidy_coh$std.error)^2)

## # A tibble: 5 × 2
##   term                    variance.ratio
##   <chr>                            <dbl>
## 1 sexFemale                       2.1462
## 2 year8594Diagnosed 85-94         2.1242
## 3 agegrp45-59                     1.7881
## 4 agegrp60-74                     1.9001
## 5 agegrp75+                       2.4843

tidy_coh <- tidy(out_coh)
tidy_ncc <- tidy(out_ncc)
f <- function(x,mu,sigma) {
    mu=as.numeric(mu)
    sigma=as.numeric(sigma)
    xp=exp(mu+x*sigma)
    cbind(x=xp,y=dlnorm(xp,mu,sigma))
}
x <- seq(-4,4,length=301)
par(mfrow=c(2,3))
for (i in 1:5) {
    f_coh = f(x,tidy_coh[i,"estimate"], tidy_coh[i,"std.error"])
    f_ncc = f(x,tidy_ncc[i,"estimate"], tidy_ncc[i,"std.error"])
    plot(rbind(f_coh,f_ncc), type="n", xlab="Hazard ratio", ylab="Density",
         main=as.character(tidy_coh$term[i]))
    polygon(f_coh,
            col=alpha("green",0.2), border=alpha("green",0.2))
    polygon(f_ncc,
            col=alpha("blue",0.2), border=alpha("blue",0.2))
}
legend("topright", c("Cohort","NCC"), col=c(alpha("green",0.2),alpha("blue",0.2)), lwd=5,
       bty="n")

(f)

We see that there is sampling variation in the parameter estimates from the five nested case-control studies but they are centered on the full cohort estimate. We see that the standard errors of the estimates from the nested case-control studies are larger than for the full cohort but there is some sampling variation.

set.seed(54321)
M <- 20
tidys_ncc <- lapply(1:M, function(i)
    ccwc(entry=0, exit=surv_10y , fail=dc, origin=0, controls=1,
         include=list(sex,year8594,agegrp), data=mel, silent=TRUE) |>
    clogit(formula = Fail ~ sex + year8594 + agegrp + strata(Set)) |> 
    suppressWarnings() |> tidy())
tidy_coh <- tidy(out_coh)
f <- function(x,mu,sigma) {
    mu=as.numeric(mu)
    sigma=as.numeric(sigma)
    xp=exp(mu+x*sigma)
    cbind(x=xp,y=dlnorm(xp,mu,sigma))
}
x <- seq(-4,4,length=301)
par(mfrow=c(2,3))
for (i in 1:5) {
    f_coh = f(x,tidy_coh[i,"estimate"], tidy_coh[i,"std.error"])
    f_ncc = lapply(tidys_ncc, function(object) 
        f(x,object[i,"estimate"], object[i,"std.error"]))
    plot(do.call(what=rbind,c(list(f_coh),f_ncc)), type="n", xlab="Hazard ratio", ylab="Density",
         main=as.character(tidy_coh$term[i]))
    for (j in 1:length(tidys_ncc))
        polygon(f_ncc[[j]],
                col=alpha("blue",0.02), border=alpha("blue",0.02))
    polygon(f_coh,
            col=alpha("green",0.2), border=alpha("green",0.2))
}
plot(0:1,0:1,type="n",axes=FALSE, xlab="",ylab="")
legend("center", c("Cohort","NCC"), col=c(alpha("green",0.2),alpha("blue",0.2)), lwd=5,
       bty="n")

Biostatistics III in R

Code by Bénédicte Delcoigne and Mark Clements

Exercise 25. Localised melanoma : generating and analysing a nested case-control (NCC) study

(a)

(b)

(c1)

(c2)

(d)

(e)

(f)