Calibration in case-control sampling

National Wilms’ Tumour Group data

set.seed(2017-12-3)
nwts <- read.table("nwts-share.txt", header=TRUE)
summary(nwts)

##       trel               tsur              relaps            dead       
##  Min.   : 0.01095   Min.   : 0.01095   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 4.94182   1st Qu.: 6.24093   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 9.77139   Median :10.36003   Median :0.0000   Median :0.0000  
##  Mean   : 9.64874   Mean   :10.32634   Mean   :0.1709   Mean   :0.1134  
##  3rd Qu.:14.01095   3rd Qu.:14.42847   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :22.50240   Max.   :22.50240   Max.   :1.0000   Max.   :1.0000  
##      study           stage           histol           instit       
##  Min.   :3.000   Min.   :1.000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:3.000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :4.000   Median :2.000   Median :0.0000   Median :0.00000  
##  Mean   :3.573   Mean   :2.079   Mean   :0.1121   Mean   :0.09757  
##  3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :4.000   Max.   :4.000   Max.   :1.0000   Max.   :1.00000  
##       age               yr          specwgt          tumdiam     
##  Min.   : 0.000   Min.   :1979   Min.   :  10.0   Min.   : 1.00  
##  1st Qu.: 1.583   1st Qu.:1983   1st Qu.: 321.0   1st Qu.: 9.00  
##  Median : 3.083   Median :1987   Median : 510.0   Median :11.00  
##  Mean   : 3.534   Mean   :1987   Mean   : 604.6   Mean   :11.21  
##  3rd Qu.: 4.833   3rd Qu.:1991   3rd Qu.: 800.0   3rd Qu.:14.00  
##  Max.   :15.917   Max.   :1995   Max.   :4400.0   Max.   :33.00

New variables

A linear spline in age

nwts$age1 <- with(nwts, pmin(age, 1))
nwts$age2 <- with(nwts, pmax(age, 1))

The full-cohort model

Histology:age interaction and stage:tumour-diameter interaction

fullmodel <- glm(relaps~histol*(age1+age2)+ I(stage>2)*tumdiam, family=binomial, data=nwts)
summary(fullmodel)

## 
## Call:
## glm(formula = relaps ~ histol * (age1 + age2) + I(stage > 2) * 
##     tumdiam, family = binomial, data = nwts)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1478  -0.5989  -0.4811  -0.3955   2.4386  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -2.61048    0.34241  -7.624 2.46e-14 ***
## histol                    5.66253    0.97790   5.790 7.02e-09 ***
## age1                     -0.73020    0.34976  -2.088 0.036821 *  
## age2                      0.12198    0.01920   6.354 2.09e-10 ***
## I(stage > 2)TRUE          1.50690    0.29682   5.077 3.84e-07 ***
## tumdiam                   0.07219    0.01591   4.539 5.66e-06 ***
## histol:age1              -4.19225    1.05085  -3.989 6.62e-05 ***
## histol:age2              -0.03064    0.04752  -0.645 0.519070    
## I(stage > 2)TRUE:tumdiam -0.08947    0.02355  -3.798 0.000146 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3580.5  on 3914  degrees of freedom
## Residual deviance: 3229.3  on 3906  degrees of freedom
## AIC: 3247.3
## 
## Number of Fisher Scoring iterations: 5

Case-control sample

Here we take all the cases and a random sample of controls

nwts$id <- 1:nrow(nwts)
cases <- subset(nwts, relaps==1)
noncases <- subset(nwts, relaps==0)
controlsample <- sample(noncases$id, nrow(cases))
ccsample<- rbind(cases, noncases[noncases$id %in% controlsample,])
ccsample$weight<-with(ccsample, ifelse(relaps==1, 1, nrow(noncases)/nrow(cases)))

We can compare the maximum likelihood estimator and the survey estimator:

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

ccmle <- glm(relaps~offset(log(weight))+histol*(age1+age2)+ I(stage>2)*tumdiam, family=binomial, data=ccsample)
summary(ccmle)

## 
## Call:
## glm(formula = relaps ~ offset(log(weight)) + histol * (age1 + 
##     age2) + I(stage > 2) * tumdiam, family = binomial, data = ccsample)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6619  -1.2407  -0.2776   1.4364   2.3693  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -2.18981    0.43829  -4.996 5.85e-07 ***
## histol                    4.46106    1.17825   3.786 0.000153 ***
## age1                     -1.13596    0.45543  -2.494 0.012623 *  
## age2                      0.21786    0.02899   7.515 5.68e-14 ***
## I(stage > 2)TRUE          2.24581    0.43128   5.207 1.92e-07 ***
## tumdiam                   0.08634    0.02093   4.125 3.71e-05 ***
## histol:age1              -2.08221    1.30854  -1.591 0.111553    
## histol:age2              -0.09580    0.08115  -1.181 0.237765    
## I(stage > 2)TRUE:tumdiam -0.12564    0.03408  -3.686 0.000228 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3114.9  on 1337  degrees of freedom
## Residual deviance: 2723.9  on 1329  degrees of freedom
## AIC: 2741.9
## 
## Number of Fisher Scoring iterations: 5

survey_cc <- svydesign(id=~1, weights=~weight, strata=~relaps, data=ccsample)
ccest <- svyglm(relaps~histol*(age1+age2)+ I(stage>2)*tumdiam, family=quasibinomial, design=survey_cc)
summary(ccest)

## 
## Call:
## svyglm(formula = relaps ~ histol * (age1 + age2) + I(stage > 
##     2) * tumdiam, family = quasibinomial, design = survey_cc)
## 
## Survey design:
## svydesign(id = ~1, weights = ~weight, strata = ~relaps, data = ccsample)
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -2.52284    0.42664  -5.913 4.26e-09 ***
## histol                    3.48903    1.03643   3.366 0.000783 ***
## age1                     -0.84587    0.42832  -1.975 0.048492 *  
## age2                      0.14745    0.02757   5.348 1.05e-07 ***
## I(stage > 2)TRUE          1.58083    0.41423   3.816 0.000142 ***
## tumdiam                   0.06201    0.02136   2.904 0.003750 ** 
## histol:age1              -1.87789    1.15740  -1.623 0.104933    
## histol:age2              -0.05041    0.08116  -0.621 0.534633    
## I(stage > 2)TRUE:tumdiam -0.08340    0.03295  -2.531 0.011494 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.017942)
## 
## Number of Fisher Scoring iterations: 4

The two seem fairly comparable: we know that asymptotically the maximum likelihood estimator must be better, but the difference is small enough to not show up in a single comparison

round(cbind(coef(ccmle), coef(ccest))-coef(fullmodel),3)

##                            [,1]   [,2]
## (Intercept)               0.421  0.088
## histol                   -1.201 -2.173
## age1                     -0.406 -0.116
## age2                      0.096  0.025
## I(stage > 2)TRUE          0.739  0.074
## tumdiam                   0.014 -0.010
## histol:age1               2.110  2.314
## histol:age2              -0.065 -0.020
## I(stage > 2)TRUE:tumdiam -0.036  0.006

The simple survey estimator does not use the full cohort; we can declare a twophase object that does. We do not need to specify weights because the software can work out what they are.

nwts_twophase <- twophase(id=list(~1,~1), strata=list(NULL, ~relaps), subset=~I((relaps==1)| id %in% controlsample), data=nwts)
twophaseest <- svyglm(relaps~histol*(age1+age2)+ I(stage>2)*tumdiam, family=quasibinomial, design=nwts_twophase)
summary(twophaseest)

## 
## Call:
## svyglm(formula = relaps ~ histol * (age1 + age2) + I(stage > 
##     2) * tumdiam, family = quasibinomial, design = nwts_twophase)
## 
## Survey design:
## twophase2(id = id, strata = strata, probs = probs, fpc = fpc, 
##     subset = subset, data = data)
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -2.52284    0.42862  -5.886 5.01e-09 ***
## histol                    3.48903    1.03619   3.367 0.000781 ***
## age1                     -0.84587    0.42811  -1.976 0.048382 *  
## age2                      0.14745    0.02756   5.350 1.04e-07 ***
## I(stage > 2)TRUE          1.58083    0.41407   3.818 0.000141 ***
## tumdiam                   0.06201    0.02135   2.905 0.003735 ** 
## histol:age1              -1.87789    1.15711  -1.623 0.104846    
## histol:age2              -0.05041    0.08114  -0.621 0.534521    
## I(stage > 2)TRUE:tumdiam -0.08340    0.03294  -2.532 0.011462 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.017942)
## 
## Number of Fisher Scoring iterations: 4

We still aren’t using the whole cohort for anything, so the two analyses are almost identical

Using the whole cohort

We’ll try to use the whole cohort now. First, just use instit instead of histol

First, fit the model to the full data

phase1model <- glm(relaps~instit*(age1+age2)+ I(stage>2)*tumdiam, family=binomial, data=nwts)

Extract the influence functions and create a new design object

inffun<-model.matrix(phase1model)*resid(phase1model, type="response")
colnames(inffun)<-paste0("if",1:ncol(inffun))
aug_twophase <- twophase(id=list(~1,~1), strata=list(NULL, ~relaps), subset=~I((relaps==1)| id %in% controlsample), data=cbind(nwts,inffun), method="simple")

Calibrate, and fit the model of interest (ie, with histol) to the calibrated sample

calformula <- make.formula(colnames(inffun))
cal_twophase <- calibrate(aug_twophase, calformula, phase=2)
svyest_instit<-svyglm(relaps~histol*(age1+age2)+ I(stage>2)*tumdiam, family=quasibinomial, design=cal_twophase)
summary(svyest_instit)

## 
## Call:
## svyglm(formula = relaps ~ histol * (age1 + age2) + I(stage > 
##     2) * tumdiam, family = quasibinomial, design = cal_twophase)
## 
## Survey design:
## calibrate(aug_twophase, calformula, phase = 2)
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -2.57714    0.35437  -7.272 6.01e-13 ***
## histol                    5.08895    1.24088   4.101 4.36e-05 ***
## age1                     -0.71401    0.34869  -2.048 0.040788 *  
## age2                      0.11668    0.02090   5.584 2.85e-08 ***
## I(stage > 2)TRUE          1.44579    0.31421   4.601 4.60e-06 ***
## tumdiam                   0.06940    0.01674   4.146 3.60e-05 ***
## histol:age1              -3.83311    1.32028  -2.903 0.003754 ** 
## histol:age2               0.01252    0.05966   0.210 0.833801    
## I(stage > 2)TRUE:tumdiam -0.08282    0.02488  -3.329 0.000895 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.011699)
## 
## Number of Fisher Scoring iterations: 4

Comparing the uncalibrated and calibrated estimates, the coefficients have nearly all moved closer to the true full cohort value.

round(cbind(coef(twophaseest), coef(svyest_instit))-coef(fullmodel),3)

##                            [,1]   [,2]
## (Intercept)               0.088  0.033
## histol                   -2.173 -0.574
## age1                     -0.116  0.016
## age2                      0.025 -0.005
## I(stage > 2)TRUE          0.074 -0.061
## tumdiam                  -0.010 -0.003
## histol:age1               2.314  0.359
## histol:age2              -0.020  0.043
## I(stage > 2)TRUE:tumdiam  0.006  0.007

Calibration by imputation

It is always valid to just use a surrogate such as instit in calibration, but it is probably not optimal. The attenuation bias in using a mismeasured predictor translates into a loss of precision in the calibrated estimate. We can try to construct a regression imputation of histology instead:

impmodel<-svyglm(histol~instit*(relaps+I(stage>3))+I(age>10)+factor(study),family=quasibinomial,design=nwts_twophase)
nwts$imphistol <-as.vector(predict(impmodel,newdata=nwts,type="response",se.fit=FALSE))
with(nwts, by(imphistol, histol, summary))

## histol: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01028 0.01536 0.02794 0.04257 0.02794 0.97556 
## -------------------------------------------------------- 
## histol: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.01028 0.10718 0.83093 0.66956 0.95587 0.97556

We now proceed as before. In particular, note that it is important to use imphistol for all observations in the phase-1 model, even those where histol is available – it was not available at phase 1.

phase1model_imp <- glm(relaps~imphistol*(age1+age2)+ I(stage>2)*tumdiam, family=binomial, data=nwts)

Extract the influence functions and create a new design object

inffun_imp<-model.matrix(phase1model_imp)*resid(phase1model_imp, type="response")
colnames(inffun_imp)<-paste0("if",1:ncol(inffun_imp))
aug_twophase_imp <- twophase(id=list(~1,~1), strata=list(NULL, ~relaps), subset=~I((relaps==1)| id %in% controlsample), data=cbind(nwts,inffun_imp), method="simple")

Calibrate, and fit the model of interest

calformula <- make.formula(colnames(inffun_imp))
cal_twophase_imp <- calibrate(aug_twophase_imp, calformula, phase=2)
svyest_imp<-svyglm(relaps~histol*(age1+age2)+ I(stage>2)*tumdiam, family=quasibinomial, design=cal_twophase_imp)
summary(svyest_imp)

## 
## Call:
## svyglm(formula = relaps ~ histol * (age1 + age2) + I(stage > 
##     2) * tumdiam, family = quasibinomial, design = cal_twophase_imp)
## 
## Survey design:
## calibrate(aug_twophase_imp, calformula, phase = 2)
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -2.56915    0.35474  -7.242 7.45e-13 ***
## histol                    5.05091    1.27870   3.950 8.22e-05 ***
## age1                     -0.74684    0.34810  -2.146 0.032093 *  
## age2                      0.11751    0.02094   5.611 2.44e-08 ***
## I(stage > 2)TRUE          1.49480    0.31871   4.690 3.01e-06 ***
## tumdiam                   0.06985    0.01702   4.104 4.31e-05 ***
## histol:age1              -3.81602    1.36194  -2.802 0.005154 ** 
## histol:age2               0.03141    0.06202   0.507 0.612561    
## I(stage > 2)TRUE:tumdiam -0.08516    0.02529  -3.367 0.000781 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.011895)
## 
## Number of Fisher Scoring iterations: 4

There has been a slight additional improvement; slight, because instit is overwhelming the best predictor of histology.

round(cbind(coef(twophaseest), coef(svyest_instit), coef(svyest_imp))-coef(fullmodel),3)

##                            [,1]   [,2]   [,3]
## (Intercept)               0.088  0.033  0.041
## histol                   -2.173 -0.574 -0.612
## age1                     -0.116  0.016 -0.017
## age2                      0.025 -0.005 -0.004
## I(stage > 2)TRUE          0.074 -0.061 -0.012
## tumdiam                  -0.010 -0.003 -0.002
## histol:age1               2.314  0.359  0.376
## histol:age2              -0.020  0.043  0.062
## I(stage > 2)TRUE:tumdiam  0.006  0.007  0.004

Multiple imputation

We can take multiple predictions from the fitted imputation model, produce influence functions for each of them, then average the influence functions over the multiple imputations. We need to use the full influence function, not just the score function as before, because the matrix it’s multiplied by can vary between imputations.

inffun_mi<-matrix(0,nrow=nrow(nwts),ncol=ncol(inffun_imp))
M<-50
for(i in 1:M){
  nwts$mihistol <-rbinom(nrow(nwts),1, as.vector(predict(impmodel,newdata=nwts,type="response",se.fit=FALSE)))
  phase1model_imp <- glm(relaps~mihistol*(age1+age2)+ I(stage>2)*tumdiam, family=binomial, data=nwts)
  inffun_mi<-inffun_mi+(model.matrix(phase1model_imp)*resid(phase1model_imp, type="response"))%*%vcov(phase1model)
}
inffun_mi<-inffun_mi/M
colnames(inffun_mi)<-paste0("if",1:ncol(inffun_mi))

Now, do the calibration and fit the model of interest

aug_twophase_mi <- twophase(id=list(~1,~1), strata=list(NULL, ~relaps), subset=~I((relaps==1)| id %in% controlsample), data=cbind(nwts,inffun_mi), method="simple")
calformula <- make.formula(colnames(inffun_mi))
cal_twophase_mi <- calibrate(aug_twophase_mi, calformula, phase=2)
svyest_mi<-svyglm(relaps~histol*(age1+age2)+ I(stage>2)*tumdiam, family=quasibinomial, design=cal_twophase_mi)
summary(svyest_mi)

## 
## Call:
## svyglm(formula = relaps ~ histol * (age1 + age2) + I(stage > 
##     2) * tumdiam, family = quasibinomial, design = cal_twophase_mi)
## 
## Survey design:
## calibrate(aug_twophase_mi, calformula, phase = 2)
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -2.566843   0.356013  -7.210 9.36e-13 ***
## histol                    4.770679   1.056824   4.514 6.92e-06 ***
## age1                     -0.725544   0.349729  -2.075  0.03822 *  
## age2                      0.117367   0.020903   5.615 2.39e-08 ***
## I(stage > 2)TRUE          1.437374   0.312746   4.596 4.72e-06 ***
## tumdiam                   0.068681   0.016790   4.091 4.56e-05 ***
## histol:age1              -3.466632   1.140661  -3.039  0.00242 ** 
## histol:age2               0.009168   0.059254   0.155  0.87706    
## I(stage > 2)TRUE:tumdiam -0.081296   0.024763  -3.283  0.00105 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.012549)
## 
## Number of Fisher Scoring iterations: 4

round(cbind(coef(twophaseest), coef(svyest_instit),coef(svyest_imp),coef(svyest_mi))-coef(fullmodel),3)

##                            [,1]   [,2]   [,3]   [,4]
## (Intercept)               0.088  0.033  0.041  0.044
## histol                   -2.173 -0.574 -0.612 -0.892
## age1                     -0.116  0.016 -0.017  0.005
## age2                      0.025 -0.005 -0.004 -0.005
## I(stage > 2)TRUE          0.074 -0.061 -0.012 -0.070
## tumdiam                  -0.010 -0.003 -0.002 -0.004
## histol:age1               2.314  0.359  0.376  0.726
## histol:age2              -0.020  0.043  0.062  0.040
## I(stage > 2)TRUE:tumdiam  0.006  0.007  0.004  0.008

Simulation

Finally, what we really want for comparison purposes is a summary over repeated case-control samples. We run the entire procedure above 500 times.

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

The median difference from the full-cohort regression is

round(medianbias,3)

##                             MLE Weighted instit    imp     MI
## (Intercept)               1.660   -0.026 -0.002 -0.010  0.000
## histol                   -0.056   -0.025  0.430  0.352  0.266
## age1                      0.005   -0.005  0.001  0.007 -0.001
## age2                      0.000    0.002 -0.001 -0.001  0.000
## I(stage > 2)TRUE         -0.117    0.029  0.010  0.017  0.006
## tumdiam                  -0.008    0.001  0.000  0.001  0.000
## histol:age1               0.007   -0.046 -0.339 -0.308 -0.218
## histol:age2               0.005    0.007  0.006  0.005  0.009
## I(stage > 2)TRUE:tumdiam  0.006   -0.003 -0.001 -0.001 -0.001

And the median absolute deviation (an outlier-resistant standard deviation) is

round(mad,3)

##                            MLE Weighted instit   imp    MI
## (Intercept)              0.230    0.233  0.032 0.044 0.038
## histol                   1.426    1.529  0.784 0.861 1.236
## age1                     0.272    0.275  0.032 0.036 0.038
## age2                     0.017    0.018  0.005 0.005 0.005
## I(stage > 2)TRUE         0.248    0.274  0.076 0.090 0.070
## tumdiam                  0.011    0.013  0.003 0.004 0.003
## histol:age1              1.511    1.587  0.910 0.920 1.248
## histol:age2              0.059    0.065  0.057 0.059 0.059
## I(stage > 2)TRUE:tumdiam 0.019    0.021  0.006 0.007 0.006

Thomas Lumley

1/15/2018