Calibration in case-control sampling
Thomas Lumley
1/15/2018
National Wilms’ Tumour Group data
set.seed(2017-12-3)
nwts <- read.table("nwts-share.txt", header=TRUE)
summary(nwts)
## trel tsur relaps dead
## Min. : 0.01095 Min. : 0.01095 Min. :0.0000 Min. :0.0000
## 1st Qu.: 4.94182 1st Qu.: 6.24093 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 9.77139 Median :10.36003 Median :0.0000 Median :0.0000
## Mean : 9.64874 Mean :10.32634 Mean :0.1709 Mean :0.1134
## 3rd Qu.:14.01095 3rd Qu.:14.42847 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :22.50240 Max. :22.50240 Max. :1.0000 Max. :1.0000
## study stage histol instit
## Min. :3.000 Min. :1.000 Min. :0.0000 Min. :0.00000
## 1st Qu.:3.000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :4.000 Median :2.000 Median :0.0000 Median :0.00000
## Mean :3.573 Mean :2.079 Mean :0.1121 Mean :0.09757
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :4.000 Max. :4.000 Max. :1.0000 Max. :1.00000
## age yr specwgt tumdiam
## Min. : 0.000 Min. :1979 Min. : 10.0 Min. : 1.00
## 1st Qu.: 1.583 1st Qu.:1983 1st Qu.: 321.0 1st Qu.: 9.00
## Median : 3.083 Median :1987 Median : 510.0 Median :11.00
## Mean : 3.534 Mean :1987 Mean : 604.6 Mean :11.21
## 3rd Qu.: 4.833 3rd Qu.:1991 3rd Qu.: 800.0 3rd Qu.:14.00
## Max. :15.917 Max. :1995 Max. :4400.0 Max. :33.00
New variables
A linear spline in age
nwts$age1 <- with(nwts, pmin(age, 1))
nwts$age2 <- with(nwts, pmax(age, 1))
The full-cohort model
Histology:age interaction and stage:tumour-diameter interaction
fullmodel <- glm(relaps~histol*(age1+age2)+ I(stage>2)*tumdiam, family=binomial, data=nwts)
summary(fullmodel)
##
## Call:
## glm(formula = relaps ~ histol * (age1 + age2) + I(stage > 2) *
## tumdiam, family = binomial, data = nwts)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1478 -0.5989 -0.4811 -0.3955 2.4386
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.61048 0.34241 -7.624 2.46e-14 ***
## histol 5.66253 0.97790 5.790 7.02e-09 ***
## age1 -0.73020 0.34976 -2.088 0.036821 *
## age2 0.12198 0.01920 6.354 2.09e-10 ***
## I(stage > 2)TRUE 1.50690 0.29682 5.077 3.84e-07 ***
## tumdiam 0.07219 0.01591 4.539 5.66e-06 ***
## histol:age1 -4.19225 1.05085 -3.989 6.62e-05 ***
## histol:age2 -0.03064 0.04752 -0.645 0.519070
## I(stage > 2)TRUE:tumdiam -0.08947 0.02355 -3.798 0.000146 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3580.5 on 3914 degrees of freedom
## Residual deviance: 3229.3 on 3906 degrees of freedom
## AIC: 3247.3
##
## Number of Fisher Scoring iterations: 5
Case-control sample
Here we take all the cases and a random sample of controls
nwts$id <- 1:nrow(nwts)
cases <- subset(nwts, relaps==1)
noncases <- subset(nwts, relaps==0)
controlsample <- sample(noncases$id, nrow(cases))
ccsample<- rbind(cases, noncases[noncases$id %in% controlsample,])
ccsample$weight<-with(ccsample, ifelse(relaps==1, 1, nrow(noncases)/nrow(cases)))
We can compare the maximum likelihood estimator and the survey estimator:
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
ccmle <- glm(relaps~offset(log(weight))+histol*(age1+age2)+ I(stage>2)*tumdiam, family=binomial, data=ccsample)
summary(ccmle)
##
## Call:
## glm(formula = relaps ~ offset(log(weight)) + histol * (age1 +
## age2) + I(stage > 2) * tumdiam, family = binomial, data = ccsample)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6619 -1.2407 -0.2776 1.4364 2.3693
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.18981 0.43829 -4.996 5.85e-07 ***
## histol 4.46106 1.17825 3.786 0.000153 ***
## age1 -1.13596 0.45543 -2.494 0.012623 *
## age2 0.21786 0.02899 7.515 5.68e-14 ***
## I(stage > 2)TRUE 2.24581 0.43128 5.207 1.92e-07 ***
## tumdiam 0.08634 0.02093 4.125 3.71e-05 ***
## histol:age1 -2.08221 1.30854 -1.591 0.111553
## histol:age2 -0.09580 0.08115 -1.181 0.237765
## I(stage > 2)TRUE:tumdiam -0.12564 0.03408 -3.686 0.000228 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3114.9 on 1337 degrees of freedom
## Residual deviance: 2723.9 on 1329 degrees of freedom
## AIC: 2741.9
##
## Number of Fisher Scoring iterations: 5
survey_cc <- svydesign(id=~1, weights=~weight, strata=~relaps, data=ccsample)
ccest <- svyglm(relaps~histol*(age1+age2)+ I(stage>2)*tumdiam, family=quasibinomial, design=survey_cc)
summary(ccest)
##
## Call:
## svyglm(formula = relaps ~ histol * (age1 + age2) + I(stage >
## 2) * tumdiam, family = quasibinomial, design = survey_cc)
##
## Survey design:
## svydesign(id = ~1, weights = ~weight, strata = ~relaps, data = ccsample)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.52284 0.42664 -5.913 4.26e-09 ***
## histol 3.48903 1.03643 3.366 0.000783 ***
## age1 -0.84587 0.42832 -1.975 0.048492 *
## age2 0.14745 0.02757 5.348 1.05e-07 ***
## I(stage > 2)TRUE 1.58083 0.41423 3.816 0.000142 ***
## tumdiam 0.06201 0.02136 2.904 0.003750 **
## histol:age1 -1.87789 1.15740 -1.623 0.104933
## histol:age2 -0.05041 0.08116 -0.621 0.534633
## I(stage > 2)TRUE:tumdiam -0.08340 0.03295 -2.531 0.011494 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.017942)
##
## Number of Fisher Scoring iterations: 4
The two seem fairly comparable: we know that asymptotically the maximum likelihood estimator must be better, but the difference is small enough to not show up in a single comparison
round(cbind(coef(ccmle), coef(ccest))-coef(fullmodel),3)
## [,1] [,2]
## (Intercept) 0.421 0.088
## histol -1.201 -2.173
## age1 -0.406 -0.116
## age2 0.096 0.025
## I(stage > 2)TRUE 0.739 0.074
## tumdiam 0.014 -0.010
## histol:age1 2.110 2.314
## histol:age2 -0.065 -0.020
## I(stage > 2)TRUE:tumdiam -0.036 0.006
The simple survey estimator does not use the full cohort; we can declare a twophase
object that does. We do not need to specify weights because the software can work out what they are.
nwts_twophase <- twophase(id=list(~1,~1), strata=list(NULL, ~relaps), subset=~I((relaps==1)| id %in% controlsample), data=nwts)
twophaseest <- svyglm(relaps~histol*(age1+age2)+ I(stage>2)*tumdiam, family=quasibinomial, design=nwts_twophase)
summary(twophaseest)
##
## Call:
## svyglm(formula = relaps ~ histol * (age1 + age2) + I(stage >
## 2) * tumdiam, family = quasibinomial, design = nwts_twophase)
##
## Survey design:
## twophase2(id = id, strata = strata, probs = probs, fpc = fpc,
## subset = subset, data = data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.52284 0.42862 -5.886 5.01e-09 ***
## histol 3.48903 1.03619 3.367 0.000781 ***
## age1 -0.84587 0.42811 -1.976 0.048382 *
## age2 0.14745 0.02756 5.350 1.04e-07 ***
## I(stage > 2)TRUE 1.58083 0.41407 3.818 0.000141 ***
## tumdiam 0.06201 0.02135 2.905 0.003735 **
## histol:age1 -1.87789 1.15711 -1.623 0.104846
## histol:age2 -0.05041 0.08114 -0.621 0.534521
## I(stage > 2)TRUE:tumdiam -0.08340 0.03294 -2.532 0.011462 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.017942)
##
## Number of Fisher Scoring iterations: 4
We still aren’t using the whole cohort for anything, so the two analyses are almost identical
Using the whole cohort
We’ll try to use the whole cohort now. First, just use instit
instead of histol
First, fit the model to the full data
phase1model <- glm(relaps~instit*(age1+age2)+ I(stage>2)*tumdiam, family=binomial, data=nwts)
Extract the influence functions and create a new design object
inffun<-model.matrix(phase1model)*resid(phase1model, type="response")
colnames(inffun)<-paste0("if",1:ncol(inffun))
aug_twophase <- twophase(id=list(~1,~1), strata=list(NULL, ~relaps), subset=~I((relaps==1)| id %in% controlsample), data=cbind(nwts,inffun), method="simple")
Calibrate, and fit the model of interest (ie, with histol
) to the calibrated sample
calformula <- make.formula(colnames(inffun))
cal_twophase <- calibrate(aug_twophase, calformula, phase=2)
svyest_instit<-svyglm(relaps~histol*(age1+age2)+ I(stage>2)*tumdiam, family=quasibinomial, design=cal_twophase)
summary(svyest_instit)
##
## Call:
## svyglm(formula = relaps ~ histol * (age1 + age2) + I(stage >
## 2) * tumdiam, family = quasibinomial, design = cal_twophase)
##
## Survey design:
## calibrate(aug_twophase, calformula, phase = 2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.57714 0.35437 -7.272 6.01e-13 ***
## histol 5.08895 1.24088 4.101 4.36e-05 ***
## age1 -0.71401 0.34869 -2.048 0.040788 *
## age2 0.11668 0.02090 5.584 2.85e-08 ***
## I(stage > 2)TRUE 1.44579 0.31421 4.601 4.60e-06 ***
## tumdiam 0.06940 0.01674 4.146 3.60e-05 ***
## histol:age1 -3.83311 1.32028 -2.903 0.003754 **
## histol:age2 0.01252 0.05966 0.210 0.833801
## I(stage > 2)TRUE:tumdiam -0.08282 0.02488 -3.329 0.000895 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.011699)
##
## Number of Fisher Scoring iterations: 4
Comparing the uncalibrated and calibrated estimates, the coefficients have nearly all moved closer to the true full cohort value.
round(cbind(coef(twophaseest), coef(svyest_instit))-coef(fullmodel),3)
## [,1] [,2]
## (Intercept) 0.088 0.033
## histol -2.173 -0.574
## age1 -0.116 0.016
## age2 0.025 -0.005
## I(stage > 2)TRUE 0.074 -0.061
## tumdiam -0.010 -0.003
## histol:age1 2.314 0.359
## histol:age2 -0.020 0.043
## I(stage > 2)TRUE:tumdiam 0.006 0.007
Calibration by imputation
It is always valid to just use a surrogate such as instit
in calibration, but it is probably not optimal. The attenuation bias in using a mismeasured predictor translates into a loss of precision in the calibrated estimate. We can try to construct a regression imputation of histology instead:
impmodel<-svyglm(histol~instit*(relaps+I(stage>3))+I(age>10)+factor(study),family=quasibinomial,design=nwts_twophase)
nwts$imphistol <-as.vector(predict(impmodel,newdata=nwts,type="response",se.fit=FALSE))
with(nwts, by(imphistol, histol, summary))
## histol: 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01028 0.01536 0.02794 0.04257 0.02794 0.97556
## --------------------------------------------------------
## histol: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01028 0.10718 0.83093 0.66956 0.95587 0.97556
We now proceed as before. In particular, note that it is important to use imphistol
for all observations in the phase-1 model, even those where histol
is available – it was not available at phase 1.
phase1model_imp <- glm(relaps~imphistol*(age1+age2)+ I(stage>2)*tumdiam, family=binomial, data=nwts)
Extract the influence functions and create a new design object
inffun_imp<-model.matrix(phase1model_imp)*resid(phase1model_imp, type="response")
colnames(inffun_imp)<-paste0("if",1:ncol(inffun_imp))
aug_twophase_imp <- twophase(id=list(~1,~1), strata=list(NULL, ~relaps), subset=~I((relaps==1)| id %in% controlsample), data=cbind(nwts,inffun_imp), method="simple")
Calibrate, and fit the model of interest
calformula <- make.formula(colnames(inffun_imp))
cal_twophase_imp <- calibrate(aug_twophase_imp, calformula, phase=2)
svyest_imp<-svyglm(relaps~histol*(age1+age2)+ I(stage>2)*tumdiam, family=quasibinomial, design=cal_twophase_imp)
summary(svyest_imp)
##
## Call:
## svyglm(formula = relaps ~ histol * (age1 + age2) + I(stage >
## 2) * tumdiam, family = quasibinomial, design = cal_twophase_imp)
##
## Survey design:
## calibrate(aug_twophase_imp, calformula, phase = 2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.56915 0.35474 -7.242 7.45e-13 ***
## histol 5.05091 1.27870 3.950 8.22e-05 ***
## age1 -0.74684 0.34810 -2.146 0.032093 *
## age2 0.11751 0.02094 5.611 2.44e-08 ***
## I(stage > 2)TRUE 1.49480 0.31871 4.690 3.01e-06 ***
## tumdiam 0.06985 0.01702 4.104 4.31e-05 ***
## histol:age1 -3.81602 1.36194 -2.802 0.005154 **
## histol:age2 0.03141 0.06202 0.507 0.612561
## I(stage > 2)TRUE:tumdiam -0.08516 0.02529 -3.367 0.000781 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.011895)
##
## Number of Fisher Scoring iterations: 4
There has been a slight additional improvement; slight, because instit
is overwhelming the best predictor of histology.
round(cbind(coef(twophaseest), coef(svyest_instit), coef(svyest_imp))-coef(fullmodel),3)
## [,1] [,2] [,3]
## (Intercept) 0.088 0.033 0.041
## histol -2.173 -0.574 -0.612
## age1 -0.116 0.016 -0.017
## age2 0.025 -0.005 -0.004
## I(stage > 2)TRUE 0.074 -0.061 -0.012
## tumdiam -0.010 -0.003 -0.002
## histol:age1 2.314 0.359 0.376
## histol:age2 -0.020 0.043 0.062
## I(stage > 2)TRUE:tumdiam 0.006 0.007 0.004
Multiple imputation
We can take multiple predictions from the fitted imputation model, produce influence functions for each of them, then average the influence functions over the multiple imputations. We need to use the full influence function, not just the score function as before, because the matrix it’s multiplied by can vary between imputations.
inffun_mi<-matrix(0,nrow=nrow(nwts),ncol=ncol(inffun_imp))
M<-50
for(i in 1:M){
nwts$mihistol <-rbinom(nrow(nwts),1, as.vector(predict(impmodel,newdata=nwts,type="response",se.fit=FALSE)))
phase1model_imp <- glm(relaps~mihistol*(age1+age2)+ I(stage>2)*tumdiam, family=binomial, data=nwts)
inffun_mi<-inffun_mi+(model.matrix(phase1model_imp)*resid(phase1model_imp, type="response"))%*%vcov(phase1model)
}
inffun_mi<-inffun_mi/M
colnames(inffun_mi)<-paste0("if",1:ncol(inffun_mi))
Now, do the calibration and fit the model of interest
aug_twophase_mi <- twophase(id=list(~1,~1), strata=list(NULL, ~relaps), subset=~I((relaps==1)| id %in% controlsample), data=cbind(nwts,inffun_mi), method="simple")
calformula <- make.formula(colnames(inffun_mi))
cal_twophase_mi <- calibrate(aug_twophase_mi, calformula, phase=2)
svyest_mi<-svyglm(relaps~histol*(age1+age2)+ I(stage>2)*tumdiam, family=quasibinomial, design=cal_twophase_mi)
summary(svyest_mi)
##
## Call:
## svyglm(formula = relaps ~ histol * (age1 + age2) + I(stage >
## 2) * tumdiam, family = quasibinomial, design = cal_twophase_mi)
##
## Survey design:
## calibrate(aug_twophase_mi, calformula, phase = 2)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.566843 0.356013 -7.210 9.36e-13 ***
## histol 4.770679 1.056824 4.514 6.92e-06 ***
## age1 -0.725544 0.349729 -2.075 0.03822 *
## age2 0.117367 0.020903 5.615 2.39e-08 ***
## I(stage > 2)TRUE 1.437374 0.312746 4.596 4.72e-06 ***
## tumdiam 0.068681 0.016790 4.091 4.56e-05 ***
## histol:age1 -3.466632 1.140661 -3.039 0.00242 **
## histol:age2 0.009168 0.059254 0.155 0.87706
## I(stage > 2)TRUE:tumdiam -0.081296 0.024763 -3.283 0.00105 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasibinomial family taken to be 1.012549)
##
## Number of Fisher Scoring iterations: 4
round(cbind(coef(twophaseest), coef(svyest_instit),coef(svyest_imp),coef(svyest_mi))-coef(fullmodel),3)
## [,1] [,2] [,3] [,4]
## (Intercept) 0.088 0.033 0.041 0.044
## histol -2.173 -0.574 -0.612 -0.892
## age1 -0.116 0.016 -0.017 0.005
## age2 0.025 -0.005 -0.004 -0.005
## I(stage > 2)TRUE 0.074 -0.061 -0.012 -0.070
## tumdiam -0.010 -0.003 -0.002 -0.004
## histol:age1 2.314 0.359 0.376 0.726
## histol:age2 -0.020 0.043 0.062 0.040
## I(stage > 2)TRUE:tumdiam 0.006 0.007 0.004 0.008
Simulation
Finally, what we really want for comparison purposes is a summary over repeated case-control samples. We run the entire procedure above 500 times.
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
The median difference from the full-cohort regression is
round(medianbias,3)
## MLE Weighted instit imp MI
## (Intercept) 1.660 -0.026 -0.002 -0.010 0.000
## histol -0.056 -0.025 0.430 0.352 0.266
## age1 0.005 -0.005 0.001 0.007 -0.001
## age2 0.000 0.002 -0.001 -0.001 0.000
## I(stage > 2)TRUE -0.117 0.029 0.010 0.017 0.006
## tumdiam -0.008 0.001 0.000 0.001 0.000
## histol:age1 0.007 -0.046 -0.339 -0.308 -0.218
## histol:age2 0.005 0.007 0.006 0.005 0.009
## I(stage > 2)TRUE:tumdiam 0.006 -0.003 -0.001 -0.001 -0.001
And the median absolute deviation (an outlier-resistant standard deviation) is
round(mad,3)
## MLE Weighted instit imp MI
## (Intercept) 0.230 0.233 0.032 0.044 0.038
## histol 1.426 1.529 0.784 0.861 1.236
## age1 0.272 0.275 0.032 0.036 0.038
## age2 0.017 0.018 0.005 0.005 0.005
## I(stage > 2)TRUE 0.248 0.274 0.076 0.090 0.070
## tumdiam 0.011 0.013 0.003 0.004 0.003
## histol:age1 1.511 1.587 0.910 0.920 1.248
## histol:age2 0.059 0.065 0.057 0.059 0.059
## I(stage > 2)TRUE:tumdiam 0.019 0.021 0.006 0.007 0.006