Introduction

library(survey)
library(sampling)
library(surveyCV)

First we define survey-weighted quantile functions:

  • q_wtd() is for standard quantiles
  • q_wtd_conf() is a “conformal” counterpart, which appends a placeholder \((n+1)^{st}\) observation at \(+\infty\), weighted by the test case’s survey weight
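
In symbols: with the calibration scores sorted as \(y_{(1)} \le \cdots \le y_{(n)}\), their weights \(w_{(1)}, \ldots, w_{(n)}\) carried along in the same order, and test-case weight \(w_{n+1}\), q_wtd_conf() returns

\[ \hat{q}_p = \min\left\{ y_{(i)} : \frac{\sum_{j \le i} w_{(j)}}{\sum_{j=1}^{n} w_j + w_{n+1}} \ge p \right\}, \]

with the convention \(\hat{q}_p = \infty\) if no \(y_{(i)}\) meets the threshold. Appending one extra observation at \(\infty\) with weight \(w_{n+1}\), as the code below does, implements exactly this.
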
# Survey-weighted quantile functions:
#   one for standard quantiles,
#   and one that is a "conformal" counterpart,
#     which appends an (n+1)st placeholder observation at Inf,
#     weighted by the test case's survey weight.
# Both of these expect y and w to be vectors of same length,
# but p to be a scalar (and same for w_test).

# y:      vector of numeric values from which you seek a quantile
#         (typically these will be nonconformity scores,
#          such as absolute residuals from the calibration set)
# p:      probability at which quantile is desired
# w:      vector of survey weights associated with each y-value
# w_test: survey weight associated with the test observation

q_wtd <- function(y, p, w = NULL) {
  if (is.null(w)) {w <- rep(1, length(y))}
  ord <- order(y)
  w <- w[ord]
  y <- y[ord]
  q <- y[which.max(cumsum(w)/sum(w) >= p)]
  return(q)
}

q_wtd_conf <- function(y, p, w = NULL, w_test = NULL) {
  if (is.null(w)) {w <- rep(1, length(y))}
  if (is.null(w_test)) {w_test <- 1}
  ord <- order(y)
  w <- w[ord]
  y <- y[ord]
  # Append the test weight w_test at the end, with "obs" value of Inf
  w <- c(w, w_test)
  y <- c(y, Inf)
  q <- y[which.max(cumsum(w)/sum(w) >= p)]
  return(q)
}
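
In other words, q_wtd_conf(y, p, w, w_test) is simply q_wtd() applied to the augmented data, i.e. q_wtd(c(y, Inf), p, c(w, w_test)). For example:

# Equivalent to q_wtd_conf(y = 1:4, p = 0.75), demonstrated below:
q_wtd(y = c(1:4, Inf), p = 0.75)
## [1] 4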

We demonstrate the use of these quantile functions:

# The 75th percentile of y=1:4 is 3
q_wtd(y = 1:4, p = 0.75)
## [1] 3
# but the "conformal quantile" 75th percentile of y=1:4
# is actually the 75th percentile of the five numbers y=c(1:4, Inf),
# which is 4
q_wtd_conf(y = 1:4, p = 0.75)
## [1] 4
# Now say we have survey weights w=4:1 associated with y=1:4.
# Then the weighted 75th percentile is still 3,
# because (4+3)/(4+3+2+1)=0.7 is below 0.75,
#   but (4+3+2)/(4+3+2+1)=0.9 is above 0.75,
# so the 3rd-smallest y-value is the first whose cumulative weight
#   is equal to or greater than 0.75 of the sum of all weights
q_wtd(y = 1:4, p = 0.75, w = 4:1)
## [1] 3
# However, the conformal 75th percentile may be larger,
#   depending on the test-case weight:
# 1) if the test weight is small, like w_test=1, the 0.75 quantile is still 3
q_wtd_conf(y = 1:4, p = 0.75, w = 4:1, w_test = 1)
## [1] 3
# 2) if the test weight is larger, like w_test=3, the 0.75 quantile becomes 4
q_wtd_conf(y = 1:4, p = 0.75, w = 4:1, w_test = 3)
## [1] 4
# 3) and if the test weight is even larger, like w_test=5,
#    the 0.75 quantile becomes Infinity
q_wtd_conf(y = 1:4, p = 0.75, w = 4:1, w_test = 5)
## [1] Inf
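
As a check on these examples: the conformal quantile is infinite exactly when even the largest calibration score fails to reach the cumulative probability \(p\), i.e. when \(\sum_i w_i / (\sum_i w_i + w_{test}) < p\), or equivalently when \(w_{test} > \frac{1-p}{p} \sum_i w_i\).

# Here sum(w) = 10 and p = 0.75, so any test weight above
# sum(w)*(1-p)/p yields an infinite conformal quantile:
10 * (1 - 0.75) / 0.75
## [1] 3.333333
# This is why w_test = 3 gave a finite quantile (4) above,
# while w_test = 5 gave Inf.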

Examples with apipop dataset

We will illustrate these methods using the apipop dataset from the survey package. This data contains Academic Performance Index (API) scores for the year 2000 for all California schools (elementary, middle, or high) with at least 100 students.

Our illustrations will use a simple model: predict each school’s API score from 2000, api00, using a linear regression on the explanatory variable meals, the percentage of students eligible for subsidized meals. Higher values of meals indicate that a school serves more students from lower-income households, and such schools tend to have lower api00 scores, as the graph below makes clear. A population-level simple linear regression appears to fit reasonably well.

data(api)
head(apipop)
##              cds stype            name                      sname snum
## 1 01611190130229     H    Alameda High               Alameda High    1
## 2 01611190132878     H    Encinal High               Encinal High    2
## 3 01611196000004     M  Chipman Middle             Chipman Middle    3
## 4 01611196090005     E Lum (Donald D.) Lum (Donald D.) Elementary    4
## 5 01611196090013     E Edison Elementa          Edison Elementary    5
## 6 01611196090021     E Otis (Frank) El    Otis (Frank) Elementary    6
##                  dname dnum   cname cnum flag pcttest api00 api99 target growth
## 1 Alameda City Unified    6 Alameda    1   NA      96   731   693      5     38
## 2 Alameda City Unified    6 Alameda    1   NA      99   622   589     11     33
## 3 Alameda City Unified    6 Alameda    1   NA      99   622   572     11     50
## 4 Alameda City Unified    6 Alameda    1   NA      99   774   732      3     42
## 5 Alameda City Unified    6 Alameda    1   NA      99   811   784      1     27
## 6 Alameda City Unified    6 Alameda    1   NA     100   780   725      4     55
##   sch.wide comp.imp both awards meals ell yr.rnd mobility acs.k3 acs.46
## 1      Yes      Yes  Yes    Yes    14  16   <NA>        9     NA     NA
## 2      Yes       No   No     No    20  18   <NA>       13     NA     NA
## 3      Yes      Yes  Yes    Yes    55  25   <NA>       20     NA     26
## 4      Yes      Yes  Yes    Yes    35  26   <NA>       21     20     30
## 5      Yes      Yes  Yes    Yes    15   9   <NA>       11     20     29
## 6      Yes      Yes  Yes    Yes    25  18   <NA>       12     20     29
##   acs.core pct.resp not.hsg hsg some.col col.grad grad.sch avg.ed full emer
## 1       25       91       6  16       22       38       18   3.45   85   16
## 2       27       84      11  20       29       31        9   3.06   90   10
## 3       27       86      11  31       30       20        8   2.82   80   12
## 4       NA       96       3  22       29       31       15   3.32   96    4
## 5       NA       96       3   9       29       26       33   3.76   95    5
## 6       NA       87       6  11       28       41       13   3.44   90    5
##   enroll api.stu
## 1   1278    1090
## 2   1113     840
## 3    546     472
## 4    330     272
## 5    233     216
## 6    276     247
with(apipop, plot(meals, api00, col = "grey", las = 1, 
                  main = "apipop population data and trend line",
                  xlab = "meals (% of students eligible for subsidized meals)",
                  ylab = "api00 (school's Academic Performance Index in 2000)"))
abline(lm(api00 ~ meals, data = apipop), col = "grey50", lwd = 3)

lm(api00 ~ meals, data = apipop)
## 
## Call:
## lm(formula = api00 ~ meals, data = apipop)
## 
## Coefficients:
## (Intercept)        meals  
##      831.88        -3.48

SRSWOR

First, we illustrate the basic “split conformal inference” approach, without any special survey design features.

We will take a simple random sample (without replacement) of 400 schools, then split it into 200 training and 200 calibration units. Then we will:

  • train a linear model on the training set;
  • make predictions and find absolute residuals on the calibration set;
  • find the 90% conformal quantile of these calibration nonconformity scores; and
  • report conformal 90% PIs for a test case (or, in our example, for the whole population).
set.seed(20240609)
# Take the full sample of 400 units
srs.df <- apipop[sample(nrow(apipop), 400), ]
# Split it into 200 training units and 200 calibration units
ii.train <- sample(400, 200)
srs.train <- srs.df[ii.train, ]
srs.calib <- srs.df[-ii.train, ]

# Fit a model to the training set
srs.lm.train <- lm(api00 ~ meals, data = srs.train)
srs.lm.train
## 
## Call:
## lm(formula = api00 ~ meals, data = srs.train)
## 
## Coefficients:
## (Intercept)        meals  
##     823.742       -3.436
# Compute the trained model's predictions (yhat)
#   for each school in the calibration set
srs.calib.yhat <- as.vector(predict(srs.lm.train, srs.calib))
# Then save the corresponding absolute residuals:
#   these are our nonconformity scores
srs.calib.absres <- abs(srs.calib$api00 - srs.calib.yhat)

# Now, find the 90% conformal quantile of these absolute residuals;
# this will be the 90% PI half-width or "margin of error" for our conformal PIs.
(srs.confMOE.90 <- q_wtd_conf(y = srs.calib.absres, p = 0.9))
## [1] 112.8019
hist(srs.calib.absres, 15)
abline(v = srs.confMOE.90, lty = 2, col = "red")

Finally, show our trained-model-plus/minus-PI on a plot of the full population:

# Finally, show our trained-model-plus/minus-PI on a plot of the full population
with(apipop, plot(meals, api00, col = "grey", las = 1, main = "SRSWOR"))
abline(lm(api00 ~ meals, data = apipop), col = "grey50", lwd = 3)
with(srs.train, points(meals, api00, col = "black", pch = 16))
abline(srs.lm.train, col = "black", lwd = 2, lty = 2)
with(srs.calib, points(meals, api00, col = "red", pch = 16))
abline(a = srs.lm.train$coefficients[1] - srs.confMOE.90,
       b = srs.lm.train$coefficients[2], col = "red", lwd = 2, lty = 2)
abline(a = srs.lm.train$coefficients[1] + srs.confMOE.90,
       b = srs.lm.train$coefficients[2], col = "red", lwd = 2, lty = 2)
legend("topright",
       legend = c("Population data and trend",
                  "Training sample and fitted trend",
                  "Calibration sample and PI band"),
       lty = c(1,2,2), col = c("grey", "black", "red"), pch = c(1, 16, 16))
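
Because apipop is a full census, we can also sanity-check the coverage of these 90% PIs against the whole population (a check that would not be possible with a real survey, where the population is unobserved). A minimal sketch:

# What fraction of all schools fall inside the conformal 90% PI band?
# We expect this to be roughly 0.90; NAs in the data are dropped.
apipop.yhat <- as.vector(predict(srs.lm.train, apipop))
mean(abs(apipop$api00 - apipop.yhat) <= srs.confMOE.90, na.rm = TRUE)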

PPS sampling

Now we repeat the same process as above, but using a PPS design with unequal sampling probabilities that are proportional to api.stu, the number of students tested at that school on the API.

Everything works just as before, except that when we calculate the conformal quantiles using q_wtd_conf(), we will also pass in (1) the sampling weights for all calibration residuals and (2) the sampling weight that each test case would have had, if it had been sampled.

However, we no longer have a single conformal 90% PI band for the whole population, because each test case has its own sampling weight. For illustration purposes here, we will find the 90% PI widths for the smallest and largest sampling weights. In practice, you would use the appropriate sampling weights for each desired test case; or you could use the largest sampling weight to get a conservative PI band for the whole population.

Tibshirani et al. provide their own R code on GitHub for weighted conformal methods, but unfortunately their functions do not allow the weights to be used in the model-fitting step, so they are not appropriate for design-based inference.

set.seed(20240609)
# Take the full sample of 400 units,
# with probability proportional to api.stu
pps.df <- apipop[sample(nrow(apipop), 400, prob = apipop$api.stu), ]

# Split it into 200 training units and 200 calibration units,
# using surveyCV::folds.svy()
folds.pps <- surveyCV::folds.svy(pps.df, nfolds = 2)

# Save the training set as a survey design object, then fit a svyglm() model
pps.design.train <- svydesign(id = ~1, data = pps.df, probs = ~api.stu) |> 
  subset(folds.pps == 1)
pps.lm.train <- svyglm(api00 ~ meals, design = pps.design.train)
pps.lm.train
## Independent Sampling design (with replacement)
## subset(svydesign(id = ~1, data = pps.df, probs = ~api.stu), folds.pps == 
##     1)
## 
## Call:  svyglm(formula = api00 ~ meals, design = pps.design.train)
## 
## Coefficients:
## (Intercept)        meals  
##     837.524       -3.475  
## 
## Degrees of Freedom: 199 Total (i.e. Null);  198 Residual
## Null Deviance:       3359000 
## Residual Deviance: 1018000   AIC: 2325
# Save the calibration set as a dataframe
pps.calib <- pps.df[folds.pps == 2, ]


# Compute the trained model's predictions (yhat)
#   for each school in the calibration set
pps.calib.yhat <- as.vector(predict(pps.lm.train, pps.calib))
# Then save the corresponding absolute residuals:
#   these are our nonconformity scores
pps.calib.absres <- abs(pps.calib$api00 - pps.calib.yhat)

# Now, find the 90% conformal quantile of these absolute residuals;
# this will be the 90% PI half-width or "margin of error" for our conformal PIs.
# However, this time (1) all the calibration residuals are weighted,
# and (2) the test cases each have their own weights too...
# so for brevity, we just show the MOE for the smallest and largest test weights.
(pps.confMOE.90.low  <- q_wtd_conf(y = pps.calib.absres, p = 0.9,
                                   w = 1/pps.calib$api.stu,
                                   w_test = 1/max(apipop$api.stu)))
## [1] 112.1503
(pps.confMOE.90.high <- q_wtd_conf(y = pps.calib.absres, p = 0.9,
                                   w = 1/pps.calib$api.stu,
                                   w_test = 1/min(apipop$api.stu)))
## [1] 121.0974
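
To make this concrete for a single test case, here is a minimal sketch of a 90% PI for one hypothetical test school, taking meals = 50 and api.stu = 500 as made-up illustrative values:

# A 90% conformal PI for one hypothetical test school,
# using that school's own sampling weight as w_test
test.school <- data.frame(meals = 50, api.stu = 500)
test.yhat <- as.vector(predict(pps.lm.train, test.school))
test.MOE  <- q_wtd_conf(y = pps.calib.absres, p = 0.9,
                        w = 1/pps.calib$api.stu,
                        w_test = 1/test.school$api.stu)
c(lower = test.yhat - test.MOE, upper = test.yhat + test.MOE)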

Finally, show our trained-model-plus/minus-PI on a plot of the full population:

# Finally, show our trained-model-plus/minus-PI on a plot of the full population
with(apipop, plot(meals, api00, col = "grey", las = 1, main = "PPS Sampling"))
abline(lm(api00 ~ meals, data = apipop), col = "grey50", lwd = 3)
with(pps.design.train$variables, points(meals, api00, col = "black", pch = 16))
abline(pps.lm.train, col = "black", lwd = 2, lty = 2)
with(pps.calib, points(meals, api00, col = "red", pch = 16, cex = api.stu/max(api.stu)*3))
abline(a = pps.lm.train$coefficients[1] - pps.confMOE.90.low,
       b = pps.lm.train$coefficients[2], col = "orange", lwd = 2, lty = 2)
abline(a = pps.lm.train$coefficients[1] + pps.confMOE.90.low,
       b = pps.lm.train$coefficients[2], col = "orange", lwd = 2, lty = 2)
abline(a = pps.lm.train$coefficients[1] - pps.confMOE.90.high,
       b = pps.lm.train$coefficients[2], col = "red4", lwd = 2, lty = 2)
abline(a = pps.lm.train$coefficients[1] + pps.confMOE.90.high,
       b = pps.lm.train$coefficients[2], col = "red4", lwd = 2, lty = 2)
legend("topright",
       legend = c("Population data and trend",
                  "Training sample and fitted trend",
                  "Calibration sample, sized by samp. weight",
                  "PI band for min test weight",
                  "PI band for max test weight"),
       lty = c(1,2,NA,2,2),
       col = c("grey", "black", "red", "orange", "red4"),
       pch = c(1, 16, 16, NA, NA))

Stratified sampling

For stratified sampling, we treat each stratum as its own population and carry out conformal inference separately within each stratum.

We illustrate here by treating elementary, middle, and high schools as 3 strata.

For split conformal inference, we can train a single model on the proper training set, using survey regression techniques that account for stratification. We separate the strata only when using the calibration set, to find a different conformal quantile in each stratum. (Alternatively, we could train separate models within each stratum, depending on the purposes of the study.)

set.seed(20240609)
# How many schools of each type are there? (elementary, high, middle)
table(apipop$stype)
## 
##    E    H    M 
## 4421  755 1018
stratSizesPop <- as.vector(table(apipop$stype))

# Take a self-weighting stratified sample of 400 units:
(stratSizesPop/sum(stratSizesPop)*400) |> round(1)
## [1] 285.5  48.8  65.7
# i.e., 285 elementary, 49 high, and 66 middle (rounded so the total stays 400).
stratSizesSample <- c(285, 49, 66)


# Take the stratified sample of 400 units: 285 elementary, 49 high, and 66 middle,
# using sampling::strata(), which requires the data to be sorted by stratum...
apipop_sort <- apipop[order(apipop$stype), ]
strat.df <- sampling::getdata(apipop_sort,
                    sampling::strata(apipop_sort, stratanames = "stype",
                        size = stratSizesSample, method = "srswor"))

# Split it into 200 training units and 200 calibration units,
# using surveyCV::folds.svy() which will respect the stratified structure
# (splitting into 2 folds independently within each stratum)
folds.strat <- surveyCV::folds.svy(strat.df, nfolds = 2, strataID = "stype")

# Save the training set as a survey design object, then fit a svyglm() model
strat.design.train <- svydesign(id = ~1, data = strat.df, strata = ~stype) |> 
  subset(folds.strat == 1)
## Warning in svydesign.default(id = ~1, data = strat.df, strata = ~stype): No
## weights or probabilities supplied, assuming equal probability
strat.lm.train <- svyglm(api00 ~ meals, design = strat.design.train)
strat.lm.train
## Stratified Independent Sampling design (with replacement)
## subset(svydesign(id = ~1, data = strat.df, strata = ~stype), 
##     folds.strat == 1)
## 
## Call:  svyglm(formula = api00 ~ meals, design = strat.design.train)
## 
## Coefficients:
## (Intercept)        meals  
##     839.355       -3.625  
## 
## Degrees of Freedom: 199 Total (i.e. Null);  196 Residual
## Null Deviance:       3561000 
## Residual Deviance: 1120000   AIC: 2300
# Save the calibration set as a dataframe
strat.calib <- strat.df[folds.strat == 2, ]



## SEPARATELY BY STRATUM:
# Make predictions on the calibration data for that stratum,
# find nonconformity scores,
# and compute their conformal quantile.
ss.confMOE.90 <- rep(NA, 3)
for(ss in 1:3) {
  stratum <- c("E", "M", "H")[ss]
  ss.calib <- subset(strat.calib, stype == stratum)
  # Compute the trained model's predictions (yhat)
  #   for each school in the subsample from the calibration set
  ss.calib.yhat <- as.vector(predict(strat.lm.train, ss.calib))
  # Then save the corresponding absolute residuals:
  #   these are our nonconformity scores
  ss.calib.absres <- abs(ss.calib$api00 - ss.calib.yhat)
  # Now, find the 90% conformal quantile of these absolute residuals;
  # this will be the 90% PI half-width or "margin of error" for our conformal PIs
  # IN THIS STRATUM.
  ss.confMOE.90[ss] <- q_wtd_conf(y = ss.calib.absres, p = 0.9)
}
ss.confMOE.90
## [1]  97.1433 100.5030 156.4872
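
A test case's PI then uses the conformal MOE for its own stratum. As a minimal sketch, a 90% PI for a hypothetical new high school with the made-up value meals = 50 (high schools, "H", are the 3rd entry of ss.confMOE.90):

# A 90% conformal PI for one hypothetical high-school test case
test.yhat <- as.vector(predict(strat.lm.train, data.frame(meals = 50)))
c(lower = test.yhat - ss.confMOE.90[3], upper = test.yhat + ss.confMOE.90[3])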

Finally, show our trained-model-plus/minus-PI on a plot of the full population:

# Finally, show our trained-model-plus/minus-PI on a plot of the full population
with(apipop, plot(meals, api00, col = "grey", las = 1, main = "Stratified Sampling"))
abline(lm(api00 ~ meals, data = apipop), col = "grey50", lwd = 3)
with(strat.design.train$variables, points(meals, api00, col = "black", pch = 16))
abline(strat.lm.train, col = "black", lwd = 2, lty = 2)

with(subset(strat.calib, stype == "E"), points(meals, api00, col = "red", pch = "E"))
abline(a = strat.lm.train$coefficients[1] - ss.confMOE.90[1],
       b = strat.lm.train$coefficients[2], col = "red", lwd = 2, lty = 2)
abline(a = strat.lm.train$coefficients[1] + ss.confMOE.90[1],
       b = strat.lm.train$coefficients[2], col = "red", lwd = 2, lty = 2)

with(subset(strat.calib, stype == "M"), points(meals, api00, col = "blue", pch = "M"))
abline(a = strat.lm.train$coefficients[1] - ss.confMOE.90[2],
       b = strat.lm.train$coefficients[2], col = "blue", lwd = 2, lty = 2)
abline(a = strat.lm.train$coefficients[1] + ss.confMOE.90[2],
       b = strat.lm.train$coefficients[2], col = "blue", lwd = 2, lty = 2)

with(subset(strat.calib, stype == "H"), points(meals, api00, col = "purple", pch = "H"))
abline(a = strat.lm.train$coefficients[1] - ss.confMOE.90[3],
       b = strat.lm.train$coefficients[2], col = "purple", lwd = 2, lty = 2)
abline(a = strat.lm.train$coefficients[1] + ss.confMOE.90[3],
       b = strat.lm.train$coefficients[2], col = "purple", lwd = 2, lty = 2)

legend("topright",
       legend = c("Population data and trend",
                  "Training sample and fitted trend",
                  "Elementary calib. sample and PI band",
                  "Middle-school calib. sample and PI band",
                  "High-school calib. sample and PI band"),
       lty = c(1,2,2,2,2),
       col = c("grey", "black", "red", "blue", "purple"),
       pch = c(1, 16, 69, 77, 72))

Cluster sampling

For equal-probability cluster sampling, we illustrate the “subsampling once” method of Dunn et al.: for each PSU in the calibration set, we sample one ultimate unit (school) from that PSU uniformly at random.

This is a simple (although statistically inefficient) way to ensure exchangeability among the calibration and test cases. After subsampling once, each remaining school is exchangeable with the others, and also with the test case if we assume the test case is also chosen by the method “randomly sample one PSU, then randomly select one unit from the chosen PSU.” Thanks to this exchangeability, we can use standard conformal methods on the subsampled calibration residuals.

Dunn et al. provide R code on GitHub for other conformal methods for hierarchical data, but it is not yet clear how best to apply those methods with design-based cluster sampling.

# How many school districts (PSUs) are there?
length(unique(apipop$dnum))
## [1] 757
# Aiming for around n=400 ultimate units again,
# let us sample 33 PSUs for model fitting,
# then a calibration set of another 33 PSUs.

# Equivalently, we will instead sample 66 PSUs, then split into 2 folds using `surveyCV`
# (since that is more similar to real survey practice:
#  we would collect the whole dataset first,
#  then split it into training vs calibration *after* data collection)

set.seed(20240609)
clus.PSUs <- sample(unique(apipop$dnum), size = 66)
clus.df <- subset(apipop, dnum %in% clus.PSUs)
# Now we have sampled 66 PSUs, containing a total of 404 schools
length(unique(clus.df$dnum))
## [1] 66
nrow(clus.df)
## [1] 404
# We split them into two folds: training and calibration,
# using surveyCV::folds.svy() which will respect the cluster-sample structure
# (splitting into folds at the PSU level, not ultimate unit level,
#  so that all schools from the same PSU are in the same fold)
folds.clus <- surveyCV::folds.svy(clus.df, nfolds = 2, clusterID = "dnum")

# Save the training set as a survey design object, then fit a svyglm() model
clus.design.train <- svydesign(id = ~dnum, data = clus.df) |> 
  subset(folds.clus == 1)
## Warning in svydesign.default(id = ~dnum, data = clus.df): No weights or
## probabilities supplied, assuming equal probability
clus.lm.train <- svyglm(api00 ~ meals, design = clus.design.train)
clus.lm.train
## 1 - level Cluster Sampling design (with replacement)
## With (33) clusters.
## subset(svydesign(id = ~dnum, data = clus.df), folds.clus == 1)
## 
## Call:  svyglm(formula = api00 ~ meals, design = clus.design.train)
## 
## Coefficients:
## (Intercept)        meals  
##     813.144       -3.145  
## 
## Degrees of Freedom: 164 Total (i.e. Null);  31 Residual
## Null Deviance:       2042000 
## Residual Deviance: 827100    AIC: 1880
# Save the calibration set as a dataframe
clus.df.calib <- clus.df[folds.clus == 2, ]
# Subsample to one unit per PSU:
# Split by cluster ID (dnum or school district number).
# Then sample one row from each cluster,
#   which creates a list of single-row dataframes.
# Finally, collapse the list into one big dataframe.
clus.df.subsample <-
  split(clus.df.calib, f = ~dnum) |> 
  lapply(FUN = function(df) df[sample(nrow(df),1), ]) |> 
  do.call(what = "rbind")

# Compute the trained model's predictions (yhat)
#   for each school in the subsample from the calibration set
clus.calib.yhat <- as.vector(predict(clus.lm.train, clus.df.subsample))
# Then save the corresponding absolute residuals:
#   these are our nonconformity scores
clus.calib.absres <- abs(clus.df.subsample$api00 - clus.calib.yhat)

# Now, find the 90% conformal quantile of these absolute residuals;
# this will be the 90% PI half-width or "margin of error" for our conformal PIs.
(clus.confMOE.90 <- q_wtd_conf(y = clus.calib.absres, p = 0.9))
## [1] 143.4148
hist(clus.calib.absres, 15)
abline(v = clus.confMOE.90, lty = 2, col = "red")
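
The resulting PI applies to a test school assumed to be drawn by the same mechanism: one random PSU, then one random school within it. As a minimal sketch, a 90% PI for a hypothetical test school with the made-up value meals = 50:

# A 90% conformal PI for one hypothetical test school
test.yhat <- as.vector(predict(clus.lm.train, data.frame(meals = 50)))
c(lower = test.yhat - clus.confMOE.90, upper = test.yhat + clus.confMOE.90)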

Finally, show our trained-model-plus/minus-PI on a plot of the full population:

# Finally, show our trained-model-plus/minus-PI on a plot of the full population
with(apipop, plot(meals, api00, col = "grey", las = 1, main = "Cluster Sampling"))
abline(lm(api00 ~ meals, data = apipop), col = "grey50", lwd = 3)
with(clus.design.train$variables, points(meals, api00, col = "black", pch = 16))
abline(clus.lm.train, col = "black", lwd = 2, lty = 2)
with(clus.df.calib, points(meals, api00, col = "maroon1", pch = 16))
with(clus.df.subsample, points(meals, api00, col = "red", pch = 16, cex = 2))
abline(a = clus.lm.train$coefficients[1] - clus.confMOE.90,
       b = clus.lm.train$coefficients[2], col = "red", lwd = 2, lty = 2)
abline(a = clus.lm.train$coefficients[1] + clus.confMOE.90,
       b = clus.lm.train$coefficients[2], col = "red", lwd = 2, lty = 2)
legend("topright",
       legend = c("Population data and trend",
                  "Training sample and fitted trend",
                  "Calibration sample",
                  "Calibration subsample and PI band"),
       lty = c(1,2,NA,2), col = c("grey", "black", "maroon1", "red"), pch = c(1, 16, 16, 16))