Below, we discuss some R code for illustrating how to implement the nonparametric Bayes estimator for the covariate-adjusted Youden index and corresponding optimal cutoff, as proposed by Inácio de Carvalho, de Carvalho, and Branscum (2017); for the interpretation of the quantities computed in this script, the reader is referred to the article. Before running the code chunks below, start by cleaning workspace and install the following packages (if not installed).

rm(list = ls())
if (!require("splines")) install.packages("splines")
if (!require("Hmisc")) install.packages("Hmisc")
if (!require("MASS")) install.packages("MASS")

For reproducibility reasons, we fix setseed and list below the information about R, the OS, and loaded packages:

set.seed(1)
sessionInfo()
## R version 3.3.2 (2016-10-31)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] splines   stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] MASS_7.3-45     Hmisc_3.17-4    ggplot2_2.1.0   Formula_1.2-1  
## [5] survival_2.39-5 lattice_0.20-34
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.5         cluster_2.0.5       knitr_1.15         
##  [4] magrittr_1.5        munsell_0.4.3       colorspace_1.2-6   
##  [7] stringr_1.0.0       plyr_1.8.4          tools_3.3.2        
## [10] nnet_7.3-12         grid_3.3.2          data.table_1.9.6   
## [13] gtable_0.2.0        latticeExtra_0.6-28 htmltools_0.3.5    
## [16] yaml_2.1.13         assertthat_0.1      digest_0.6.9       
## [19] tibble_1.2          Matrix_1.2-7.1      gridExtra_2.2.1    
## [22] RColorBrewer_1.1-2  acepack_1.3-3.3     rpart_4.1-10       
## [25] evaluate_0.10       rmarkdown_1.1       stringi_1.1.1      
## [28] scales_0.4.0        chron_2.3-47        foreign_0.8-67

In the code chunks below, we follow the 80 characters per line standard.

Workhorse bsplinesddp Function

The key function for fitting the covariate-adjusted Youden index as in Inácio de Carvalho, de Carvalho, and Branscum (2017) is:

bsplinesddp <- function(y, x, grid, xpred, m, S, nu, psi, atau, btau, 
                        alpha, L, nsim, knots) {
  yt <- y / sd(y)
  n <- length(y)
  ngrid <- length(grid)
  npred <- length(xpred)
  X <- bs(x, degree = 3, knots = knots, intercept = TRUE)
  k <- ncol(X)
  Xpred <- predict(bs(x, degree = 3, knots = knots, intercept = TRUE), 
                   xpred)
  
  p <- ns <- rep(0, L)
  v <- rep(1 / L, L)
  v[L] <- 1
  
  beta <- matrix(0, nrow = L, ncol = k)
  tau <- rep(1 / var(yt), L)
  
  prop <- prob <- matrix(0, nrow = n, ncol = L)
  
  P <- Tau <- Sigma2 <- matrix(0, nrow = nsim, ncol = L)
  Beta <- Beta1 <- array(0, c(nsim, L, k))
  Beta[1, , ] <- beta 
  Tau[1, ] <- tau
  
  mu <- matrix(0, nrow = nsim, ncol = k)
  Sigmainv <- array(0, c(nsim, k, k))
  mu[1, ] <- mvrnorm(1, mu = m, Sigma = S)
  Sigmainv[1, , ] <- rWishart(1, df = nu, solve(nu * psi))
  
  Dens <- array(0, c(nsim, ngrid, L, npred)) 
  Densm <- array(0, c(nsim, ngrid, npred)) 
  Fdist <- array(0, c(nsim, ngrid, L, npred)) 
  Fdistm <- array(0, c(nsim, ngrid, npred))
  
  ## 1) ALLOCATE EACH OBSERVATION TO A COMPONENT MIXTURE
  for(i in 2:nsim) { 
    cumv <- cumprod(1 - v)
  
    p[1] <- v[1] 
    for(l in 2:L)
      p[l] <- v[l] * cumv[l - 1]
    for(l in 1:L)
      prop[, l] <- p[l] * dnorm(yt, mean = X %*% beta[l, ], 
                                sd = sqrt(1 / tau[l]))
    prob <- prop / apply(prop, 1, sum)
    z <- rMultinom(prob, 1)
    P[i, ] <- p
    
    for(l in 1:L) 
      ns[l] <- length(which(z == l))

    ## 2) UPDATE STICK-BREAKING WEIGHTS
    for(l in 1:(L - 1)) 
      v[l] <- rbeta(1, 1 + ns[l], alpha + sum(ns[(l + 1):L]))
  
    ## 3) UPDATE PARAMETERS OF EACH COMPONENT MIXTURE
    for(l in 1:L) {
      tX <- matrix(t(X[z == l, ]), nrow = k, ncol = ns[l])
      V <- solve(Sigmainv[i - 1, ,] + tau[l] * tX %*% X[z == l, ])  
      mu1 <- V %*% (Sigmainv[i - 1, , ] %*% mu[i - 1, ] + tau[l] * 
                    tX %*% yt[z == l])
      Beta[i, l, ] <- beta[l,] <- mvrnorm(1, mu = mu1, Sigma = V)
      Beta1[i, l, ] <- sd(y) * Beta[i, l, ]
      
      Tau[i, l] <- tau[l] <- rgamma(1, shape = atau + (ns[l] / 2), 
                                   rate = btau + 0.5 * (t(yt[z == l] - 
                                   X[z == l, ] %*% beta[l, ]) %*% 
                                   (yt[z == l] - X[z == l, ] 
                                   %*% beta[l, ])))
      Sigma2[i, l] <- var(y) * (1 / Tau[i, l])
    }
  
    Vaux <- solve(solve(S) + L * Sigmainv[i - 1, , ])
    meanmu <- Vaux %*% (solve(S) %*% m + Sigmainv[i - 1, , ] %*% 
                     t(t(apply(Beta[i, , ], 2, sum))))
    mu[i, ] <- mvrnorm(1, mu = meanmu, Sigma = Vaux)
    
    Vaux1 <- 0
    for(l in 1:L) 
      Vaux1 <- Vaux1 + (Beta[i, l, ] - mu[i, ]) %*% 
               t((Beta[i, l, ] - mu[i, ]))  
    Sigmainv[i, , ] <- rWishart(1, nu + L, solve(nu * psi + Vaux1))
  
    ## 4) COMPUTE DENSITY AND DISTRIBUTION FUNCTION TRAJECTORIES
    for(l in 1:L) {
      for(j in 1:npred) {
        Dens[i, ,l , j] <- P[i, l] * dnorm(grid, Xpred[j, ] %*% 
                                        Beta1[i, l, ], 
                                        sqrt(Sigma2[i, l]))
        Fdist[i, , l, j] <- P[i, l] * pnorm(grid, Xpred[j, ] %*% 
                                            Beta1[i, l, ], 
                                            sqrt(Sigma2[i, l]))
      }
    }
    
    for(j in 1:ngrid) {
      for(l in 1:npred) {
        Densm[i, j, l] <- sum(Dens[i, j, , l])
        Fdistm[i, j, l] <- sum(Fdist[i, j, , l])  
      }
    }
  }
  return(list(P, Beta1, Sigma2, Densm, Fdistm))
}

The arguments of the bsplinesddp function are self-explanatory from the article. To analyze the diabetes data we proceed as follows:

## setwd("Add your working directory here")
load("diabetes.Rdata")
ind0 <- which(diabetes[, 2] == 0) 
ind1 <- which(diabetes[, 2] == 1) 
n0 <- length(ind0) 
n1 <- length(ind1)
y0 <- diabetes[ind0, 1] 
y1 <- diabetes[ind1, 1] 
x0 <- diabetes[ind0, 3] 
x1 <- diabetes[ind1, 3]
var0 <- var1 <- 1

knots0 <- c()
knots1 <- c() 
nk <- length(knots0)

Next, we apply the workhorse function bsplinesddp to the pair (glucose levels, age) for diseased (x1, y1) and non-diseased subjects (x0, y0):

res0np <- bsplinesddp(y = y0, x = x0, grid = seq(50, 500, len = 200), 
                      xpred = seq(32, 76, by = 2), m = rep(0, nk + 4), 
                      S = 100 * diag(nk + 4), nu = nk + 6, 
                      psi = diag(nk + 4), atau = 0.1, btau = 0.1, alpha = 1, 
                      L = 20, nsim = 5000, knots = knots0)
res1np <- bsplinesddp(y = y1, x = x1, grid = seq(50, 500, len = 200), 
                      xpred = seq(32, 76, by = 2), m = rep(0, nk + 4), 
                      S = 100 * diag(nk + 4), nu = nk + 6, psi = diag(nk + 4), 
                      atau = 0.1, btau = 0.1, alpha = 1, L = 20, nsim = 5000, 
                      knots = knots1)

We compute the covariate-adjusted optimal cutoff and corresponding Youden index as follows:

grid <- seq(50, 500, len = 200)
xpred <- seq(32, 76, by = 2)
ngrid <- length(grid)
npred <- length(xpred)
nsim <- 5000
nburn <- 1500

difcnp <- array(0, c(nsim - nburn, ngrid, npred))
for(k in 1:npred)
  for(j in 1:ngrid)
  difcnp[, j, k] <- res0np[[5]][(nburn + 1):nsim, j, k] - 
                    res1np[[5]][(nburn + 1):nsim, j, k]

coptcnp <- matrix(0, nrow = nsim - nburn, ncol = npred)
for(k in 1:npred)
  for(j in 1:(nsim - nburn))
    coptcnp[j, k] = mean(grid[which(difcnp[j, , k] == max(difcnp[j, , k]))])

coptcrnp <- matrix(nrow = npred, ncol = 3)
for(j in 1:npred) {
  coptcrnp[j, 1] <- quantile(coptcnp[, j], 0.025)  
  coptcrnp[j, 2] <- mean(coptcnp[, j])  
  coptcrnp[j, 3] <- quantile(coptcnp[, j], 0.975) 
}

yicnp <- matrix(0, nrow = nsim - nburn, ncol = npred)
for(k in 1:npred)
  for(j in 1:(nsim - nburn))
    yicnp[j,k] <- max(difcnp[j, , k])

yicrnp <- matrix(nrow = npred,ncol = 3)
for(j in 1:npred) {
  yicrnp[j, 1] <- quantile(yicnp[, j], 0.025)  
  yicrnp[j, 2] <- mean(yicnp[, j])  
  yicrnp[j, 3] <- quantile(yicnp[, j], 0.975) 
}

Finally, we report the results:

par(mfrow = c(1, 2))
plot(xpred, coptcrnp[, 2], type = "l", xlim = c(32, 76), ylim = c(100, 200), 
     xlab = "Age", ylab = "Optimal cutoff", lwd = 2)
polygon(x = c(rev(xpred), xpred), y = c(rev(coptcrnp[, 1]), coptcrnp[, 3]),
        border = NA, col = "lightgray")
lines(xpred, coptcrnp[, 1], lwd = 0.3)
lines(xpred, coptcrnp[, 2], lwd = 2)
lines(xpred, coptcrnp[, 3], lwd = 0.3)

plot(xpred, yicrnp[, 2], type = "l", xlim = c(32, 76), ylim = c(0, 1), 
     xlab = "Age", ylab = "Youden index", lwd = 2)
polygon(x = c(rev(xpred), xpred), y = c(rev(yicrnp[, 1]), yicrnp[, 3]),
        border = NA, col = "lightgray")
lines(xpred, coptcrnp[, 1], lwd = 0.3)
lines(xpred, yicrnp[, 2], lwd = 2)
lines(xpred, yicrnp[, 3], lwd = 0.3)
Fig. 1: Covariate-adjusted optimal cutoff and corresponding Youden index

Fig. 1: Covariate-adjusted optimal cutoff and corresponding Youden index

The Log PseudoMarginal Likelihood (LPML) was computed as follows:

L <- 20
X0 <- bs(x0, degree = 3, knots = knots0, intercept = TRUE)

term0 <- array(0, c(nsim - nburn, L, n0))
for(i in 1:n0)
  for(k in 1:(nsim-nburn))
    for(l in 1:L)
      term0[k, l, i] <- res0np[[1]][k + nburn,l] * 
                        dnorm(y0[i], mean = X0[i,] %*% 
                              res0np[[2]][k + nburn, l,], 
                              sd = sqrt(res0np[[3]][k + nburn, l]))  
  
termsum0 <- matrix(0, nrow = nsim - nburn, ncol = n0)
for(i in 1:n0)
  for(k in 1:(nsim-nburn))
    termsum0[k, i] <- sum(term0[k, , i])  

cpoinv0 <- numeric(n0)
for(i in 1:n0)
  cpoinv0[i] <- mean(1 / termsum0[, i])  

cpo0 <- 1/cpoinv0
lpml0 <- sum(log(cpo0))

And we thus get that the value of lpml0 is -878.18.


References

Inácio de Carvalho, V., de Carvalho, M., and Branscum, A. (2017), “Nonparametric Bayesian Regression Analysis of the Youden Index,” Biometrics, 73, 1279–1288.