# modified version of CoxBoost in order to allow to use an offset. # will be included in the next version of CoxBoost (communication from H. Binder) efron.weightmat <- function(time,status) { n <- length(time) uncens <- which(status == 1) weightmat <- matrix(0,n,length(uncens)) rept <- rep(0,n) for (i in 1:n) rept[i] <- sum(time[i:n]==time[i] & status[i:n]==1) for (i in 1:length(uncens)) { weightmat[time >= time[uncens][i] | (status != 0 & status != 1), i] <- 1 tie <- time == time[uncens[i]] & status==1 di <- max(rept[tie]) weightmat[tie, i] <- weightmat[tie, i] - (di - rept[uncens[i]])/di } # check for competing risks scenario if (any(status != 0 & status != 1)) { cens.ind <- ifelse(status == 0,1,0) invcensprob <- rep(NA,length(time)) invcensprob[order(time)] <- summary(survival::survfit(Surv(time,cens.ind) ~ 1),times=sort(time))$surv for (i in 1:length(uncens)) { current.invcens <- invcensprob[uncens][i] weightmat[,i] <- weightmat[,i] * current.invcens/ifelse(time < time[uncens][i],invcensprob,current.invcens) } } weightmat } CoxBoostMod <- function(time,status,x,unpen.index=NULL,standardize=TRUE,subset=1:length(time), weights=NULL,stepno=100,penalty=9*sum(status[subset]==1), criterion=c("pscore","score","hpscore","hscore"), stepsize.factor=1,sf.scheme=c("sigmoid","linear"),pendistmat=NULL,connected.index=NULL, x.is.01=FALSE,return.score=TRUE,trace=FALSE, offset=NULL) { sf.scheme <- match.arg(sf.scheme) criterion <- match.arg(criterion) if (any(is.na(x))) { stop("'x' may not contain missing values") } object <- list() # reduce response to subset time <- time[subset] status <- status[subset] # reorder observations according to time to speed up computations object$time <- time object$status <- status time.order <- order(time,decreasing=TRUE) subset.time.order <- (1:nrow(x))[subset][time.order] reverse.time.order <- match(seq(along=time),time.order) status <- status[time.order] time <- time[time.order] if (!is.null(weights)) weights <- weights[subset][time.order] object$stepno <- stepno object$unpen.index <- unpen.index pen.index <- 1:ncol(x) if (!is.null(unpen.index)) pen.index <- pen.index[-unpen.index] if (length(penalty) < length(pen.index)) penalty <- rep(penalty[1],length(pen.index)) if (any(stepsize.factor != 1)) { object$penalty <- matrix(NA,stepno,length(penalty)) } else { object$penalty <- penalty } if (is.null(colnames(x))) { object$xnames <- paste("V",1:ncol(x),sep="") } else { object$xnames <- colnames(x) } if (!is.null(connected.index) && any(connected.index %in% unpen.index)) { stop("sets of unpenalized and connected covariates may not overlap") } if (!is.null(pendistmat) && is.null(connected.index)) { if (ncol(pendistmat) == ncol(x) - length(unpen.index)) { if (!is.null(unpen.index)) { connected.index <- (1:ncol(x))[-unpen.index] } else { connected.index <- 1:ncol(x) } } else { stop("'connected.index' is missing and cannot be guessed") } } if (!is.null(unpen.index)) { if (!is.null(connected.index)) connected.index <- match(connected.index,(1:ncol(x))[-unpen.index]) unpen.x <- x[subset.time.order,unpen.index,drop=FALSE] } uncens <- which(status == 1) n <- length(status) n.uncens <- length(uncens) p <- length(pen.index) penpos <- match(1:p,connected.index) object$n <- n object$p <- p object$event.times <- sort(unique(time[uncens])) object$coefficients <- Matrix(0,stepno+1,p) if (!is.null(unpen.index)) { unpen.coefficients <- matrix(NA,stepno+1,ncol(unpen.x)) } else { unpen.coefficients <- NULL } object$linear.predictor <- matrix(NA,stepno+1,n) object$meanx <- rep(0,length(object$xnames)) object$sdx <- rep(1,length(object$xnames)) if (standardize) { pen.sdx <- apply(x[subset,pen.index],2,sd) pen.sdx <- ifelse(pen.sdx == 0,1,pen.sdx) pen.meanx <- apply(x[subset,pen.index],2,mean) x[subset,pen.index] <- scale(x[subset,pen.index],center=pen.meanx,scale=pen.sdx) object$meanx[pen.index] <- pen.meanx object$sdx[pen.index] <- pen.sdx object$standardize <- TRUE } else { object$standardize <- FALSE } object$Lambda <- matrix(NA,stepno+1,length(object$event.times)) if (return.score) object$scoremat <- matrix(NA,max(1,stepno),object$p) # Efron handling of ties weightmat <- efron.weightmat(time,status) if (!is.null(weights)) weightmat <- weightmat*weights actual.beta <- rep(0,p) if (!is.null(unpen.index)) actual.unpen.beta <- rep(0,ncol(unpen.x)) { if (is.null(offset)) actual.linear.predictor <- rep(0,n) else actual.linear.predictor<-offset} actual.risk.score <- rep(1,n) ml.fraction <- rep(0,p) # boosting iterations x.double.vec <- as.double(x[subset.time.order,pen.index]) weight.double.vec <- as.double(weightmat) max.nz.vec <- as.integer(apply(weightmat,2,function(arg) max(which(arg != 0)))) max.1.vec <- as.integer(c(0,rev(cummin(rev(apply(weightmat,2,function(arg) ifelse(!any(arg != 1),length(arg),min(which(arg != 1)-1)))))))) uncens.C <- as.integer(uncens - 1) warnstep <- NULL first.score <- NULL presel.index <- c() for (actual.step in 0:stepno) { if (actual.step > 0 && any(stepsize.factor != 1)) { object$penalty[stepno,] <- penalty } weightmat.times.risk <- weightmat*actual.risk.score weightmat.times.risk.sum <- colSums(weightmat.times.risk) # update unpenalized covariates by one estimation step if (!is.null(unpen.index)) { if (actual.step == 1) { # calculations from step zero do not have to be repeated unpen.coefficients[actual.step+1,] <- actual.unpen.beta } else { x.bar <- (t(weightmat.times.risk) %*% unpen.x) / weightmat.times.risk.sum U <- colSums(unpen.x[uncens,] - x.bar) I <- matrix(0,ncol(unpen.x),ncol(unpen.x)) for (i in 1:n.uncens) { x.minus.bar <- t(t(unpen.x) - x.bar[i,]) I <- I + (t(x.minus.bar*(weightmat.times.risk[,i])) %*% x.minus.bar)/weightmat.times.risk.sum[i] } unpen.beta.delta <- drop(solve(I) %*% U) actual.unpen.beta <- actual.unpen.beta + unpen.beta.delta unpen.coefficients[actual.step+1,] <- actual.unpen.beta actual.linear.predictor <- actual.linear.predictor + drop(unpen.x %*% unpen.beta.delta) actual.risk.score <- exp(drop(actual.linear.predictor)) weightmat.times.risk <- weightmat*actual.risk.score weightmat.times.risk.sum <- colSums(weightmat.times.risk) } } if (actual.step == 0) { actual.Lambda <- rep(NA,length(object$event.times)) for (i in seq(along=object$event.times)) { actual.mask <- time[uncens] <= object$event.times[i] if (is.null(weights)) { actual.Lambda[i] <- sum(1/weightmat.times.risk.sum[actual.mask]) } else { actual.Lambda[i] <- sum(weights[uncens][actual.mask]/weightmat.times.risk.sum[actual.mask]) } } object$coefficients[actual.step+1,] <- actual.beta object$linear.predictor[actual.step+1,] <- actual.linear.predictor[reverse.time.order] object$Lambda[actual.step+1,] <- actual.Lambda next } if (x.is.01) { res <- .C("find_best01", x.double.vec, as.integer(n), as.integer(p), uncens.C, as.integer(length(uncens)), as.double(actual.beta), as.double(actual.risk.score), as.double(actual.linear.predictor), weight.double.vec, max.nz.vec, max.1.vec, as.double(weightmat.times.risk), as.double(weightmat.times.risk.sum), as.double(penalty), warncount=integer(1), min.index=integer(1), min.deviance=double(1), min.beta.delta=double(1), score.vec=double(p), DUP=FALSE,NAOK=TRUE ) } else { if ((criterion != "hscore" && criterion != "hpscore") || actual.step == 1) { res <- .C("find_best", x.double.vec, as.integer(n), as.integer(p), uncens.C, as.integer(length(uncens)), as.double(actual.beta), as.double(actual.risk.score), as.double(actual.linear.predictor), weight.double.vec, max.nz.vec, max.1.vec, as.double(weightmat.times.risk), as.double(weightmat.times.risk.sum), as.double(penalty), as.integer(criterion == "pscore" || criterion == "hpscore"), warncount=integer(1), min.index=integer(1), min.deviance=double(1), min.beta.delta=double(1), score.vec=double(p), DUP=FALSE,NAOK=TRUE ) } else { res <- .C("find_best_candidate", x.double.vec, as.integer(n), as.integer(p), uncens.C, as.integer(length(uncens)), as.double(actual.beta), as.double(actual.risk.score), as.double(actual.linear.predictor), weight.double.vec, max.nz.vec, max.1.vec, as.double(weightmat.times.risk), as.double(weightmat.times.risk.sum), as.double(penalty), as.integer(criterion == "pscore" || criterion == "hpscore"), as.integer(presel.index - 1), as.integer(length(presel.index)), warncount=integer(1), min.index=integer(1), min.deviance=double(1), min.beta.delta=double(1), score.vec=double(p), DUP=FALSE,NAOK=TRUE ) min.presel.score <- min(res$score.vec[presel.index]) if (length(presel.index) < length(first.score) && min.presel.score < max(first.score[-presel.index])) { new.candidates <- sort(union(which(first.score > min.presel.score),presel.index)) res <- .C("find_best_candidate", x.double.vec, as.integer(n), as.integer(p), uncens.C, as.integer(length(uncens)), as.double(actual.beta), as.double(actual.risk.score), as.double(actual.linear.predictor), weight.double.vec, max.nz.vec, max.1.vec, as.double(weightmat.times.risk), as.double(weightmat.times.risk.sum), as.double(penalty), as.integer(criterion == "pscore"), as.integer(new.candidates - 1), as.integer(length(new.candidates)), warncount=integer(1), min.index=integer(1), min.deviance=double(1), min.beta.delta=double(1), score.vec=double(p), DUP=FALSE,NAOK=TRUE ) } } } if (is.null(warnstep) && res$warncount > 0) warnstep <- actual.step min.index <- res$min.index min.deviance <- res$min.deviance min.beta.delta <- res$min.beta.delta if (return.score) object$scoremat[actual.step,] <- res$score.vec if (criterion == "hscore" || criterion == "hpscore") { if (actual.step == 1) first.score <- res$score.vec presel.index <- sort(union(presel.index,min.index)) } #cat("selected:",min.index,"(",min.deviance,")\n") if (trace) cat(object$xnames[pen.index][min.index]," ",sep="") # update the maximum likelihood fractions needed for penalty distribution if (!is.null(pendistmat)) { actual.x.bar <- apply(weightmat*actual.risk.score*x[subset.time.order,pen.index[min.index]],2,sum)/apply(weightmat*actual.risk.score,2,sum) I <- sum(apply((weightmat*actual.risk.score)*t(t(matrix(rep(x[subset.time.order,pen.index[min.index]],n.uncens),nrow(weightmat),ncol(weightmat))) - actual.x.bar)^2,2,sum)/apply(weightmat*actual.risk.score,2,sum)) nu <- I / (I + penalty[min.index]) ml.fraction[min.index] <- ml.fraction[min.index] + (1-ml.fraction[min.index])*nu } actual.beta[min.index] <- actual.beta[min.index] + min.beta.delta #print(actual.beta) actual.linear.predictor <- actual.linear.predictor + x[subset.time.order,pen.index[min.index]]*min.beta.delta actual.risk.score <- exp(drop(actual.linear.predictor)) weightmat.times.risk <- weightmat*actual.risk.score weightmat.times.risk.sum <- colSums(weightmat.times.risk) actual.Lambda <- rep(NA,length(object$event.times)) for (i in seq(along=object$event.times)) { actual.mask <- time[uncens] <= object$event.times[i] if (is.null(weights)) { actual.Lambda[i] <- sum(1/weightmat.times.risk.sum[actual.mask]) } else { actual.Lambda[i] <- sum(weights[uncens][actual.mask]/weightmat.times.risk.sum[actual.mask]) } } object$coefficients[actual.step+1,] <- actual.beta object$linear.predictor[actual.step+1,] <- actual.linear.predictor[reverse.time.order] object$Lambda[actual.step+1,] <- actual.Lambda # update the penalty if the user has chosen any other value than the default actual.stepsize.factor <- ifelse(length(stepsize.factor) >= min.index,stepsize.factor[min.index],stepsize.factor[1]) if (actual.stepsize.factor != 1 && ml.fraction[min.index] < 1) { if (is.null(pendistmat) || (min.index %in% connected.index && any(pendistmat[penpos[min.index],] != 0))) { I.index <- min.index actual.x.bar <- apply(weightmat*actual.risk.score*x[subset.time.order,pen.index[min.index]],2,sum)/apply(weightmat*actual.risk.score,2,sum) I <- sum(apply((weightmat*actual.risk.score)*t(t(matrix(rep(x[subset.time.order,pen.index[min.index]],n.uncens),nrow(weightmat),ncol(weightmat))) - actual.x.bar)^2,2,sum)/apply(weightmat*actual.risk.score,2,sum)) if (!is.null(pendistmat)) { connected <- connected.index[which(pendistmat[penpos[min.index],] != 0)] if (length(connected) > 0) { change.index <- connected[ml.fraction[connected] < 1] I.index <- c(I.index,change.index) } } I.vec <- .C("get_I_vec", as.double(x[subset.time.order,pen.index[I.index]]), as.integer(n), as.integer(length(I.index)), as.integer(length(uncens)), as.double(weightmat.times.risk), as.double(weightmat.times.risk.sum), I.vec=double(length(I.index)), DUP=FALSE )$I.vec old.penalty <- penalty[min.index] if (sf.scheme == "sigmoid") { new.nu <- max(1 - (1-(I.vec[1]/(I.vec[1]+penalty[min.index])))^actual.stepsize.factor,0.00001) # prevent penalty -> Inf penalty[min.index] <- (1/new.nu - 1)*I } else { penalty[min.index] <- (1/actual.stepsize.factor - 1)*I.vec[1] + penalty[min.index]/actual.stepsize.factor } if (penalty[min.index] < 0) penalty[min.index] <- 0 if (length(I.vec) > 1) { if (trace) { cat("\npenalty update for ",object$xnames[pen.index][min.index]," (mlf: ",round(ml.fraction[min.index],3),"): ",old.penalty," -> ",penalty[min.index],"\n",sep="") } change.I <- I.vec[2:length(I.vec)] if (trace) cat("adjusting penalty for covariates",paste(object$xnames[pen.index][change.index],collapse=", ",sep=""),"\n") new.target.penalty <- pendistmat[penpos[min.index],penpos[change.index]]* (1 - ml.fraction[change.index])*change.I/ ((1-actual.stepsize.factor)*pendistmat[penpos[min.index],penpos[change.index]]* (1-ml.fraction[min.index])*I.vec[1]/(I.vec[1]+old.penalty) + (1-ml.fraction[change.index])*change.I/(change.I+penalty[change.index])) - change.I penalty[change.index] <- ifelse(new.target.penalty > 0,new.target.penalty,penalty[change.index]) # loop version # for (actual.target in connected) { # if (ml.fraction[actual.target] < 1) { # if (trace) cat(object$xnames[pen.index][actual.target]," (mlf: ",round(ml.fraction[actual.target],3),"): ",penalty[actual.target]," -> ",sep="") # actual.x.bar <- apply(weightmat*actual.risk.score*x[subset.time.order,pen.index[actual.target]],2,sum)/apply(weightmat*actual.risk.score,2,sum) # I.target <- sum(apply((weightmat*actual.risk.score)*t(t(matrix(rep(x[subset.time.order,pen.index[actual.target]],n.uncens),nrow(weightmat),ncol(weightmat))) - actual.x.bar)^2,2,sum)/apply(weightmat*actual.risk.score,2,sum)) # new.target.penalty <- pendistmat[penpos[min.index],penpos[actual.target]]*(1 - ml.fraction[actual.target])*I.target/ # ((1-actual.stepsize.factor)*pendistmat[penpos[min.index],penpos[actual.target]]*(1-ml.fraction[min.index])*I/(I+old.penalty) + # (1-ml.fraction[actual.target])*I.target/(I.target+penalty[actual.target])) - # I.target # if (new.target.penalty > 0) penalty[actual.target] <- new.target.penalty # if (trace) cat(penalty[actual.target],"\n") # } # } } } } } if (trace) cat("\n") if (!is.null(warnstep)) warning(paste("potentially attempted to move towards a nonexisting maximum likelihood solution around step",warnstep)) # combine penalized and unpenalized covariates if (!is.null(object$unpen.index)) { object$p <- object$p + length(object$unpen.index) combined.coefficients <- Matrix(0,nrow(object$coefficients),object$p) combined.coefficients[,pen.index] <- object$coefficients combined.coefficients[,object$unpen.index] <- unpen.coefficients object$coefficients <- combined.coefficients } # if (return.score) { # # zmat <- qnorm(pchisq(ifelse(object$scoremat>70,70,object$scoremat),df=1)) # # pmat <- 1 - pchisq(object$scoremat,df=1) # # exclude.crit <- 0.05/(nrow(pmat)*ncol(pmat)) # # exclude <- apply(pmat <= exclude.crit,2,any) # # if (all(exclude)) exclude[] <- FALSE # # covmat <- cov(t(zmat[,!exclude])) # # # covmat <- cov(t(zmat)) # # object$p.val <- 1-pnorm(colSums(zmat) / sqrt(sum(covmat))) # actual.scoremat <- object$scoremat[round(seq(from=1,to=nrow(object$scoremat),length=10)),] # pmat <- 1 - pchisq(actual.scoremat,df=1) # exclude.crit <- 0.20/(nrow(pmat)) # exclude <- apply(pmat <= exclude.crit,2,any) # if (all(exclude)) exclude[] <- FALSE # pmat <- 1 - pchisq(actual.scoremat,df=1) # # Sigma <- cor(t(pmat)) # Sigma <- cor(t(pmat[,!exclude])) # Z.chol <- t(base::chol(Sigma)) # zmat <- qnorm(pchisq(ifelse(actual.scoremat>70,70,actual.scoremat),df=1)) # # zmat <- qnorm(pchisq(actual.scoremat,df=1)) # trans.zmat <- solve(Z.chol) %*% zmat # trans.zmat[trans.zmat < -8] <- -8 # trans.zmat[trans.zmat > 8] <- 8 # trans.pmat <- 1 - pnorm(trans.zmat) # chi.stat <- -2*colSums(log(trans.pmat)) # object$p.val <- 1 - pchisq(chi.stat,df=nrow(zmat)) # } class(object) <- "CoxBoost" object$logplik <- predict(object,type="logplik") object }