### code to create table 1 and table 2 from paper 
library(mboost)
library(survival)
library(mvtnorm)
library(Rcpp)
library(RcppArmadillo)

# False positives & false negatives function
Rcpp::cppFunction('arma::rowvec FPFNSeSpLik_rcpp(arma::rowvec TrueBeta, arma::rowvec beta){
  arma::uvec equal_0 = find(beta==0 && TrueBeta==0);
  arma::uvec equal_1 = find(beta!=0 && TrueBeta!=0);
  arma::uvec not_equal_0 = find(beta!=0 && TrueBeta==0);
  arma::uvec not_equal_1 = find(beta==0 && TrueBeta!=0);
  arma::uvec beta_not_0 = find(beta!=0);
  arma::uvec tb_not_0 = find(TrueBeta!=0);
  arma::uvec tb_0 = find(TrueBeta==0);
  
  double TN = (equal_0).size();
  double TP = (equal_1).size();
  double FN = (not_equal_1).size();
  double FP = (not_equal_0).size();
  double num_selected = (beta_not_0).size();
  
  double Se = TP/(tb_not_0).size();
  double Sp = TN/(tb_0).size();
  double FDR = FP/num_selected;
  
  arma::rowvec output(6);
  output(0) = FP;
  output(1) = FN;
  output(2) = Se;
  output(3) = Sp;
  output(4) = FDR;
  output(5) = num_selected;
  
  return(output);
}', depends="RcppArmadillo")

# TABLE 1 - stratified

p=4000 # stratified
p_true=100
adj_var=0
TrueBeta <- rep(0, p)
TrueBeta_index=sample((1+adj_var):p,p_true,replace=FALSE) # start random p_true values after the adjustment variables
signbeta <- sample(c(-1,1),p_true,replace=T)
mag = 2
TrueBeta[TrueBeta_index] =mag*signbeta

row1 <- NULL
row2 <- NULL
row3 <- NULL
row4 <- NULL
row5 <- NULL
row6 <- NULL
row7 <- NULL

runtime1 = 0
runtime2 = 0
runtime3 = 0
runtime4 = 0
runtime5 = 0
runtime6 = 0
runtime7 = 0

num_it2 = 0
num_it3 = 0
num_it4 = 0
num_it5 = 0
num_it6 = 0
num_it7 = 0

for(j in 1:10){
  
  set.seed(123+j)
  
  data_sim1 <- simulate_survival_cox(true_beta=TrueBeta,
                                     base_hazard="weibull",
                                     base_hazard_scale = 1/exp(seq(1,20,length=10)),
                                     base_hazard_shape = rep(5,length=10),
                                     num_facility=10,
                                     input_facility_size=150, cov_structure="ar",
                                     block_size=10, rho=0.6, censor_dist="unif",
                                     censor_const=200, tau=Inf, normalized=F)
  z <- as.matrix(data_sim1[,-c(1,2,3)])
  time = data_sim1$time
  delta=data_sim1$delta
  facility_idx=data_sim1$facility_idx
  N = length(time)
  df <- data.frame(data_sim1$time, data_sim1$delta,  data_sim1$facility_idx, z)

  colnames(df)[1:3] <- c("time", "delta", "facility_idx")
  
  temp <- paste(colnames(z),collapse="+")
  fmla <- as.formula(paste("Surv(time,delta) ~ ", paste(c("strata(facility_idx)", temp), collapse= "+")))
  fmla_mboost <- as.formula(paste("Surv(time,delta) ~ ", paste(temp, collapse= "+")))
  
  old_time1 = proc.time()
  test1 <- boosting_core(fmla, data=df, rate=0.1, control=500)
  runtime1 <- c(runtime1, proc.time()-old_time1)
  
  # test cv version
  old_time2 = proc.time()
  mstop_cv <- cross_validation_func_update(K=5, time, delta, z, facility_idx, rate=0.1)$mstop
  test2 <- boosting_core(fmla, data=df, rate=0.1, control=mstop_cv)
  runtime2 <- c(runtime2, proc.time()-old_time2)
  
   # test num_selected method
   old_time3 = proc.time()
   test3 <-  boosting_core(fmla, data=df, rate=0.1, control_parameter=100 , control_method="num_selected")
   runtime3 <- c(runtime3, proc.time()-old_time3)

   # test likelihood method 
   old_time4 = proc.time()
   test4 <- boosting_core(fmla, data=df, rate=0.1, control_parameter=.001 , control_method="likelihood")
   runtime4 <- c(runtime4, proc.time()-old_time4)

   # test bic method
   old_time5 = proc.time()
   mstop_bic <- boosting_core(fmla, data=df, rate=0.1, control_parameter=FALSE, control_method="BIC")$mstop
   test5 <- boosting_core(fmla, data=df, rate=0.1, control=mstop_bic)
   runtime5 <- c(runtime5, proc.time()-old_time5)

   # test 6 mboost comparison
   dat <- data.frame(cbind(z,time,delta))
   old_time6 = proc.time()
   fit_mboost_cvrisk = glmboost(fmla_mboost,data=dat, family=CoxPH(), control=boost_control(mstop=5000))
   cv_folds <- cv(model.weights(fit_mboost_cvrisk), type = "kfold", B=5)
   cvm <- cvrisk(fit_mboost_cvrisk, folds = cv_folds, papply = lapply)
   fit_mboost = glmboost(fmla_mboost,data=dat, family=CoxPH(), control=boost_control(mstop=mstop(cvm)))
   runtime6 <- c(runtime6, proc.time()-old_time6)

   beta_mboost_idx2 = coef(fit_mboost)
   beta_mboost2= numeric(p)
   temp <- paste("V", 1:p, sep="")
   for(i in 1:p){
     if(!is.na(beta_mboost_idx2[temp[i]] )){
       beta_mboost2[i] = beta_mboost_idx2[temp[i]]
     }
   }

   # mboost - run with 500 iterations
   old_time7 = proc.time()
   fit_mboost2 = glmboost(fmla_mboost,data=dat, family=CoxPH(), control=boost_control(mstop=500)) 
   runtime7 <- c(runtime7, proc.time()-old_time7)
   beta_mboost_idx2_2 = coef(fit_mboost2)
   beta_mboost2_2 = numeric(p)
   temp <- paste("V", 1:p, sep="")
   for(i in 1:p){
     if(!is.na(beta_mboost_idx2_2[temp[i]] )){
       beta_mboost2_2[i] = beta_mboost_idx2_2[temp[i]]
     }
   }

  # calculate MSE
  MSE1 <- sum((TrueBeta-test1$beta)^2)
  MSE2 <- sum((TrueBeta-test2$beta)^2)
  MSE3 <- sum((TrueBeta-test3$beta)^2)
  MSE4 <- sum((TrueBeta-test4$beta)^2)
  MSE5 <- sum((TrueBeta-test5$beta)^2)
  MSE4 <- sum((TrueBeta-test4$beta)^2)
  MSE6 <- sum((TrueBeta-beta_mboost2)^2)
  MSE7 <- sum((TrueBeta-beta_mboost2_2)^2)
  
  # FP, FN, Se, Sp, FDR, # selected
  row1 <- rbind(row1, c(FPFNSeSpLik_rcpp(TrueBeta,test1$beta), MSE1, test1$mstop))
  row2 <- rbind(row2, c(FPFNSeSpLik_rcpp(TrueBeta,test2$beta), MSE2, test2$mstop))
  row3 <- rbind(row3, c(FPFNSeSpLik_rcpp(TrueBeta,test3$beta), MSE3, test3$mstop))
  row4 <- rbind(row4, c(FPFNSeSpLik_rcpp(TrueBeta,test4$beta), MSE4, test4$mstop))
  row5 <- rbind(row5, c(FPFNSeSpLik_rcpp(TrueBeta,test5$beta), MSE5, test5$mstop))
  row7 <- rbind(row7, c(FPFNSeSpLik_rcpp(TrueBeta,beta_mboost2_2), MSE7, 500))
  row6 <- rbind(row6, c(FPFNSeSpLik_rcpp(TrueBeta,beta_mboost2), MSE6, mstop(cvm)))
  
}


df_comparison_avg_sim1 <- rbind(colMeans(row1), colMeans(row2), colMeans(row3), colMeans(row4),colMeans(row5), colMeans(row7), colMeans(row6))
colnames(df_comparison_avg_sim1) <- c("FP", "FN", "Se", "Sp", "FDR", "# selected", "MSE", "# iterations")
rownames(df_comparison_avg_sim1) <- c("survBoost (fixed)","survBoost (cv)","survBoost (# selected)", "survBoost (likelihood)", "survBoost (EBIC)", "mboost (fixed)", "mboost (CV)")
df_comparison_avg_sim1 <- df_comparison_avg_sim1[,c(6,1,2,3,4,5,7,8)]


runtime1_elapsed <- c(runtime1[4], runtime1[9], runtime1[14], runtime1[19], runtime1[24], runtime1[29],runtime1[34],runtime1[39],runtime1[44],runtime1[49])
mean(runtime1_elapsed)
runtime2_elapsed <- c(runtime2[4], runtime2[9], runtime2[14], runtime2[19], runtime2[24], runtime2[29],runtime2[34],runtime2[39],runtime2[44],runtime2[49])
mean(runtime2_elapsed)
runtime3_elapsed <- c(runtime3[4], runtime3[9], runtime3[14], runtime3[19], runtime3[24], runtime3[29],runtime3[34],runtime3[39],runtime3[44],runtime3[49])
mean(runtime3_elapsed)
runtime4_elapsed <- c(runtime4[4], runtime4[9], runtime4[14], runtime4[19], runtime4[24], runtime4[29],runtime4[34],runtime4[39],runtime4[44],runtime4[49])
mean(runtime4_elapsed)
runtime5_elapsed <- c(runtime5[4], runtime5[9], runtime5[14], runtime5[19], runtime5[24], runtime5[29],runtime5[34],runtime5[39],runtime5[44],runtime5[49])
mean(runtime5_elapsed)
runtime6_elapsed <- c(runtime6[4], runtime6[9], runtime6[14], runtime6[19], runtime6[24], runtime6[29],runtime6[34],runtime6[39],runtime6[44],runtime6[49])
mean(runtime6_elapsed)
runtime7_elapsed <- c(runtime7[4], runtime7[9], runtime7[14], runtime7[19], runtime7[24], runtime7[29],runtime7[34],runtime7[39],runtime7[44],runtime7[49])
mean(runtime7_elapsed)

runtime_avgs <- c(mean(runtime1_elapsed),mean(runtime2_elapsed),mean(runtime3_elapsed),mean(runtime4_elapsed),mean(runtime5_elapsed),mean(runtime7_elapsed),mean(runtime6_elapsed))
df_comparison_avg_sim1 <- cbind(df_comparison_avg_sim1 , runtime_avgs)

# ADD SD FOR ALL COLUMNS TO TABLE
sd1 <- apply(row1, 2, sd)
sd2 <- apply(row2, 2, sd)
sd3 <- apply(row3, 2, sd)
sd4 <- apply(row4, 2, sd)
sd5 <- apply(row5, 2, sd)
sd6 <- apply(row6, 2, sd)
sd7 <- apply(row7, 2, sd)

sd(runtime1_elapsed)
sd(runtime2_elapsed)
sd(runtime3_elapsed)
sd(runtime4_elapsed)
sd(runtime5_elapsed)
sd(runtime6_elapsed)
sd(runtime7_elapsed)

library(xtable)
#xtable(df_comparison_avg_sim1)
xtable(df_comparison_avg_sim1,digits=0)

##########################
## TABLE 2 - unstratified 

p=1000 # unstratified
p_true=100
adj_var=0
TrueBeta <- rep(0, p)
TrueBeta_index=sample((1+adj_var):p,p_true,replace=FALSE) # start random p_true values after the adjustment variables
signbeta <- sample(c(-1,1),p_true,replace=T)
mag = 2
TrueBeta[TrueBeta_index] =mag*signbeta

row1 <- NULL
row2 <- NULL
row3 <- NULL
row4 <- NULL
row5 <- NULL
row6 <- NULL
row7 <- NULL

runtime1 = 0
runtime2 = 0
runtime3 = 0
runtime4 = 0
runtime5 = 0
runtime6 = 0
runtime7 = 0

num_it2 = 0
num_it3 = 0
num_it4 = 0
num_it5 = 0
num_it6 = 0
num_it7 = 0

for(j in 1:10){
  
  set.seed(123+j)
  
  data_sim3 <- simulate_survival_cox(true_beta=TrueBeta,
                                     base_hazard="weibull",
                                     base_hazard_scale = 2,
                                     base_hazard_shape = 3,
                                     num_facility=1,
                                     input_facility_size=1000, cov_structure="ar",
                                     block_size=1000, rho=0.6, censor_dist="unif",
                                     censor_const=1.1, tau=Inf, normalized=F)
  mean(data_sim3$time) # average survival time
  mean(data_sim3$delta) # percent with event
  length(data_sim3$delta[which(data_sim3$delta==0)])/length(data_sim3$delta) # censoring rate
  z <- as.matrix(data_sim3[,-c(1,2,3)])
  time = data_sim3$time
  delta=data_sim3$delta
  facility_idx=data_sim3$facility_idx
  N = length(time)
  df <- data.frame(data_sim3$time, data_sim3$delta,  data_sim3$facility_idx, z)
  
  colnames(df)[1:3] <- c("time", "delta", "facility_idx")
  
  temp <- paste(colnames(z),collapse="+")
  fmla_mboost <- as.formula(paste("Surv(time,delta) ~ ", paste(temp, collapse= "+")))
  fmla <- as.formula(paste("Surv(time,delta) ~ ", paste(temp, collapse= "+"))) 
  
  # fixed number of iterations
  old_time1 = proc.time()
  test1 <- boosting_core(fmla, data=df, rate=0.1, control=500) 
  runtime1 <- c(runtime1, proc.time()-old_time1)
  
  # test cv version
  old_time2 = proc.time()
  mstop_cv <- cross_validation_func_update(K=5, time, delta, z, facility_idx, rate=0.1)$mstop
  test2 <- boosting_core(fmla, data=df, rate=0.1, control=mstop_cv)
  runtime2 <- c(runtime2, proc.time()-old_time2)
  
  # test num_selected method
  old_time3 = proc.time()
  test3 <-  boosting_core(fmla, data=df, rate=0.1, control_parameter=100 , control_method="num_selected")
  runtime3 <- c(runtime3, proc.time()-old_time3)
  
  # test likelihood method 
  old_time4 = proc.time()
  test4 <- boosting_core(fmla, data=df, rate=0.1, control_parameter=.001 , control_method="likelihood")
  runtime4 <- c(runtime4, proc.time()-old_time4)
  
  # test bic method
  old_time5 = proc.time()
  mstop_bic <- boosting_core(fmla, data=df, rate=0.1, control_parameter=FALSE, control_method="BIC")$mstop
  test5 <- boosting_core(fmla, data=df, rate=0.1, control=mstop_bic)
  runtime5 <- c(runtime5, proc.time()-old_time5)
  
  # test 6 mboost comparison
  dat <- data.frame(cbind(z,time,delta))
  old_time6 = proc.time()
  fit_mboost_cvrisk = glmboost(fmla_mboost,data=dat, family=CoxPH(), control=boost_control(mstop=5000))
  cv_folds <- cv(model.weights(fit_mboost_cvrisk), type = "kfold", B=5)
  cvm <- cvrisk(fit_mboost_cvrisk, folds = cv_folds, papply = lapply)
  fit_mboost = glmboost(fmla_mboost,data=dat, family=CoxPH(), control=boost_control(mstop=mstop(cvm)))
  runtime6 <- c(runtime6, proc.time()-old_time6)
  beta_mboost_idx2 = coef(fit_mboost)
  beta_mboost2= numeric(p)
  temp <- paste("V", 1:p, sep="")
  for(i in 1:p){
    if(!is.na(beta_mboost_idx2[temp[i]] )){
      beta_mboost2[i] = beta_mboost_idx2[temp[i]]
    }
  }
  
  # mboost - run with 500 iterations
  old_time7 = proc.time()
  fit_mboost2 = glmboost(fmla_mboost,data=dat, family=CoxPH(), control=boost_control(mstop=500)) 
  runtime7 <- c(runtime7, proc.time()-old_time7)
  beta_mboost_idx2_2 = coef(fit_mboost2)
  beta_mboost2_2 = numeric(p)
  temp <- paste("V", 1:p, sep="")
  for(i in 1:p){
    if(!is.na(beta_mboost_idx2_2[temp[i]] )){
      beta_mboost2_2[i] = beta_mboost_idx2_2[temp[i]]
    }
  }
  
  # calculate MSE
  MSE1 <- sum((TrueBeta-test1$beta)^2)
  MSE2 <- sum((TrueBeta-test2$beta)^2)
  MSE3 <- sum((TrueBeta-test3$beta)^2)
  MSE4 <- sum((TrueBeta-test4$beta)^2)
  MSE5 <- sum((TrueBeta-test5$beta)^2)
  MSE4 <- sum((TrueBeta-test4$beta)^2)
  MSE6 <- sum((TrueBeta-beta_mboost2)^2)
  MSE7 <- sum((TrueBeta-beta_mboost2_2)^2)
  
  # FP, FN, Se, Sp, FDR, # selected
  row1 <- rbind(row1, c(FPFNSeSpLik_rcpp(TrueBeta,test1$beta), MSE1, test1$mstop))
  row2 <- rbind(row2, c(FPFNSeSpLik_rcpp(TrueBeta,test2$beta), MSE2, test2$mstop))
  row3 <- rbind(row3, c(FPFNSeSpLik_rcpp(TrueBeta,test3$beta), MSE3, test3$mstop))
  row4 <- rbind(row4, c(FPFNSeSpLik_rcpp(TrueBeta,test4$beta), MSE4, test4$mstop))
  row5 <- rbind(row5, c(FPFNSeSpLik_rcpp(TrueBeta,test5$beta), MSE5, test5$mstop))
  row7 <- rbind(row7, c(FPFNSeSpLik_rcpp(TrueBeta,beta_mboost2_2), MSE7, 500))
  row6 <- rbind(row6, c(FPFNSeSpLik_rcpp(TrueBeta,beta_mboost2), MSE6, mstop(cvm)))
  
}


df_comparison_avg_sim1 <- rbind(colMeans(row1), colMeans(row2), colMeans(row3), colMeans(row4),colMeans(row5), colMeans(row7), colMeans(row6))

colnames(df_comparison_avg_sim1) <- c("FP", "FN", "Se", "Sp", "FDR", "# selected", "MSE", "# iterations")
rownames(df_comparison_avg_sim1) <- c("survBoost (fixed)","survBoost (cv)","survBoost (# selected)", "survBoost (likelihood)", "survBoost (EBIC)", "mboost (fixed)", "mboost (CV)")
df_comparison_avg_sim1 <- df_comparison_avg_sim1[,c(6,1,2,3,4,5,7,8)]


runtime1_elapsed <- c(runtime1[4], runtime1[9], runtime1[14], runtime1[19], runtime1[24], runtime1[29],runtime1[34],runtime1[39],runtime1[44],runtime1[49])
mean(runtime1_elapsed)
runtime2_elapsed <- c(runtime2[4], runtime2[9], runtime2[14], runtime2[19], runtime2[24], runtime2[29],runtime2[34],runtime2[39],runtime2[44],runtime2[49])
mean(runtime2_elapsed)
runtime3_elapsed <- c(runtime3[4], runtime3[9], runtime3[14], runtime3[19], runtime3[24], runtime3[29],runtime3[34],runtime3[39],runtime3[44],runtime3[49])
mean(runtime3_elapsed)
runtime4_elapsed <- c(runtime4[4], runtime4[9], runtime4[14], runtime4[19], runtime4[24], runtime4[29],runtime4[34],runtime4[39],runtime4[44],runtime4[49])
mean(runtime4_elapsed)
runtime5_elapsed <- c(runtime5[4], runtime5[9], runtime5[14], runtime5[19], runtime5[24], runtime5[29],runtime5[34],runtime5[39],runtime5[44],runtime5[49])
mean(runtime5_elapsed)
runtime6_elapsed <- c(runtime6[4], runtime6[9], runtime6[14], runtime6[19], runtime6[24], runtime6[29],runtime6[34],runtime6[39],runtime6[44],runtime6[49])
mean(runtime6_elapsed)
runtime7_elapsed <- c(runtime7[4], runtime7[9], runtime7[14], runtime7[19], runtime7[24], runtime7[29],runtime7[34],runtime7[39],runtime7[44],runtime7[49])
mean(runtime7_elapsed)

runtime_avgs <- c(mean(runtime1_elapsed),mean(runtime2_elapsed),mean(runtime3_elapsed),mean(runtime4_elapsed), mean(runtime5_elapsed),mean(runtime7_elapsed),mean(runtime6_elapsed))
df_comparison_avg_sim1 <- cbind(df_comparison_avg_sim1 , runtime_avgs)

# ADD SD FOR ALL COLUMNS TO TABLE
sd1 <- apply(row1, 2, sd)
sd2 <- apply(row2, 2, sd)
sd3 <- apply(row3, 2, sd)
sd4 <- apply(row4, 2, sd)
sd5 <- apply(row5, 2, sd)
sd6 <- apply(row6, 2, sd)
sd7 <- apply(row7, 2, sd)

sd(runtime1_elapsed)
sd(runtime2_elapsed)
sd(runtime3_elapsed)
sd(runtime4_elapsed)
sd(runtime5_elapsed)
sd(runtime6_elapsed)
sd(runtime7_elapsed)

library(xtable)
# xtable(df_comparison_avg_sim1)
xtable(df_comparison_avg_sim1,digits=0)


