#' ---
#' title: "Discrete Simulation"
#' author: "Allyson Mateja and Megan Grieco"
#' date: "`r Sys.Date()`"
#' output: word_document
#' ---
#' 
## ----setup, include=FALSE------------------------------------------------------------------------------
#knitr::opts_chunk$set(echo = FALSE, message=F)
library(dplyr)
library(tidyr)
library(data.table)
library(abind)
library(ggplot2)

#' 
#' 
#' ## Simple Mixture of Discrete  Distribution
#' 
#' There are three types of individuals:
#' 
#' * always fail: have event at $t=1$
#' * effected: if treated have event at $t=4$, otherwise have event at $t=2$.
#' * never fail: have event  after $t=6$ (study end)
#' 
#' Let the proportions in the three populations be $p_A$, $p_E$, and $p_N$, respectively.
#' We have $p_A+p_E+p_N=1$, so we only need to define $p_A$ and $p_E$. 
#' 
#' There is independent censoring: where $\pi_C$ proportion of the study population
#' are censored at $t=3$ and the rest are censored at the study end, $t=6$. 
#' 
#' We test for a difference between survival distributions at $t=5$. This is a 
#' test of the type I error rate. 
#' 
#' The true survival curves are equal after $t=4$. Simulated data will have n=300 in each arm, $p_A=.5$ and $\pi_c=0.9$. We will vary $p_E$ from 0 to 0.4 for a total of 41 scenarios, each with 10,000 reps.
#' 
#' ## Simulation Results
#' 
#' 
## ------------------------------------------------------------------------------------------------------
sim_files <- list.files("C:/Users/matejaam/OneDrive - National Institutes of Health/_HDrive/Projects/bpcp/simulations/discrete/sim_discrete", pattern=".rds", full.names = T) %>% purrr::map(readRDS)
sim_files_diff <- list.files("C:/Users/matejaam/OneDrive - National Institutes of Health/_HDrive/Projects/bpcp/simulations/discrete/sim_discrete_diff", pattern=".rds", full.names = T) %>% purrr::map(readRDS)

#' 
## ------------------------------------------------------------------------------------------------------
summarizeCI<-function(ci,nullBeta=0){
  N<- nrow(ci)
  N.lo<- sum(!is.na(ci[,1]))
  N.hi<- sum(!is.na(ci[,2]))
  prop.Res.lo<- N.lo/N
  prop.Res.hi<- N.hi/N
  reject.lo<- sum(!is.na(ci[,1]) & ci[,1]>nullBeta)/N.lo
  reject.hi<- sum(!is.na(ci[,1]) & ci[,2]<nullBeta)/N.hi
  coverage.lo <- sum(!is.na(ci[,1]) & ci[,1]<nullBeta)/N.lo
  coverage.hi <- sum(!is.na(ci[,2]) & ci[,2]>nullBeta)/N.lo
  rejectTotal.lo<- sum(!is.na(ci[,1]) & ci[,1]>nullBeta)/N
  rejectTotal.hi<- sum(!is.na(ci[,1]) & ci[,2]<nullBeta)/N
  out<-c(N=N,N.lo=N.lo,N.hi=N.hi,prop.Res.lo=prop.Res.lo,
         prop.Res.hi=prop.Res.hi,reject.lo=reject.lo,
         reject.hi=reject.hi,
         coverage.lo, coverage.hi,
         rejectTotal.lo=rejectTotal.lo,
         rejectTotal.hi=rejectTotal.hi)
  out
}

#' 
#' 
## ------------------------------------------------------------------------------------------------------
pE <- seq(0, 0.4, 0.01)

#' 
#' 
## ------------------------------------------------------------------------------------------------------
combined <- abind(sim_files, along=1)
combined_diff <- abind(sim_files_diff, along=1)

#' 
#' 
## ------------------------------------------------------------------------------------------------------
out<- array(NA,c(4,11,length(pE)),
            dimnames=list(c("meld","meld.midp","delta.zo","delta.adjhy"),
                          c("N","N.lo","N.hi","prop.Res.lo","prop.Res.hi","reject.lo","reject.hi", "coverage.lo", "coverage.hi","rejectTotal.lo","rejectTotal.hi"),
                          c(paste0("Scenario ",1:length(pE))))
)

for (i in 1:length(pE)) {
  
  CImat <- combined[,,i]
  sout<-matrix(NA,ncol(combined)/2,11)

  s1<-summarizeCI(CImat[,1:2])
  sout[1,]<- s1 
  sout[2,]<- summarizeCI(CImat[,3:4])
  sout[3,]<- summarizeCI(CImat[,5:6])
  sout[4,]<- summarizeCI(CImat[,7:8])
    
  dimnames(sout) <- list(c("meld","meld.midp","delta.zo","delta.adjhy"),
      names(s1))
  
  out[,,i]<- sout
  
}

#out[,c(1,6,7),]

#' 
#' 
#' 
## ------------------------------------------------------------------------------------------------------
# make 3D output into data frame
out_df <- as.data.frame.table(out[,c(1,6:9),])
names(out_df) <- c("method","calc","Scenario","res")

# calculated expected # at risk - n*(1-pA-pE)*(1-pic)
scenario_calcs <- data.frame(
  pE=pE,
  pA=0.5,
  n=300,
  pi.c=0.9,
  Scenario=paste0("Scenario ",seq(1:length(pE)))
) %>% mutate(n.risk=n*(1-pA-pE)*(1-pi.c))

# combine with full results data
out_df_2 <- out_df %>% left_join(scenario_calcs)

out_df_2$method <- factor(out_df_2$method, levels = c("meld", "meld.midp", "delta.zo", "delta.adjhy"), 
                          labels = c("Melding", "Melding (mid-p)", "Delta (Greenwood)", "Delta (Borkowf)"))

#' 
#' The following plots show the simulation results for each method and by lower or upper-tailed tests. A red dashed line is drawn at 0.025 to correspond to a two-sided 95% confidence interval. All melded Type I error rates remained below 2.5%.
#' 
#' The first plot shows data by pE. The second plot shows data by the expected number at risk in each arm just before the test time, $n*(1-p_A-p_E)*(1-\pi_c)$. As to be expected, $p_E$ and expected number of risk have an inverse relationship in terms of Type I error rate. 
#' 
## ----fig.height=6--------------------------------------------------------------------------------------
# Option 1: plot by pE (facet by method and low/high)
ggplot(out_df_2 %>% filter(calc!="N" & method !="delta.none" & !(grepl("coverage", calc))) %>% mutate(calc=ifelse(calc=="reject.lo","Lower","Upper")), aes(x=pE, y=res)) + geom_hline(yintercept=0.025, linetype="dashed",color="red") + geom_point() + facet_grid(method~calc, ) + theme_bw() + theme(strip.background = element_blank()) + labs(y="Type I Error Rate", x="Proportion of Effected Individuals")
ggsave("C:/Users/matejaam/OneDrive - National Institutes of Health/_HDrive/Projects/bpcp/simulations/discrete/pE_plot.pdf", height=6, width=5)

#' 
## ----fig.height=6--------------------------------------------------------------------------------------
# Option 2: plot by number at risk (facet by method and low/high)
ggplot(out_df_2 %>% filter(calc!="N" & method !="delta.none" & !(grepl("coverage", calc))) %>% mutate(calc=ifelse(calc=="reject.lo","Lower","Upper")), aes(x=n.risk, y=res)) + geom_hline(yintercept=0.025, linetype="dashed",color="red") + geom_point() + facet_grid(method~calc) + theme_bw() + theme(strip.background = element_blank()) + labs(y="Type I Error Rate", x="Expected Number at Risk")
ggsave("C:/Users/matejaam/OneDrive - National Institutes of Health/_HDrive/Projects/bpcp/simulations/discrete/num_risk_plot.pdf", height=6, width=5)

#' 
#' 
## ----fig.height=6--------------------------------------------------------------------------------------
# Option 2: plot by number at risk (facet by method and low/high)
ggplot(out_df_2 %>% filter(calc!="N" & method !="delta.none" & !(grepl("reject", calc))) %>% mutate(calc=ifelse(calc=="coverage.lo","Lower","Upper")), aes(x=n.risk, y=res)) + 
  geom_hline(yintercept=0.975, linetype="dashed",color="red") + 
  geom_point() + 
  facet_grid(method~calc) + 
  theme_bw() + theme(strip.background = element_blank()) + 
  labs(y="Confidence Interval Coverage", x="Expected Number at Risk")
ggsave("C:/Users/matejaam/OneDrive - National Institutes of Health/_HDrive/Projects/bpcp/simulations/discrete/num_risk_plot_coverage.pdf", height=6, width=5)

#' 
#' 
#' 
## ------------------------------------------------------------------------------------------------------
out_diff <- array(NA,c(4,11,length(pE)),
            dimnames=list(c("meld","meld.midp","delta.zo","delta.adjhy"),
                          c("N","N.lo","N.hi","prop.Res.lo","prop.Res.hi","reject.lo","reject.hi", "coverage.lo", "coverage.hi","rejectTotal.lo","rejectTotal.hi"),
                          c(paste0("Scenario ",1:length(pE))))
)

for (i in 1:length(pE)) {
  
  CImat <- combined_diff[,,i]
  sout<-matrix(NA,ncol(combined_diff)/2,11)

  s1<-summarizeCI(CImat[,1:2])
  sout[1,]<- s1 
  sout[2,]<- summarizeCI(CImat[,3:4])
  sout[3,]<- summarizeCI(CImat[,5:6])
  sout[4,]<- summarizeCI(CImat[,7:8])
    
  dimnames(sout) <- list(c("meld","meld.midp","delta.zo","delta.adjhy"),
      names(s1))
  
  out_diff[,,i]<- sout
  
}

#out[,c(1,6,7),]

#' 
#' 
#' 
## ------------------------------------------------------------------------------------------------------
# make 3D output into data frame
out_diff_df <- as.data.frame.table(out_diff[,c(1,6:9),])
names(out_diff_df) <- c("method","calc","Scenario","res")

# combine with full results data
out_diff_df_2 <- out_diff_df %>% left_join(scenario_calcs)

out_diff_df_2$method <- factor(out_diff_df_2$method, levels = c("meld", "meld.midp", "delta.zo", "delta.adjhy"), 
                          labels = c("Melding", "Melding (mid-p)", "Delta (Greenwood)", "Delta (Borkowf)"))

#' 
## ----fig.height=6--------------------------------------------------------------------------------------
# Option 2: plot by number at risk (facet by method and low/high)
ggplot(out_diff_df_2 %>% filter(calc!="N" & method !="delta.none" & !(grepl("coverage", calc))) %>% mutate(calc=ifelse(calc=="reject.lo","Lower","Upper")), aes(x=n.risk, y=res)) + geom_hline(yintercept=0.025, linetype="dashed",color="red") + geom_point() + facet_grid(method~calc) + theme_bw() + theme(strip.background = element_blank()) + labs(y="Type I Error Rate", x="Expected Number at Risk")
ggsave("C:/Users/matejaam/OneDrive - National Institutes of Health/_HDrive/Projects/bpcp/simulations/discrete/num_risk_diff_plot.pdf", height=6, width=5)

#' 
#' 
## ----fig.height=6--------------------------------------------------------------------------------------
# Option 2: plot by number at risk (facet by method and low/high)
ggplot(out_diff_df_2 %>% filter(calc!="N" & method !="delta.none" & !(grepl("reject", calc))) %>% mutate(calc=ifelse(calc=="coverage.lo","Lower","Upper")), aes(x=n.risk, y=res)) + 
  geom_hline(yintercept=0.975, linetype="dashed",color="red") + 
  geom_point() + 
  facet_grid(method~calc) + 
  theme_bw() + theme(strip.background = element_blank()) + 
  labs(y="Confidence Interval Coverage", x="Expected Number at Risk")
ggsave("C:/Users/matejaam/OneDrive - National Institutes of Health/_HDrive/Projects/bpcp/simulations/discrete/num_risk_diff_plot_coverage.pdf", height=6, width=5)

#' 
#' 
#' 
## ------------------------------------------------------------------------------------------------------
# adding on KM curve with risk table to have all figures in one place

createData<-function(n,p,pi.c,prop.treated=0.5){
  pA<-p[1]
  pE<-p[2]
  pN<- 1-pA-pE
  if (pA>1 | pA<0 | pE>1 | pE<0 | pN>1 | pN<0) stop("incorrect p")
  cnt<- rmultinom(1,n,prob=c(pA,pE,pN))
  x<-rep(c(1,2,7),times=cnt)
  x<- sample(x,replace=FALSE)
  n.treated<- round(n*prop.treated)
  n.cntl<- n-n.treated
  group<- c(rep(1,n.cntl),rep(2,n.treated))
  # treatment changes t=2 to t=4 in effected population
  x[group==2 & x==2]<- 4
  cens<- rbinom(n,1,pi.c)
  cens[cens==1]<- 3
  cens[cens==0]<- 6
  y<- pmin(x,cens)
  status<-rep(0,n)
  status[y==x]<- 1
  #data.frame(group=group,x=x,cens=cens,y=y,status=status)
  # a list is faster to use in simulations
  list(group=group,x=x,cens=cens,y=y,status=status)
}




#' 
## ----fig.width=7, fig.height=8-------------------------------------------------------------------------
# true model
pA = 0.1 # 10% fail at time 2
pE = 0.20 # 20% fail at time 4 if randomized to the treated arm but fail at time 2 if randomized to the control arm
pN = 1 - pA - pE # remaining have events after time = 6

# Time points
times<- seq(from=0,to=6, by=0.001)

# Survival probabilities
# Treated arm
S_treated <- rep(NA, length(times))

S_treated[which(times < 2)] <- 1
S_treated[which(times >= 2 & times < 4)] <- 1 - pA
S_treated[which(times >= 4)] <- 1 - pA - pE

# Control arm
S_control <- rep(NA, length(times))

S_control[which(times < 2)] <- 1
S_control[which(times >= 2)] <- 1 - pA - pE
                    

plot_df <- data.frame(
  Time=c(times,times),
  Arm=c(rep("1",length(times)),rep("2",length(times))),
  Survival=c(S_control,S_treated)
)

library(survival)
library(survminer)
# test out the simData function
set.seed(35)
d<-createData(600,c(pA,pE),.9)
table(y=d$y,status=d$status,group=d$group)
s<-survfit(Surv(y,status)~group,data=d)

#overlay the two plots
surv_p <- ggsurvplot(s,risk.table=TRUE, xlim=c(0,6),break.x.by=1, palette=c("navy","#F8766D"), size ="strata")

surv_p$plot <- surv_p$plot +
  geom_line(data=plot_df[which(plot_df$Arm=="1"),],aes(x=Time,y=Survival,linetype="Exponential Model"), color="#F8766D", inherit.aes = F,linewidth =1) +
  geom_line(data=plot_df[which(plot_df$Arm=="2"),],aes(x=Time,y=Survival),linetype="dashed", color="navy", inherit.aes = F,linewidth = 0.7) + 
  theme_bw() +
  scale_color_manual(values=c("#F8766D","navy","#F8766D","navy"),breaks = c("group=1", "group=2")) +
  scale_size_manual(values = c("group=1" = 1.25, "group=2" = 1)) +
  scale_y_continuous(limits = c(0,1),breaks=seq(0,1,0.25)) +
  scale_linetype_manual(values=c("Exponential Model"="dashed")) + labs(linetype=" ") +
  guides(linetype = "none",
         size="none") + theme(legend.position = "bottom")
surv_p

ggsave("Discrete_w_Model.pdf",plot=surv_p)


pdf("Discrete_w_Model.pdf", onefile = TRUE)
print(surv_p)
dev.off()

