#
# Set up for the test
#
library(survival4)
library(date)
#library(ratetables)
options(na.action="na.omit", contrasts=c("contr.treatment","contr.poly"))
#
# Compute some answers "by hand"
#     For test1 and test2 data
byhand1 <- function(coef, newx) {
    s <- exp(coef)
    loglik <- 2*coef - (log(3*s+3) + 2*log(s+3))
    u <- (6 + 3*s - s^2) / ((s+1)*(s+3))
    imat <- s/(s+1)^2 + 6*s/(s+3)^2

    x <- c(1,1,1,0,0,0)
    status <- c(1,0,1,1,0,1)
    xbar <- c(s/(s+1), s/(s+3), 0, 0)
    haz <- c(1/(3*s+3), 2/(s+3), 0, 1 )
    ties <- c(1,1,2,2,3,4)
    wt <- c(s,s,s,1,1,1)
    mart <- c(1,0,1,1,0,1) -  wt* (cumsum(haz))[ties]

    score <- rep(6,0)
    for (i in 1:6) {
        j <- ties[i]
        score[i] <-  -wt[i]*(cumsum((x[i]-xbar) * haz))[j]
        score[i] <- score[i] + wt[i]* ((x[i]-xbar)*status[i])[j]
        }

    scho <- c(1/(s+1), (3-s)/(3+s), 0)

    surv  <- exp(-cumsum(haz)* exp(coef*newx))
    varhaz.g <- cumsum(c(1/(3*s+3)^2, 2/(s+3)^2, 0, 1 ))

    varhaz.d <- cumsum((newx-xbar) * haz)

    varhaz <- (varhaz.g + varhaz.d0^2/ imat) * exp(2*coef*newx)

    names(xbar) <- names(haz) <- 1:4
    names(surv) <- names(varhaz) <- 1:4
    list(loglik=loglik, u=u, imat=imat, xbar=xbar, haz=haz,
             mart=mart,  score=score,
                scho=scho, surv=surv, var=varhaz,
                varhaz.g=varhaz.g, varhaz.d=varhaz.d)
    }


byhand2 <- function(coef, newx) {
    s <- exp(coef)

    loglik <- 4*coef - log(s+1) - log(s+2) - 3*log(3*s+2) - 2*log(3*s+1)
    u <- 1/(s+1) +  1/(3*s+1) + 4/(3*s+2) -
                 ( s/(s+2) +3*s/(3*s+2) + 3*s/(3*s+1))
    imat <- s/(s+1)^2 + 2*s/(s+2)^2 + 6*s/(3*s+2)^2 +
            3*s/(3*s+1)^2 + 3*s/(3*s+1)^2 + 12*s/(3*s+2)^2

    hazard <-c( 1/(s+1), 1/(s+2), 1/(3*s+2), 1/(3*s+1), 1/(3*s+1), 2/(3*s+2) )
    xbar <- c(s/(s+1), s/(s+2), 3*s/(3*s+2), 3*s/(3*s+1), 3*s/(3*s+1),
                3*s/(3*s+2))

    var.g <- cumsum(hazard*hazard /c(1,1,1,1,1,2))
    var.d <- cumsum( (xbar-newx)*hazard)

    surv <- exp(-cumsum(hazard) * exp(coef*newx))
    varhaz <- (var.g + var.d^2/imat)* exp(2*coef*newx)

    list(loglik=loglik, u=u, imat=imat, hazard=hazard,
            xbar=xbar, surv=surv, varhaz=varhaz, var.g=var.g, var.d=var.d)
    }

byhand3 <- function(coef) {
    #Hard coded -- what is found in the Agreg.3 comments file
    s <- as.vector(exp(coef))  #kill the names attr

    imat <- s/(s+1)^2 + 2*s/(s+2)^2 + 6*s/(3*s+2)^2 +
            3*s/(3*s+1)^2 + 3*s/(3*s+1)^2 + 12*s/(3*s+2)^2

    hazard <-c( 1/(s+1), 1/(s+2), 1/(3*s+2), 1/(3*s+1), 1/(3*s+1), 2/(3*s+2) )
    xbar <- c(s/(s+1), s/(s+2), 3*s/(3*s+2), 3*s/(3*s+1), 3*s/(3*s+1),
                3*s/(3*s+2))


    newx <- c(0,0,1,1,1,0,0, 2,2,2,2)
    wt <- exp(coef*newx)
    indx <- c(1,2,4,5,6,1,2,3,4,5,6)

    var.g <- hazard*hazard /c(1,1,1,1,1,2)
    surv <- exp(-cumsum(hazard[indx]*wt))
    var1 <- cumsum(var.g[indx]*wt*wt)
    d    <- cumsum( (xbar[indx] - newx)* hazard[indx] * wt)
    var2 <- d^2/imat
    names(surv) <- names(var1) <-names(var2) <- NULL
    list(time= c(2,3,7,8,9,12,13,16,17,18,19),
         surv=surv, std= sqrt(var1 + var2), var.g=var1, var.d=var2, d=d,
                hazard=hazard, wt=wt)
    }

# Create the simplest test data set
#
test1 <- data.frame(time=  c(4, 3,1,1,2,2,3),
    status=c(1,NA,1,0,1,1,0),
    x=     c(0, 2,1,1,1,0,0))
#
#   Do the test on the simple data set
#
temp <- Surv(test1$time, test1$status)
fit <-coxph(temp~ test1$x, method='breslow')
fit
fit0 <-coxph(temp~ test1$x, iter=0)
fit0$coef
coxph(temp~ test1$x, iter=1, method='breslow')$coef
coxph(temp~ test1$x, iter=2, method='breslow')$coef
coxph(temp~ test1$x, iter=3, method='breslow')$coef

coxph(Surv(time, status) ~ x, test1, method='efron')
coxph(Surv(time, status) ~ x, test1, method='exact')

resid(fit0)
resid(coxph(temp~1))
resid(fit0, 'scor')
resid(fit0, 'scho')

resid(fit)
resid(fit, 'scor')
resid(fit, 'scho')

predict(fit, type='lp')
predict(fit, type='risk')
predict(fit, type='expected')
#predict(fit, type='terms')
predict(fit, type='lp', se.fit=T)
predict(fit, type='risk', se.fit=T)
predict(fit, type='expected', se.fit=T)
#predict(fit, type='terms', se.fit=T)

summary(survfit(fit))
summary(survfit(fit, list(test1=list(x=2))))  ### ouch
#
# Create the simple data for agreg
#

test2 <- data.frame(start=c(1, 2, 5, 2, 1, 7, 3, 4, 8, 8),
                    stop =c(2, 3, 6, 7, 8, 9, 9, 9,14,17),
                    event=c(1, 1, 1, 1, 1, 1, 1, 0, 0, 0),
                    x    =c(1, 0, 0, 1, 0, 1, 1, 1, 0, 0) )
#
#   Do the test on the simple agreg data set
#
temp <- Surv(test2$start, test2$stop, test2$event)
fit <-coxph(temp~ x, test2, method='breslow')
fit
fit0 <-coxph(temp~ x, test2, iter=0)
fit0$coef
coxph(temp~ x, test2, iter=1, method='breslow')$coef
coxph(temp~ x, test2, iter=2, method='breslow')$coef
coxph(temp~ x, test2, iter=3, method='breslow')$coef

coxph(temp ~ test2$x, method='efron')
coxph(temp ~ test2$x, method='exact')

resid(fit0)
resid(fit0, 'scor')
resid(fit0, 'scho')

resid(fit)
resid(fit, 'scor')
resid(fit, 'scho')

resid(coxph(temp~ x, test2, iter=0, init=log(2)), 'score')

sfit <-survfit(fit)
sfit
summary(sfit)

temp <- Surv(rep(test2$start,2), rep(test2$stop,2), rep(test2$event,2))
ss <- rep(0:1, rep(length(test2$x),2))
options(contrasts=c("contr.treatment", "contr.treatment"))
fitx <- coxph(temp ~ c(test2$x, test2$x^2)*strata(ss), method='breslow')
sfit <- survfit(fitx, c(fitx$means[1], 0) )
sfit
summary(sfit)
#
# Even though everyone in strata 1 has x2==0, I won't get the same survival
#  curve above if survfit is called without forcing predicted x2 to be
#  zero-- otherwise I am asking for a different baseline than the
#  simple model did.  In this particular case coef[2] is nearly zero, so
#  the curves are the same, but the variances differ.
#

# This mimics 'byhand3' and the documentation
fit <- coxph(Surv(start, stop, event) ~x, test2, method='breslow')
tdata <- data.frame( start=c(0,20, 6,0,5),
                     stop =c(5,23,10,5,15),
                     event=rep(0,5),
                     x=c(0,0,1,0,2) )

temp <- survfit(fit, tdata, individual=T)
temp2 <- byhand3(fit$coef)
all.equal(temp$surv, temp2$surv)
all.equal(temp2$std, temp$std.err)

temp2
#
# The AML data, from Miller, "Survival Analysis", page 49.
#
aml <- data.frame(time=   c( 9, 13, 13, 18, 23, 28, 31, 34, 45, 48, 161,
                           5,  5,  8,  8, 12, 16, 23, 27, 30, 33, 43, 45),
            status= c( 1,1,0,1,1,0,1,1,0,1,0, 1,1,1,1,1,0,1,1,1,1,1,1),
            x     = as.factor(c(rep("Maintained", 11),
                                rep("Nonmaintained", 12) )))

#
# These results can be found in Miller
#
fit <- coxph(Surv(aml$time, aml$status) ~ aml$x, method='breslow')
fit
resid(fit, type='mart')
resid(fit, type='score')
resid(fit, type='scho')

fit <- survfit(Surv(aml$time, aml$status) ~ aml$x)
fit
summary(fit)
survdiff(Surv(aml$time, aml$status)~ aml$x)

#
# Test out the weighted K-M
#
#  First, equal case weights- shouldn't change the survival, but will
#    halve the variance
temp2 <-survfit(Surv(aml$time, aml$status), type='kaplan', weight=rep(2,23))
temp  <-survfit(Surv(time, status)~1, aml)
temp$surv/temp2$surv
(temp$std.err/temp2$std.err)^2

# Risk weights-- use a null Cox model
summary(survfit(coxph(Surv(aml$time, aml$status) ~ offset(log(1:23)))))

# Lots of ties, so its a good test case
x1 <- coxph(Surv(time, status)~x, aml, method='efron')
x1
x2 <- coxph(Surv(rep(0,23),time, status) ~x, aml, method='efron')
x1$coef - x2$coef

#
# Trivial test of stratified residuals
#   Make a second strata = replicate of the first, and I should get the
#   exact same answers

temp <- as.matrix(test1)
dimnames(temp)[[1]]<-as.character(1:7)
n    <- nrow(temp)
ndead<- sum(test1$status[!is.na(test1$status)])
temp <- data.frame(rbind(temp, temp)) #later releases of S have rbind.data.frame
names(temp)<-names(test1)
tstrat <- rep(1:2, c(n,n))

fit1 <- coxph(Surv(time, status) ~x, test1)
fit2 <- coxph(Surv(time, status) ~x + strata(tstrat), temp)

all.equal(resid(fit1) , (resid(fit2))[1:n])
all.equal(resid(fit1, type='score') , (resid(fit2, type='score'))[1:n])
all.equal(resid(fit1, type='schoe') , (resid(fit2, type='schoe'))[1:ndead])


#AG model
temp <- as.matrix(test2)
dimnames(temp)[[1]]<-as.character(1:(dim(temp)[1]))
n    <- nrow(temp)
ndead<- sum(test2$event[!is.na(test2$event)])
temp <- data.frame(rbind(temp, temp))
names(temp)<-names(test2)
tstrat <- rep(1:2, c(n,n))

fit1 <- coxph(Surv(start, stop, event) ~x, test2)
fit2 <- coxph(Surv(start, stop, event) ~x + strata(tstrat), temp)

all.equal(resid(fit1) , (resid(fit2))[1:n])
all.equal(resid(fit1, type='score') , (resid(fit2, type='score'))[1:n])
all.equal(resid(fit1, type='schoe') , (resid(fit2, type='schoe'))[1:ndead])
#
# A test to exercise the "infinity" check on 2 variables
#
test3 _ data.frame(futime=1:12, fustat=c(1,0,1,0,1,0,0,0,0,0,0,0),
                   x1=rep(0:1,6), x2=c(rep(0,6), rep(1,6)))
coxph(Surv(futime, fustat) ~ x1 + x2, test3)
# Create a "counting process" version of the simplest test data set
#
test1b<- data.frame(start= c(0, 3,  0,  0, 5,  0, 6,14,  0,  0, 10,20,30, 0),
              stop = c(3,10, 10,  5,20,  6,14,20, 30,  10,20,30,40, 10),
              status=c(0, 1,  0,  0, 1,  0, 0, 1,  0,   0, 0, 0, 1,  0),
              x=     c(1, 1,  1,  1, 1,  0, 0, 0,  0,   0, 0, 0, 0,  NA),
              id =   c(3, 3,  4,  5, 5,  6, 6, 6,  7,   1, 1, 1, 1,   2))
#
#  Check out the various residuals under an Efron approximation
#
fit0 <- coxph(Surv(time, status)~ x, test1, iter=0)
fit  <- coxph(Surv(time, status) ~x, test1)
fit0b <- coxph(Surv(start, stop, status) ~ x, test1b, iter=0)
fitb  <- coxph(Surv(start, stop, status) ~x, test1b)
fitc  <- coxph(Surv(time, status) ~ offset(fit$coef*x), test1)
fitd  <- coxph(Surv(start, stop, status) ~ offset(fit$coef*x), test1b)

resid(fit0)
resid(fit0b, collapse=test1b$id)
resid(fit)
resid(fitb, collapse=test1b$id)
resid(fitc)
resid(fitd, collapse=test1b$id) 
residuals.coxph(fitd, collapse=test1b$id)

resid(fit0, type='score')
resid(fit0b, type='score', collapse=test1b$id)
resid(fit, type='score')
resid(fitb, type='score', collapse=test1b$id)

resid(fit0, type='scho')
resid(fit0b, type='scho', collapse=test1b$id)
resid(fit, type='scho')
resid(fitb, type='scho', collapse=test1b$id)

summary(survfit(fit0, list(x=0))) 
summary(survfit(fit, list(x=0)))
summary(survfit(fitb,list(x=0)))
summary(survfit(fit))
# Tests of the weighted Cox model
#
# Similar data set to test1, but add weights,
#                                    a double-death/censor tied time
#                                    a censored last subject
# The latter two are cases covered only feebly elsewhere.
testw1 <- data.frame(time=  c(1,1,2,2,2,2,3,4,5),
                    status= c(1,0,1,1,1,0,0,1,0),
                    x=      c(2,0,1,1,0,1,0,1,0),
                    wt =    c(1,2,3,4,3,2,1,2,1))

fit0 <- coxph(Surv(time, status) ~x, testw1, weights=wt,
                    method='breslow', iter=0)
fit  <- coxph(Surv(time, status) ~x, testw1, weights=wt, method='breslow')
fit0
summary(fit)
resid(fit0, type='mart')
resid(fit0, type='score')
resid(fit0, type='scho')
fit0 <- coxph(Surv(time, status) ~x,testw1, weights=wt, iter=0)
resid(fit0, 'mart')
resid(coxph(Surv(time, status) ~1, testw1, weights=wt))  #Null model


resid(fit, type='mart')
resid(fit, type='score')
resid(fit, type='scho')

fit  <- coxph(Surv(time, status) ~x, testw1, weights=wt, method='efron')
fit
resid(fit, type='mart')
resid(fit, type='score')
resid(fit, type='scho')
# Tests of the weighted Cox model, AG form of the data
#   Same solution as doweight1.s
#
testw2 <- data.frame(id  =  c( 1, 1, 2, 3, 3, 3, 4, 5, 5, 6, 7, 8, 8, 9),
                     begin= c( 0, 5, 0, 0,10,15, 0, 0,14, 0, 0, 0,23, 0),
                     time=  c( 5,10,10,10,15,20,20,14,20,20,30,23,40,50),
                    status= c( 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0),
                    x=      c( 2, 2, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0),
                    wt =    c( 1, 1, 2, 3, 3, 3, 4, 3, 3, 2, 1, 2, 2, 1))

fit0 <- coxph(Surv(begin,time, status) ~x, testw2, weights=wt,
                    method='breslow', iter=0)
fit  <- coxph(Surv(begin,time, status) ~x, testw2, weights=wt, method='breslow')
fit0
summary(fit)
resid(fit0, type='mart', collapse=testw2$id)
resid(fit0, type='score', collapse=testw2$id)
resid(fit0, type='scho')

resid(fit, type='mart', collapse=testw2$id)
resid(fit, type='score', collapse=testw2$id)
resid(fit, type='scho')
fit0 <- coxph(Surv(begin, time, status) ~x,testw2, weights=wt, iter=0)
resid(fit0, 'mart', collapse=testw2$id)
resid(coxph(Surv(begin, time, status) ~1, testw2, weights=wt) ,collapse=testw2$id)  #Null model 
residuals.coxph(coxph(Surv(begin, time, status) ~1, testw2, weights=wt) ,collapse=testw2$id)  #Null model 

fit  <- coxph(Surv(begin,time, status) ~x, testw2, weights=wt, method='efron')
fit
resid(fit, type='mart', collapse=testw2$id)
resid(fit, type='score', collapse=testw2$id)
resid(fit, type='scho')

#
# Read in the ovarian data
#

ovarian <- read.table("data.ovarian", row.names=NULL,
            col.names= c("futime", "fustat", "age", "resid.ds", "rx", "ecog.ps"))
#
# Test the coxph program on the Ovarian data
#

xx _ order(ovarian$futime)   #put data in same order as SAS green book
temp <- ovarian[xx,]
attach(temp)

# List the data
temp

summary(survfit(Surv(futime, fustat)), censor=T)

# Various models
coxph(Surv(futime, fustat)~ age)
coxph(Surv(futime, fustat)~ resid.ds)
coxph(Surv(futime, fustat)~ rx)
coxph(Surv(futime, fustat)~ ecog.ps)

coxph(Surv(futime, fustat)~ resid.ds + rx + ecog.ps)
coxph(Surv(futime, fustat)~ age + rx + ecog.ps)
coxph(Surv(futime, fustat)~ age + resid.ds + ecog.ps)
coxph(Surv(futime, fustat)~ age + resid.ds + rx)

# Residuals
fit <- coxph(Surv(futime, fustat)~ age + resid.ds + rx + ecog.ps )
resid(fit)
resid(fit, 'dev')
resid(fit, 'scor')
resid(fit, 'scho')

fit <- coxph(Surv(futime, fustat) ~ age + ecog.ps + strata(rx))
summary(fit)
summary(survfit(fit))
sfit <- survfit(fit, list(age=c(30,70), ecog.ps=c(2,3)))  #two columns
sfit
summary(sfit)
detach(pos=2)


# Test the robust=T option of coxph
fit <- coxph(Surv(futime, fustat) ~ age + ecog.ps + rx, ovarian, robust=T)
rr <- resid(fit, type='dfbeta')
all.equal(as.vector(t(rr) %*% rr), as.vector(fit$var))
#temp _ scan("data.jasa", what=list(id=0, b.mo=0, b.d=0, b.y=0,
#                                      a.mo=0, a.d=0, a.y=0,
#                                      t.mo=0, t.d=0, t.y=0,
#                                      f.mo=0, f.d=0, f.y=0,
#                                      fu.stat=0, surg=0, mismatch=0,
#                                      hla.a2=0, mscore=0, reject=0))
temp _ read.table("data.jasa")
names(temp)<-c("id", "b.mo", "b.d", "b.y","a.mo","a.d", "a.y","t.mo", "t.d", "t.y","f.mo", "f.d", "f.y","fu.stat", "surg", "mismatch","hla.a2", "mscore", "reject")
temp3 _ mdy.date(temp$b.mo, temp$b.d, temp$b.y)
temp4 _ mdy.date(temp$a.mo, temp$a.d, temp$a.y)
temp5 _ mdy.date(temp$t.mo, temp$t.d, temp$t.y)
temp6 _ mdy.date(temp$f.mo, temp$f.d, temp$f.y)

jasa <- list( birth.dt=temp3, accept.dt=temp4, tx.date=temp5,
            fu.date=temp6, fustat=temp$fu.stat,
            surgery = temp$surg,  age= temp4-temp3,
            futime = 1+temp6 - temp4, wait.time= 1+temp5 - temp4,
            transplant = c(!is.na(temp5)),
            mismatch=temp$mismatch, hla.a2=temp$hla.a2,
            mscore = temp$mscore, reject=temp$reject)

class(jasa) <- 'data.frame'
attr(jasa, 'row.names') <- temp$id

# The "1+" above causes us to match the analysis in Kalbfleisch and Prentice.
#  Someone accepted and transplanted on the same day is assumed to have one
#  day under the non-transplant risk.
expect <- survexp(futime ~ ratetable(age=(accept.dt - birth.dt), sex=1,
        year=accept.dt), jasa, cohort=F, ratetable=survexp.uswhite)

survdiff(Surv(jasa$futime, jasa$fustat) ~ offset(expect))
# Do a Stanford heart transplant data the way that K&P do
#
# Input - a data frame containing the raw data
# Output- a data frame with multiple obs for the transplanted subjects
#
#  There are more efficient ways to do this than the "for" loop below, but
#    this script is much more readable than any of them.
#
stan1 <- function(jasa) {
    id <- row.names(jasa)
    tx _ jasa$transplant ==1
    covar _ cbind(jasa$age/365.25 -48,
                  (jasa$accept.dt - mdy.date(10,1,1967))/365.25,
                  jasa$surgery)
    n _ length(tx)     #number in study
    ntx _ sum(tx)      #number transplanted
    # Per paragraph 2, p138, the patient who died on the day of transplant is
    #   treated differently, for all others deaths are assumed to occur "earlier
    #   in the day" than transplants.
    special <- id ==38
    wait _ jasa$wait.time
    wait[special] _ wait[special] - .5

    age <- year <- surgery <- transplant <- id2 <- double(n+ntx)
    start <- stop <- event <- double(n+ntx)
    ii <- 1
    for (i in 1:n) {
        age[ii] <- covar[i,1]
        year[ii] <- covar[i,2]
        surgery[ii] <- covar[i,3]
        transplant[ii] <- 0
        id2[ii] <- id[i]

        if (tx[i])  { #transplanted  - 2 lines if data
            start[ii] <- 0
            stop[ii]  <- wait[i]
            event[ii] <- 0

            ii <- ii+1
            start[ii] <- wait[i]
            stop[ii]  <- jasa$futime[i]
            event[ii] <- jasa$fustat[i]
            age[ii] <- covar[i,1]
            year[ii] <- covar[i,2]
            surgery[ii] <- covar[i,3]
            transplant[ii] <- 1
            id2[ii] <- id[i]
            }
        else {                             # one line of data
            start[ii] <-0
            stop[ii] <- jasa$futime[i]
            event[ii]<- jasa$fustat[i]
            }
        ii <- ii+1
        }

    data.frame(start, stop, event, age, year, surgery,
                    transplant=factor(transplant), id=id2)
    }

jasa1 _ stan1(jasa)
names(jasa1)<-c("start", "stop", "event", "age", "year", "surgery","transplant","id")
attach(jasa1)
# Now fit the 6 models found in Kalbfleisch and Prentice, p139
options(contrasts=c("contr.treatment", "contr.treatment"))
sfit.1 _ coxph(Surv(start, stop, event)~ (age + surgery)*transplant,
                        method='breslow')
sfit.2 _ coxph(Surv(start, stop, event)~ year*transplant,
                                method='breslow')
sfit.3 _ coxph(Surv(start, stop, event)~ (age + year)*transplant,
                                method='breslow')
sfit.4 _ coxph(Surv(start, stop, event)~ (year +surgery) *transplant,
                                method='breslow')
sfit.5 _ coxph(Surv(start, stop, event)~ (age + surgery)*transplant + year ,
                                method='breslow')
sfit.6 _ coxph(Surv(start, stop, event)~ age*transplant + surgery + year,
                                method='breslow')

summary(sfit.1)
sfit.2
summary(sfit.3)
sfit.4
sfit.5
sfit.6

# Survival curve for the "average" subject
summary(survfit(sfit.1))

# Survival curve for a subject of age 50, with prior surgery, tx at 6 months
data <- data.frame(start=c(0,183), stop=c(183,3*365), event=c(1,1),
                   age=c(50,50),  surgery=c(1,1), transplant=c(0,1))
summary(survfit(sfit.1, data, individual=T))

# These should all give the same answer
j.age <- jasa$age/365.25 -48
fit1 <- coxph(Surv(futime, fustat) ~ j.age, data=jasa)
fit2 <- coxph(Surv(futime, fustat) ~ j.age, jasa, init=fit1$coef, iter=0)
fit3 <- coxph(Surv(start, stop, event) ~ age)
fit4 <- coxph(Surv(start, stop, event) ~ offset((age-fit3$means)*fit1$coef))
s1 <- survfit(fit1, fit3$means)
s2 <- survfit(fit2, fit3$means)
s3 <- survfit(fit3)
s4 <- survfit(fit4)

all.equal(s1$surv, s2$surv)
all.equal(s1$surv, s3$surv)
all.equal(s1$surv, s4$surv)

# Still the same answer, fit multiple strata at once
#  Strata 1 has independent coefs of strata 2, so putting in
#    the other data should not affect it
ll <- length(start)
ss <- rep(0:1, c(ll,ll))
fit <- coxph(Surv(rep(start,2), rep(stop,2), rep(event,2)) ~
                        rep(age,2)*strata(ss) + I(rep(age,2)^2*ss) )
fit$coef[1] - fit3$coef
s4 <- survfit(fit, c(fit3$means, 0,0))
all.equal(s4$surv[1:(s4$strata[1])],  s3$surv)
detach("jasa1")

# A subset of some local data on lung cancer patients.  A useful internal test
#  because it has multiple strata and lots of missing values.
#

# inst = enrolling institution
# sex  1=male  2=female
# ph.ecog  physician's estimate of the ECOG performace score.  0=fully active,
#              4=bedridden
# ph.karno  physician's estimate of the Karnofsky score, a competitor to the
#              ECOG ps.
# pat.karno  patient's assesment of his/her Karnofsky score
# meal.cal   # calories consumed at meals (exclude beverages and snacks)
# wt.loss    weight loss in the last 6 months

cancer <- read.table("data.cancer")
names(cancer)<- c("inst", "time", "status", "age", "sex", "ph.ecog", "ph.karno", "pat.karno", "meal.cal", "wt.loss")
#cancer <- data.frame(cancer)
#
# Test out all of the routines on a more complex data set
#
attach(cancer)
stime <- Surv(time, status)
temp <- survfit(stime ~ ph.ecog)
summary(temp, times=c(30*1:11, 365*1:3))
print(temp[2:3])

temp <- survfit(stime, type='fleming',
                   conf.int=.9, conf.type='log-log', error='tsiatis')
summary(temp, times=30 *1:5)

temp <- survdiff(stime ~ inst, rho=.5)
print(temp, digits=6)

temp <- coxph(stime ~ ph.ecog + ph.karno + pat.karno + wt.loss + sex +age + meal.cal + strata(inst))
summary(temp)
cox.zph(temp)
cox.zph(temp, transform='identity')

coxph(Surv(rep(0,length(time)), time, status) ~ ph.ecog + ph.karno + pat.karno
                + wt.loss + sex + age + meal.cal + strata(inst))
#detach(w=2)
detach(pos=2)
bladder <- read.table('data.bladder4',
             col.names=c('id', 'rx', 'number', 'size', 'start', 'stop',
                                'event', 'enum'))

bladder2 <- bladder[bladder$start< bladder$stop, ]
bladder$start <- NULL
bladder <- bladder[bladder$enum<5,]
#
# Fit the models found in Wei et. al.
#
wfit <- coxph(Surv(stop, event) ~ (rx + size + number)* strata(enum) +
                 cluster(id), bladder, method='breslow')
wfit

# Check the rx coefs versus Wei, et al, JASA 1989
rx <- c(1,4,5,6)  # the treatment coefs above
cmat <- diag(4); cmat[1,] <- 1;          #contrast matrix
wfit$coef[rx] %*% cmat                   # the coefs in their paper (table 5)
t(cmat) %*% wfit$var[rx,rx] %*% cmat  # var matrix (eqn 3.2)

# Anderson-Gill fit
fita <- coxph(Surv(start, stop, event) ~ rx + size + number + cluster(id),
                  bladder2,  method='breslow')
fita

# Prentice fits.  Their model 1 a and b are the same
fit1p  <- coxph(Surv(stop, event) ~ rx + size + number, bladder2,
                subset=(bladder2$enum==1), method='breslow')
fit2pa <- coxph(Surv(stop, event) ~ rx + size + number, bladder2,
                subset=(bladder2$enum==2), method='breslow')
fit2pb <- coxph(Surv(stop-start,  event) ~ rx + size + number, bladder2,
                   subset=(bladder2$enum==2), method='breslow')
fit3pa <- coxph(Surv(stop, event) ~ rx + size + number, bladder2,
                subset=(bladder2$enum==3), method='breslow')
 #and etc.
fit1p
fit2pa
fit2pb
fit3pa
#  Tests of expected survival


#
# Simple case 1: a single male subject, born 6/6/36 and entered on study 6/6/55.
#
#  Compute the 1, 5, 10 and 12 year expected survival

temp1 <- mdy.date(6,6,36)
temp2 <- mdy.date(6,6,55)
exp1 <- survexp(~ratetable(year=temp2, age=(temp2-temp1), sex=1),
                    ratetable=survexp.uswhite,times=c(366, 1827, 3653, 4383))

# Well, almost easy.  The uswhite table assumes that someone age 19 will have
#   seen 5 leap years-- but this lad has only seen 4.  Thus the routine sees
#   him as 1 day shy of his birthday.
#   (The age 19 category starts at 6940, but he is 6939 days old at entry).
# Thus his passage through the table is a bit more complicated

# First epoch:  1 day  at age 18, 1954    6/6/55
#             365 days at age 19, 1955    6/7/55 - 6/5/56

# Second      365 days at age 20, 1956    6/6/56 - 6/5/57
#             365 days at age 21, 1957    6/6/57 - 6/5/58
#             366 days at age 22, 1958    6/6/58 - 6/6/59
#             365 days at age 23, 1959    6/7/59 - 6/5/60

# Third       365 days at age 24, 1960    6/6/60 - 6/5/61
#             365 days at age 25, 1961    6/6/61 - 6/5/62
#             366 days at age 26, 1962    6/6/62 - 6/6/63
#             365 days at age 27, 1963    6/7/63 - 6/5/64
#             365 days at age 28, 1964    6/6/64 - 6/5/64

# Fourth      365 days at age 29, 1965    6/6/65 - 6/5/66
#             365 days at age 30, 1966    6/6/66 - 6/5/67

# remember, a first subscript of "1" is for age 0

xx <- survexp.uswhite[,1,]
check <- c(       (.6*xx[19,1] + .4*xx[19,2])    +
             365* (.5*xx[20,1] + .5*xx[20,2]) ,

             365* (.4*xx[21,1] + .6*xx[21,2]) +
             365* (.3*xx[22,1] + .7*xx[22,2]) +
             366* (.2*xx[23,1] + .8*xx[23,2]) +
             365* (.1*xx[24,1] + .9*xx[24,2]) ,

             365* (   xx[25,2]              ) +
             365* (.9*xx[26,2] + .1*xx[26,3]) +
             366* (.8*xx[27,2] + .2*xx[27,3]) +
             365* (.7*xx[28,2] + .3*xx[28,3]) +
             365* (.6*xx[29,2] + .4*xx[29,3]) ,

             365* (.5*xx[30,2] + .5*xx[30,3]) +
             365* (.4*xx[31,2] + .6*xx[31,3]) )

print(exp1$surv)
print(exp(-cumsum(check)))

# This does not pass the "all.equal" test.  Because of leap year (again),
#  the internal S code does not do exactly what is stated above.  US
#  rate tables are special: the entry for age 20 in 1950 is the probability
#  that someone who becomes 20 years old in 1950 will reach his 21st birthday.
#  In order to fit this into the general "cutpoints" calculations of
#  person-years, dates are adjusted so that everyone appears to have a
#  birthday on Jan 1.  But because of leap years, a birthday then moves to
#  Dec 31 sometimes --  this happens in each of the 366 day intervals above,
#  e.g., 1 day at age 22 with 1957 rates, 365 at age 22 with 1958 rates.
#  Upshot: they only agree to 6 decimals.  Close enough for me!

#  Tests of expected survival

#
# Simple case 2: make sure that the averages are correct, for Ederer method
#
#  Compute the 1, 5, 10 and 12 year expected survival

temp1 <- mdy.date(1:6,6:11,36:41)
temp2 <- mdy.date(6:1,11:6,55:50)
age <- temp2 - temp1

exp1 <- survexp(~ratetable(year=temp2, age=(temp2-temp1), sex=1),
               times=c(366, 1827, 3653, 4383))
exp2 <- survexp(~ratetable(year=temp2, age=(temp2-temp1), sex=1) + I(1:6),
               times=c(366, 1827, 3653, 4383))

##print(all.equal(exp1$surv, apply(exp2$surv, 1, mean)))
exp1$surv -  apply(exp2$surv, 1, mean)
#
# Check that adding more time points doesn't change things
#
exp3 <- survexp(~ratetable(year=temp2, age=(temp2-temp1), sex=1),
        times=sort(c(366, 1827, 3653, 4383, 30*(1:100))))

exp1$surv
exp3$surv[match(exp1$time, exp3$time, nomatch=0)]


#
# Now test Hakulinen's method, assuming an analysis date of 3/1/57
#
futime <- mdy.date(3,1,57) - temp2
xtime  <- sort(c(futime, 30, 60, 185, 365))

exp1 <- survexp(futime ~ ratetable(year=temp2, age=(temp2-temp1), sex=1),times=xtime, conditional=F)
exp2 <- survexp(~ratetable(year=temp2, age=(temp2-temp1), sex=1) + I(1:6),times=futime)

wt <- rep(1,6)
con <- double(6)
for (i in 1:6) {
    con[i] <- sum(exp2$surv[i,i:6])/sum(wt[i:6])
    wt <- exp2$surv[i,]
    }

exp1$surv[match(futime, xtime)]
cumprod(con)                 #should be equal


#
# Now for the conditional method
#
exp1 <- survexp(futime ~ ratetable(year=temp2, age=(temp2-temp1), sex=1),
              times=xtime, conditional=T)

cond <- exp2$surv
for (i in 6:2) cond[i,] _ (cond[i,]/cond[i-1,])  #conditional survival
for (i in 1:6) con[i] _ exp(mean(log(cond[i, i:6])))

exp1$surv[match(futime, xtime)]-cumprod(con)
cumprod(con)

# 
# Simple case: a single male subject, born 6/6/36 and entered on study 6/6/55.
#

temp1 <- mdy.date(6,6,36)
temp2 <- mdy.date(6,6,55)# Now compare the results from person-years
#
temp.age <- tcut(temp2-temp1, floor(c(-1, (18:31 * 365.24))),
        labels=c('0-18', paste(18:30, 19:31, sep='-')))
temp.yr  <- tcut(temp2, mdy.date(1,1,1954:1965), labels=1954:1964)
temp.time <- 3700   #total days of fu
py1 <- pyears(temp.time ~ temp.age + temp.yr, scale=1) #output in days

# The subject should appear in 20 cells
#    6/6/55 - 12/31/55, 209 days, age 19-20, 1955
#    1/1/56 -  6/ 4/56, 156 days, age 19-20, 1956
#    6/5/56 - 12/31/56, 210 days, age 20-21, 1956   (a leap year, and his
#                       birthday computes one day earlier)
#    1/1/57 -  6/ 5/57, 156 days, age 20-21, 1957
#    6/6/57 - 12/31/57, 209 days, age 21-22, 1957
# and etc
#   with 203 days "off table", ie, beyond the last cell of the table
#
# It is a nuisance, but tcut follows 'cut' in that we give the ENDS of
#  the intervals, whereas the survival tables use the starts of intervals.
#  Thus this breakdown does not match that in doexpect.s
#
xx <- matrix(0, nrow=14, ncol=11)
xx[cbind(3:11, 3:11)] <- 156
xx[cbind(3:12, 2:11)] <- c(209, 210, rep(c(209, 209, 209, 210),2))
dimnames(xx) <- list(c('0-18', paste(18:30, 19:31, sep='-')), 1954:1964)
all.equal(xx, py1$pyears)
all.equal(203, py1$offtable)
all.equal(1*(xx>0), py1$n)

#
# Now with expecteds
#
py2 <- pyears(temp.time ~ temp.age + temp.yr
                + ratetable(age=temp2-temp1, year=temp2, sex=1),
             scale=1, ratetable=survexp.uswhite ) #output in days
all.equal(xx, py2$pyears)
all.equal(203, py2$offtable)
all.equal(1*(xx>0), py2$n)


py3 <-  pyears(temp.time ~ temp.age + temp.yr
                + ratetable(age=temp2-temp1, year=temp2, sex=1),
             scale=1, ratetable=survexp.uswhite , expect='pyears')
all.equal(py2$n, py3$n)
all.equal(py2$pyear, py3$pyear)
all.equal(py3$n, 1*(py3$expect>0))

# Now, compute the py3 result "by hand".  Since there is only one person
#   it can be derived from py2.
#
xx1 <- py2$expect[py2$n>0]              # the hazard over each interval
cumhaz <- cumsum(c(0, xx1[-length(xx1)]))     # the cumulative hazard   
xx2 <- py3$expect[py3$n>0]              # the expected number of person days
xx3 <- py3$pyears[py3$n>0]              # the potential number of person days

# This is the integral of the curve "exp(-haz *t)" over the interval
integral <- xx3 * exp(-cumhaz)* (1- exp(-xx1))/ xx1
# They might not be exactly equal, since the C code tracks changes in the
#   rate tables that occur -within- an interval.  So try for 7 digits
all.equal(round(integral,4), round(xx2,4))

# Cut off the bottom of the table, instead of the side
temp.age <- tcut(temp2-temp1, floor(c(-1, (18:27 * 365.24))),
        labels=c('0-18', paste(18:26, 19:27, sep='-')))

py4 <- eval(py3$call)
all.equal(py4$pyear, py3$pyear[1:10,])
all.equal(py4$expect, py3$expect[1:10,])



# Run a test that can be verified using SAS's LIFEREG
#
fit1w <- survreg(Surv(time, status) ~x, test1, link='log', dist='extreme')
fit1w
summary(fit1w)

fit1e <- survreg(Surv(time, status) ~x, test1, link='log', dist='exp')
fit1e
summary(fit1e)

fit1l <- survreg(Surv(time, status) ~x, test1, link='log', dist='logistic')
fit1l
summary(fit1l)

fit1g <- survreg(Surv(time, status) ~x, test1, link='log', dist='gaussian')
summary(fit1g)
#
#  Do a test with the ovarian data
#
fitfw <- survreg(Surv(futime, fustat) ~ age + ecog.ps, ovarian,
        link='log', dist='extreme')
fitfw

fitfl <- survreg(Surv(futime, fustat) ~ age + ecog.ps, ovarian,
        link='log', dist='logistic')
fitfl

#flem2 <- scan("fleming.data2", what=list(ltime=0, rtime=0))
flem2<-read.table("fleming.data2")
names(flem2)<-c("ltime","rtime")
flsurv<- Surv(flem2$ltime, flem2$rtime, type='interval2')

fitfw2 <- survreg(flsurv ~ age + ecog.ps, ovarian,
         link='log', dist='extreme')
summary(fitfw2)

fitfl2 <- survreg(flsurv ~ age + ecog.ps, ovarian,
         link='log', dist='logistic')
summary(fitfl2)

fitfg2 <- survreg(flsurv ~ age + ecog.ps, ovarian,
         link='log', dist='gaussian')
summary(fitfg2)
#
# A simple test of the singular=ok option
#
test1 <- data.frame(time=  c(4, 3,1,1,2,2,3),
                    status=c(1,NA,1,0,1,1,0),
                    x=     c(0, 2,1,1,1,0,0))

temp <- rep(0:3, rep(7,4))

stest <- data.frame(start  = 10*temp,
                    stop   = 10*temp + test1$time,
                    status = rep(test1$status,4),
                    x      = c(test1$x+ 1:7, rep(test1$x,3)),
                    epoch  = rep(1:4, rep(7,4)))

fit1 <- coxph(Surv(start, stop, status) ~ x * factor(epoch), stest)
fit1$coef    # elements 2:4 should be NA
#
# Test some more features of surv.diff
#
#  First, what happens when one group is a dummy
#


#
# The AML data, with a third group of early censorings "tacked on"
#
aml3 <- data.frame(time=   c( 9, 13, 13, 18, 23, 28, 31, 34, 45, 48, 161,
                           5,  5,  8,  8, 12, 16, 23, 27, 30, 33, 43, 45,
                           1,  2,  2,  3,  3,  3,  4),
            status= c( 1,1,0,1,1,0,1,1,0,1,0, 1,1,1,1,1,0,1,1,1,1,1,1,
                                0,0,0,0,0,0,0),
            x     = as.factor(c(rep("Maintained", 11),
                                rep("Nonmaintained", 12), rep("Dummy",7) )))


# These should give the same result (chisq, df), but the second has an
#  extra group
survdiff(Surv(time, status) ~x, aml)
survdiff(Surv(time, status) ~x, aml3)


#
# Now a test of the stratified log-rank
#   There are no tied times within institution, so the coxph program
#   can be used to give a complete test
#
fit <- survdiff(Surv(time, status) ~ pat.karno + strata(inst), cancer)


cfit <- coxph(Surv(time, status) ~ factor(pat.karno) + strata(inst),
                cancer, iter=0)

tdata <- na.omit(cancer[,c('time', 'status', 'pat.karno', 'inst')])

temp1 <- tapply(tdata$status-1, list(tdata$pat.karno, tdata$inst), sum,na.rm=T)
temp1 <- ifelse(is.na(temp1), 0, temp1)
temp2 <- tapply(cfit$resid,  list(tdata$pat.karno, tdata$inst), sum,na.rm=T) 
temp2 <- ifelse(is.na(temp2), 0, temp2)

temp2 <- temp1 - temp2

#Now temp1=observed, temp2=expected
#all.equal(c(temp1), c(fit$obs))
#all.equal(c(temp2), c(fit$exp))
# except not in R since the orders are wrong.
##all.equal(fit$var[-1,-1], solve(cfit$var))
#max(fit$var[-1,-1]- solve(cfit$var))


q("no")







