---
title: "BSTZINB_vignette"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{BSTZINB_vignette}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
  %\VignetteDepends{CorrMixed, ape}
---

```{r, include = FALSE, warning=FALSE, message=FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.width=4, fig.height=4, fig.align='center',
  comment = "#>"
)
```

Install the R package from CRAN using the `install.packages` function:

```{r, eval=FALSE}
install.packages('BSTZINB')
```

Load the package using the `library()` function:

```{r setup}
library(BSTZINB)
```

The functions in the package can be used directly on real data. In the absence of such data, we first create a synthetic dataset and demonstrate the available functions on it. Suppose we want to apply our method to presence-absence disease data for the counties of Iowa.

First, we need to create the adjacency matrix for the counties of Iowa. To do so, we use the datasets `county.adjacency` and `USAcities` provided in the package:

```{r}
data(county.adjacency)
data(USAcities)
library(dplyr)
library(gt)
library(gtsummary)
IAcities <- USAcities %>% filter(state_id=="IA")
countyname <- unique(IAcities$county_name)
A <- get_adj_mat(county.adjacency,countyname,c("IA"))
m <- apply(A,1,sum)
```

Next, we set up some parameters:

```{r}
set.seed(091017)
n <- nrow(A)        # Number of spatial units
nis <- rep(16,n)    # Number of observations (time points) per county; balanced here -- 16 per county
# Note: may need to lower proposal variance, s, below as n_i increases
sid <- rep(1:n,nis)
tid <- rep(1:nis[1],n)
N <- length(sid)    # Total number of observations
```

Next, we generate the spatial effects:

```{r}
library(spam)
# library(mvtnorm)
kappa <- 0.99                   # Spatial dependence parameter ~ 1 for intrinsic CAR
covm <- matrix(c(.5,.10,.10,-.10,
                 .10,.15,.10,.10,
                 .10,.10,.5,.10,
                 -.10,.10,.10,.15),4,4)  # Conditional covariance of (phi1,...,phi4) given phi_{-i}
Q <- as.spam(diag(m)-kappa*A)
covphi <- solve(Q)%x%covm       # 4n x 4n covariance of the phis
phi <- rmvnorm(1,sigma=covphi)  # 4n vector of spatial effects
phitmp <- matrix(phi,ncol=4,byrow=T)  # n x 4 matrix of spatial effects
true.phi1 <- phi1 <- phitmp[,1]-mean(phitmp[,1])  # n x 1 phi1 vector -- centered
true.phi2 <- phi2 <- phitmp[,2]-mean(phitmp[,2])  # n x 1 phi2 vector, etc.
true.phi3 <- phi3 <- phitmp[,3]-mean(phitmp[,3])
true.phi4 <- phi4 <- phitmp[,4]-mean(phitmp[,4])
Phi1 <- rep(phi1,nis)
Phi2 <- rep(phi2,nis)
Phi3 <- rep(phi3,nis)
Phi4 <- rep(phi4,nis)
true.phi <- cbind(true.phi1,true.phi2,true.phi3,true.phi4)
```

This is followed by the fixed effects and the covariate matrix `X`:

```{r}
library(boot)
tpop <- USAcities %>% filter(state_id=="IA") %>% group_by(county_fips) %>%
  summarise(tpop=population) %>% .[,2] %>% unlist %>% as.numeric
x <- rep(log(tpop),nis)
x <- (x-mean(x))/sd(x)
# set.seed(2023); x = rnorm(N)
t <- tid / max(tid)
X <- cbind(1,x)    # Design matrix; additional covariates (e.g., race, age, gender) can be added
Xtilde <- cbind(1,x,sin(t))
p <- ncol(Xtilde)
```

Next, we set up the binomial and the count parts of the data:

```{r}
# Binomial part
true.alpha <- rep(0.1,p)
true.alpha[1:2] <- c(-0.25,0.25)
# true.alpha <- c(-0.25,0.25,-0.5,0.25)
eta1 <- Xtilde%*%true.alpha+Phi1+Phi2*t
pii <- inv.logit(eta1)   # 1-pr("structural zero")
u <- rbinom(N,1,pii)     # At-risk indicator
N1 <- sum(u)
pstruct0 <- 1-mean(u)    # Proportion of structural zeros

# Count part
true.beta <- rep(0.1,p)
true.beta[1:2] <- c(.5,-.25)
tmp <- u
tmp[u==0] <- 0                # If in the structural-zero group, set tmp to 0
nis1 <- tapply(tmp,sid,sum)   # Number of at-risk observations in each county
eta2 <- Xtilde[u==1,]%*%true.beta+Phi3[u==1]+Phi4[u==1]*t[u==1]  # Linear predictor for the count part
true.r <- 1.25                  # NB dispersion
psi <- exp(eta2)/(1+exp(eta2))  # Probability of success
mu <- true.r*psi/(1-psi)        # NB mean
```

Finally, we generate the response `y` and assemble it into a data frame:

```{r}
y <- rep(0,N)                             # Response
y[u==1] <- rnbinom(N1,size=true.r,mu=mu)  # If at risk, draw from NB
pzero <- length(y[y==0])/N                # Proportion of zeros
# pzero
# pstruct0/pzero
# table(y)
# mean(mu)
# quantile(tapply(u,sid,sum))  # Number of at-risk observations per county across all time points
# quantile(y)
this_dat <- data.frame(sid,tid,y,X)
head(this_dat)
tbl_summary(this_dat) %>%
  modify_header(label = "**Coefficients**")
```
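Before moving on to the package functions, it can be useful to sanity-check the simulated outcome against the quantities generated above. The sketch below is an optional check and not part of the package workflow; it compares the overall proportion of zeros with the proportion of structural zeros, and summarizes how the zero proportion varies across counties:

```{r}
# Optional sanity check of the simulated data (not part of the package workflow):
# compare the overall proportion of zeros with the proportion of structural zeros,
# then summarize the county-level proportions of zero counts
c(prop_zero = pzero, prop_structural_zero = pstruct0)
this_dat %>%
  group_by(sid) %>%
  summarise(prop_zero = mean(y == 0)) %>%
  pull(prop_zero) %>%
  summary()
```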
We now apply the functions available in the package to these data. First, we focus on summarizing and visualizing the data through spatial maps and other plots and summaries.

The following code generates a spatial map of the cumulative counts in the state of Iowa:

```{r}
USDmapCount(state.sel = c("IA"), dat = this_dat, scol = 1, tcol = 2, cname = countyname, uplim=150)
```

We can additionally produce a histogram, a spaghetti plot, and Moran's I statistics:

```{r}
# Blue histogram
tmp <- table(y)/N*100   # Convert to percentages (divide by N, multiply by 100)
oldpar <- par(no.readonly = TRUE)
par(mar=c(3,3,1,1))
barplot(tmp, ylab="Percent", xlab="Count", col="lightblue")

# Spaghetti plot
library(CorrMixed)
# Plot individual profiles + mean
Spaghetti.Plot(Dataset = this_dat, Outcome = y, Id = sid, Time = tid)
par(oldpar)

# Moran's I
library(ape)
library(reshape)
this_dat2 <- cast(this_dat,sid~tid,sum,value="y")
Moran.I(rowMeans(this_dat2),A)
this_dat2 %>% apply(2,function(w) Moran.I(w,A)) %>% lapply(function(list) list$p.value) %>% unlist
```

We can visualize the counts at given time points using the `tsel` option:

```{r}
USDmapCount(state.sel = c("IA"), dat = this_dat, scol = 1, tcol = 2, tsel = 3, cname = countyname)   ## Time point 3
USDmapCount(state.sel = c("IA"), dat = this_dat, scol = 1, tcol = 2, tsel = 6, cname = countyname)   ## Time point 6
USDmapCount(state.sel = c("IA"), dat = this_dat, scol = 1, tcol = 2, tsel = 9, cname = countyname)   ## Time point 9
USDmapCount(state.sel = c("IA"), dat = this_dat, scol = 1, tcol = 2, tsel = 12, cname = countyname)  ## Time point 12
```

Now we apply the different models to these data. We start by fitting Bayesian Negative Binomial (BNB) and Bayesian Zero-Inflated Negative Binomial (BZINB) models using the functions `BNB` and `BZINB`:

```{r}
res0 <- BNB(y, X, A, nchain=2, niter=50, nburn=10)
glimpse(res0)
res1 <- BZINB(y, X, A, nchain=2, niter=50, nburn=10)
glimpse(res1)
apply(res0$Beta,2,mean)
apply(res1$Beta,2,mean)
```

Next is the Bayesian Spatiotemporal Negative Binomial (BSTNB) model, fitted with the function `BSTNB`:

```{r}
res2 <- BSTNB(y, X, A, nchain=2, niter=50, nburn=10)
glimpse(res2)
```

This is followed by a Bayesian Spatiotemporal Zero-Inflated Negative Binomial (BSTZINB) model with a linear temporal trend, fitted using the `BSTZINB` function with the option `LinearT` set to `TRUE`. Convergence of the model parameters can be checked with the `conv.test` function supplied with the package, and the DIC can be computed with the supplied `compute_ZINB_DIC` function.

```{r,fig.width=6,fig.height=3,fig.align='center'}
res3 <- BSTZINB(y, X, A, LinearT=TRUE, nchain=2, niter=50, nburn=10, nthin=1)
glimpse(res3)
conv.test(res3$Alpha)  ## Convergence test for the Alpha parameters
conv.test(res3$Beta)   ## Convergence test for the Beta parameters
conv.test(res3$R)      ## Convergence test for the R parameter

## Plotting the chains for the parameters
oldpar <- par(no.readonly = TRUE)
par(mfrow=c(1,2));plot(res3$R[,1],main="R");plot(res3$R[,2],main="R")
par(mfrow=c(1,2));plot(res3$Alpha[,1,1],type='o',main="Alpha");plot(res3$Alpha[,1,2],type='o',main="Alpha")
par(oldpar)

## Comparing the estimated beta with the true beta
apply(res3$Beta,2,mean)
true.beta

## Computing DIC
compute_ZINB_DIC(y,res3,dim(res3$Beta)[1],2)
```
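Interval summaries of the regression effects can also be read directly off the stored MCMC draws. A minimal sketch, assuming `res3$Beta` holds the posterior draws with coefficients along the second dimension (as the `apply(res3$Beta, 2, mean)` call above suggests):

```{r}
# Posterior medians and equal-tailed 95% intervals for the count-part coefficients,
# computed from the stored MCMC draws (coefficients assumed to lie along dimension 2)
apply(res3$Beta, 2, quantile, probs = c(0.025, 0.5, 0.975))
```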
Another BSTZINB model, this time with a spline-fitted non-linear temporal trend, is fitted below by switching `LinearT` to `FALSE`. The corresponding convergence tests, plots, and DIC computation are also presented. The two DICs can be compared to pick the better-fitting model.

```{r,fig.width=6,fig.height=3,fig.align='center'}
res4 <- BSTZINB(y, X, A, LinearT=FALSE, nchain=2, niter=50, nburn=10)
glimpse(res4)

## Convergence tests
conv.test(res4$Alpha)  ## Alpha parameters
conv.test(res4$Beta)   ## Beta parameters
conv.test(res4$R)      ## R parameter

oldpar <- par(no.readonly = TRUE)
par(mfrow=c(1,2));plot(res4$R[,1],main="R");plot(res4$R[,2],main="R")
par(mfrow=c(1,2));plot(res4$Alpha[,1,1],type='o',main="Alpha");plot(res4$Alpha[,1,2],type='o',main="Alpha")
par(oldpar)

## Posterior means for beta and R
apply(res4$Beta,2,mean)
mean(res4$R)

## DIC
compute_ZINB_DIC(y,res4,dim(res4$Beta)[1],2)
```

These results may be summarized using the supplied `ResultTableSummary` function. Here we summarize the output of the BSTZINB model with the linear time trend:

```{r}
ResultTableSummary(res3)
```

A more detailed summary can be obtained with the function `ResultTableSummary2`. This function takes the data (`y`, `X`, `A`, `nt`) as input, runs all four of the above models, computes their DICs, checks convergence, and summarizes the results:

```{r}
ResultTableSummary2(y, X, A, nchain=2, niter=50, nburn=10, nthin=1)
```

Next, we focus on the outputs of these models and on visualizing them. First, we visualize the time-averaged Eta parameter as follows:

```{r}
# Map visualization: time-averaged Eta
this_result_dat <- this_dat
this_result_dat$y <- apply(res4$Eta1,2,mean)
this_result_dat %>% head
USDmapCount(state.sel = c("IA"), dat = this_result_dat, scol = 1, cname = countyname)
```

Next, we visualize the random and fixed effects of Eta:

```{r}
# Map visualization: spatio-temporal random effect of Eta
this_result_dat <- this_dat
Phi1 <- rep(apply(res4$PHI1,2,mean),nis)
Phi2 <- rep(apply(res4$PHI2,2,mean),nis)
tt <- this_dat$tid / max(this_dat$tid)
this_result_dat$y <- Phi1+Phi2*tt
USDmapCount(state.sel = c("IA"), dat = this_result_dat, scol = 1, tcol = 2, cname = countyname)

# Map visualization: temporal fixed effect of Eta
this_result_dat <- this_dat
Phi1 <- rep(apply(res4$PHI1,2,mean),nis)
Phi2 <- rep(apply(res4$PHI2,2,mean),nis)
tt <- this_dat$tid / max(this_dat$tid)
this_result_dat$y <- apply(res4$Eta1,2,mean) - (Phi1+Phi2*tt)  # Eta minus the spatio-temporal random effect
USDmapCount(state.sel = c("IA"), dat = this_result_dat, scol = 1, tcol = 2, cname = countyname)
```

We can also map the time-averaged and time-specific probabilities of being at risk of the disease:

```{r}
# Map visualization: time-averaged probability of being at risk of disease
this_result_dat <- this_dat
this_result_dat$y <- inv.logit(apply(res4$Eta1,2,mean))
this_result_dat %>% head
USDmapCount(state.sel = c("IA"), dat = this_result_dat, scol = 1, tcol = 2, cname = countyname)

# Map visualization: time-specific probability of being at risk of disease
this_result_dat <- this_dat
this_result_dat$y <- inv.logit(apply(res4$Eta1,2,mean))
this_result_dat %>% head
USDmapCount(state.sel = c("IA"), dat = this_result_dat, scol = 1, tcol = 2, tsel = 1, cname = countyname)
```

The output can be summarized further by plotting the at-risk probabilities of the top counties, selected by quantile or by count, using the `qRankPar` and `qRankParTop` functions:

```{r}
# Bar plot of the vn-quantile probability of being at risk of disease
qRankPar(state.set=c("IA"),cname=countyname,bstfit=res3,vn=12, cex.title=18, cex.lab=12, cex.legend=12)

# Bar plot of the top vn counties by probability of being at risk of disease
qRankParTop(state.set=c("IA"),cname=countyname,bstfit=res3,vn=12, cex.title=18, cex.lab=12, cex.legend=12)
```

The non-linear time effect on abundance from the BSTZINB fits (non-smoothed and smoothed versions) can be plotted using the `TimetrendCurve` function:
```{r}
# Line plot of the non-linear time effect on abundance
TimetrendCurve(res4,cname=countyname,vn=3,smooth.mode=FALSE, cex.title=18, cex.lab=12, cex.legend=12)

# Line plot of the non-linear time effect on abundance (smoothed version)
TimetrendCurve(res3,cname=countyname,vn=5,smooth.mode=TRUE, cex.title=18, cex.lab=12, cex.legend=12)
```
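Finally, since the DIC was computed for both the linear-trend fit (`res3`) and the spline-trend fit (`res4`) above, the two values can be placed side by side to choose between the temporal trends; lower DIC indicates a better fit. A minimal sketch reusing the `compute_ZINB_DIC` calls from earlier:

```{r}
# Compare the DICs of the linear-trend (res3) and spline-trend (res4) fits;
# the arguments mirror the compute_ZINB_DIC() calls used earlier in this vignette
dic_linear <- compute_ZINB_DIC(y, res3, dim(res3$Beta)[1], 2)
dic_spline <- compute_ZINB_DIC(y, res4, dim(res4$Beta)[1], 2)
dic_linear
dic_spline
```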