--- title: "Analysis code" author: "Armin Rauschenberger" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Analysis code} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} --- **We obtained our results on 2025-05-XX using R version 4.5.0 (2025-04-11, "How About a Twenty-Six") running on a Dell Latitude 5440 laptop with a 13th Gen Intel(R) Core(TM) i7-1365U processor (1800 Mhz), 16 GB RAM, and the operating system Microsoft Windows 10 Enterprise. The total computation time was around XXX hours (excluding data download, without parallel computing).** # Working environment This vignette includes code chunks with a long computation time (e.g., analysing simulated and experimental data) and code chunks with a short computation time (e.g., generating figures and tables). The logical options `sim.app` and `fig.tab` determine whether the chunks for (i) running the simulation and application or (ii) generating figures and tables are evaluated, respectively. Running this vignette with `sim.app=FALSE` and `fig.tab=TRUE` means that figures and tables will be generated from previously obtained results. Running this vignette with `sim.app=TRUE` and `fig.tab=TRUE`and means that all results will be reproduced (including data download and data processing). It is also possible to execute individual chunks (e.g., for reproducing a specific figure or table), but then it is important to provide the required inputs (e.g., "Requires: file X in folder Y, execution of chunk Z"). ```{r eval,eval=TRUE} knitr::opts_chunk$set(echo=TRUE,eval=FALSE) sim.app <- FALSE # reproduce simulation and application? fig.tab <- FALSE # reproduce figures and tables? ``` This chunk verifies the working environment. The working directory, specified by the object `path`, must contain the R functions in "package/R/functions.R" as well as the folders "results" and "manuscript". Alternatively, the R functions can be loaded from the R package `sparselink`. This chunk also installs missing R packages from CRAN or Bioconductor. ```{r setup,eval=sim.app|fig.tab} path <- "C:/Users/arauschenberger/Desktop/sparselink" # LIH (Windows) #path <- "/Users/armin.rauschenberger/Desktop/LIH/sparselink" # LCSB (Mac) dir <- c("results","manuscript","package/R/functions.R") for(i in seq_along(dir)){ if(!dir.exists(file.path(path,dir[i]))&!file.exists(file.path(path,dir[i]))){ stop(paste0("Require folder/file'",dir[i],"'.")) } } source(file.path(path,"package/R/functions.R")) # Or load 'sparselink' package. inst <- rownames(utils::installed.packages()) pkgs <- c("knitr","rmarkdown","glmnet","BiocManager","mvtnorm","glmtrans","spls","xrnet") for(i in seq_along(pkgs)){ if(!pkgs[i]%in%inst){ utils::install.packages(pkgs[i]) } } pkgs <- c("recount3","edgeR") for(i in seq_along(pkgs)){ if(!pkgs[i]%in%inst){ BiocManager::install(pkgs[i]) } } blue <- "blue"; red <- "red" if(exists("sim.app")&exists("fig.tab")){ if(!sim.app&fig.tab){ files <- c("simulation_multiple.RData","simulation_transfer.RData","recount3_data.RData","explore_data.RData","application.RData") for(i in seq_along(files)){ if(!file.exists(file.path(path,"results",files[i]))){ stop("File",files[i],"is missing.") } } } } ``` # Methods This chunk generates the figure for the methods section. - Requires: execution of chunk `setup` - Execution time: $1$ second - Ensures: file `fig_flow.eps` in folder `manuscript` ```{r method_figure,eval=fig.tab} #<> grDevices::postscript(file=file.path(path,"manuscript","fig_flow.eps"),width=6,height=2.5,horizontal=FALSE,onefile=FALSE,paper="special") graphics::par(mfrow=c(1,1),mar=c(0,0,0,0)) graphics::plot.new() graphics::plot.window(xlim=c(-0.2,1.0),ylim=c(0.0,1.0)) cex <- 0.8 pos <- data.frame(left=0.2,right=0.8,top=0.8,centre=0.45,bottom=0.1) mar <- data.frame(vertical=0.08,horizontal=0.08,dist=0.04) graphics::text(labels=paste("problem",1:2),x=c(pos$left,pos$right),y=pos$top+2*mar$vertical,font=2,col=c(blue,red),cex=cex) graphics::text(labels=expression(hat(beta)["j,1"]^{init}),x=pos$left,y=pos$top,col=blue) graphics::text(labels=expression(hat(beta)["j,2"]^{init}),x=pos$right,y=pos$top,col=red) graphics::arrows(x0=rep(c(pos$left,pos$right),each=2),x1=rep(c(pos$left,pos$right),times=2)+c(-mar$horizontal,-mar$horizontal,mar$horizontal,mar$horizontal),y0=pos$top-mar$vertical,y1=pos$centre+mar$vertical,length=0.1,col=rep(c(blue,red),each=2),lwd=2) graphics::text(labels=expression(w["j,1"]^{int}),x=pos$left-mar$horizontal-mar$dist,y=pos$centre,col=blue) graphics::text(labels=expression(w["p+j,1"]^{int}),x=pos$left-mar$horizontal+mar$dist,y=pos$centre,col=blue) graphics::text(labels=expression(w["j,1"]^{ext}),x=pos$left+mar$horizontal-mar$dist,y=pos$centre,col=red) graphics::text(labels=expression(w["p+j,1"]^{ext}),x=pos$left+mar$horizontal+mar$dist,y=pos$centre,col=red) graphics::text(labels=expression(w["j,2"]^{ext}),x=pos$right-mar$horizontal-mar$dist,y=pos$centre,col=blue) graphics::text(labels=expression(w["p+j,2"]^{ext}),x=pos$right-mar$horizontal+mar$dist,y=pos$centre,col=blue) graphics::text(labels=expression(w["j,2"]^{int}),x=pos$right+mar$horizontal-mar$dist,y=pos$centre,col=red) graphics::text(labels=expression(w["p+j,2"]^{int}),x=pos$right+mar$horizontal+mar$dist,y=pos$centre,col=red) graphics::arrows(x0=c(pos$left,pos$right),y0=pos$centre-mar$vertical,y1=pos$bottom+mar$vertical,col=c(blue,red),length=0.1,lwd=2) graphics::text(labels=expression(hat(beta)["j,1"]^{final}==hat(gamma)["j,1"]-hat(gamma)["p+j,1"]),x=pos$left,y=pos$bottom,col=blue) graphics::text(labels=expression(hat(beta)["j,2"]^{final}==hat(gamma)["j,2"]-hat(gamma)["p+j,2"]),x=pos$right,y=pos$bottom,col=red) graphics::text(x=-0.1,y=c(pos$top,pos$bottom),labels=paste("stage",1:2),font=2,cex=cex) grDevices::dev.off() ``` # Simulation This chunk performs the simulation. - Requires: execution of chunk `setup` - Execution time: **0.5 hours** - Ensures: files `simulation_transfer.RData` (transfer learning), `simulation_multiple.RData` (multi-task learning) and `info_sim.txt` (session information) in folder `results` ```{r simulation,eval=sim.app} #<> repetitions <- 10 for(mode in c("transfer","multiple")){ grid <- expand.grid(prob.separate=c(0.0,0.025,0.05),prob.common=c(0.0,0.025,0.05),family="gaussian") grid <- grid[rep(seq_len(nrow(grid)),each=repetitions),] # grid$seed <- seq_len(nrow(grid)) grid$family <- as.character(grid$family) deviance <- auc <- time <- mse.coef <- mse.zero <- mse.nzero <- sel.num <- sel.coef <- sel.count <- hyperpar <- list() for(i in seq(from=1,to=nrow(grid))){ set.seed(seed=grid$seed[i]) cat("i=",i,"\n") if(mode=="transfer"){ data <- sim_data_trans(prob.common=grid$prob.common[i],prob.separate=grid$prob.separate[i],family=grid$family[i]) method <- c("wrap_separate","wrap_glmtrans","sparselink","wrap_xrnet") } else if(mode=="multiple"){ #--- multi-task learning --- data <- sim_data_multi(prob.common=grid$prob.common[i],prob.separate=grid$prob.separate[i],family=grid$family[i]) method <- c("wrap_separate","wrap_mgaussian","sparselink","wrap_spls") } result <- traintest(y_train=data$y_train,X_train=data$X_train,y_test=data$y_test,X_test=data$X_test,family=grid$family[i],method=method) hyperpar[[i]] <- result$hyperpar time[[i]] <- result$time auc[[i]] <- result$auc deviance[[i]] <- result$deviance sel.num[[i]] <- t(sapply(result$coef,function(x) colSums(x!=0))) sel.count[[i]] <- t(sapply(result$coef,function(x) rowMeans(count_matrix(truth=sign(data$beta),estim=sign(x))))) # Add na.rm=TRUE? sel.coef[[i]] <- t(sapply(result$coef,function(x) colMeans(sign(x)!=sign(data$beta)))) # CONTINUE HERE: consider sparsity, true positives, false negatives, signs mse.coef[[i]] <- t(sapply(result$coef,function(x) colMeans((data$beta-x)^2))) mse.zero[[i]] <- t(sapply(result$coef,function(x) colMeans(((data$beta==0)*(data$beta-x))^2))) mse.nzero[[i]] <- t(sapply(result$coef,function(x) colMeans(((data$beta!=0)*(data$beta-x))^2))) } save(grid,deviance,auc,sel.num,sel.count,sel.coef,mse.coef,mse.zero,mse.nzero,time,file=file.path(path,"results",paste0("simulation_",mode,".RData"))) } writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), sessioninfo::session_info()),con=paste0(path,"/results/info_sim.txt")) ``` The following chunk generate the figures for the simulation study. - Requires: execution of chunk `setup`, files `simulation_transfer.RData` and `simulation_multiple.RData` in folder `results` (generated by chunk `simulation`) - execution time: $1$ second - Ensures: files `fig_sim_multiple.eps` and `fig_sim_transfer.eps` in folder `manuscript` ```{r simulation_figure,eval=fig.tab} #<> caption <- paste(c("\\textbf{Multi-task learning.}","\\textbf{Transfer learning.}"),"Comparison of different measures (rows) between an available method (red) and the proposed method (blue) in different simulation settings (columns), based on the average of three problems",c("(tasks)","(datasets)"),"for each repetition out of ten. Measures: performance metric (mean squared error on hold-out data, as a fraction of the one from standard lasso regression; a point below the dashed line means that",c("multi-task","transfer"),"learning improves predictions), sparsity (number of non-zero coefficients), precision (number of coefficients with correct signs divided by number of non-zero coefficients). The arrows point in the direction of improvement. Settings: percentage of features with a common effect for all problems ($\\pi_\\theta$), percentage of features with a specific effect for each problem ($\\pi_\\delta$).",c("\\label{fig_sim_multiple}","\\label{fig_sim_transfer}")) figure_change <- function(model0,model1="sparselink",model2){ mode <- paste0(100*grid$prob.common,"%\n",100*grid$prob.separate,"%") graphics::par(mfrow=c(3,1),mar=c(3,3,1,1)) label <- function(){ cex <- 0.5 at <- 0.3 graphics::mtext(text=expression(pi[theta]==phantom(.)),side=1,line=0.2,at=at,cex=cex) graphics::mtext(text=expression(pi[delta]==phantom(.)),side=1,line=1.2,at=at,cex=cex) } #--- predictive performance --- means <- t(sapply(X=deviance,FUN=rowMeans)) means <- means/means[,"wrap_separate"] plot_change(x=mode,y0=means[,model0],y1=means[,model1],y2=means[,model2],main="metric",increase=FALSE) graphics::abline(h=1,lty=2,col="grey") label() #--- sparsity --- nzero <- sapply(X=sel.num,FUN=rowMeans) plot_change(x=mode,y0=nzero[model0,],y1=nzero[model1,],y2=nzero[model2,],main="sparsity",increase=FALSE) graphics::abline(h=0,lty=2,col="grey") label() #--- precision --- precision <- sapply(X=sel.count,FUN=function(x) x[,"precision"]) precision[is.na(precision)] <- 0 plot_change(x=mode,y0=precision[model0,],y1=precision[model1,],y2=precision[model2,],main="precision",increase=TRUE) graphics::abline(h=0,lty=2,col="grey") label() } grDevices::postscript(file=file.path(path,"manuscript","fig_sim_multiple.eps"),width=6.5,height=6,horizontal=FALSE,onefile=FALSE,paper="special") load(file.path(path,paste0("results/simulation_multiple.RData")),verbose=TRUE) #model.ref <- "wrap_mgaussian" #model.own <- "sparselink" figure_change(model0="wrap_mgaussian",model1="sparselink",model2="wrap_spls") rowMeans(sapply(deviance,function(x) rank(rowMeans(x)))) rowMeans(sapply(deviance,function(x) colMeans(t(x)/x["wrap_separate",]))) runtime <- rowSums(sapply(time,function(x) x)) round(runtime/runtime["wrap_separate"],digits=2) grDevices::dev.off() grDevices::postscript(file=file.path(path,"manuscript","fig_sim_transfer.eps"),width=6.5,height=6,horizontal=FALSE,onefile=FALSE,paper="special") load(file.path(path,paste0("results/simulation_transfer.RData"))) #model.ref <- "wrap_glmtrans" #model.own <- "sparselink" figure_change(model0="wrap_glmtrans",model1="sparselink",model2="wrap_xrnet") rowMeans(sapply(deviance,function(x) rank(rowMeans(x)))) rowMeans(sapply(deviance,function(x) colMeans(t(x)/x["wrap_separate",]))) runtime <- rowSums(sapply(time,function(x) x)) round(runtime/runtime["wrap_separate"],digits=2) grDevices::dev.off() ``` ## Sample size and sparsity (revision) ```{r sim_extra,eval=sim.app} # Effect of sample size in source or target dataset (TL), effect of sample size (MTL). #<> repetitions <- 50 grid <- metric <- list() for(mode in c("MTL-size","TL-source","TL-target")){ #,"TL-prop","MTL-prop" metric[[mode]] <- list() cand <- c(20,40,60,80,100) if(mode=="MTL-size"){ grid[[mode]] <- expand.grid(prob.common=0.05,prob.separate=0.025,family="gaussian",n0=cand) } else if(mode=="TL-source"){ grid[[mode]] <- expand.grid(prob.common=0.05,prob.separate=0.025,family="gaussian",n_source=cand,n_target=50) } else if(mode=="TL-target"){ grid[[mode]] <- expand.grid(prob.common=0.05,prob.separate=0.025,family="gaussian",n_source=50,n_target=cand) } else if(mode %in% c("TL-prop","MTL-prop")){ cand <- c(0.025,0.05,0.10,0.15,0.20) grid[[mode]] <- expand.grid(prob.common=cand,prob.separate=NA,family="gaussian",n_source=50,n_target=50,n0=50) } else { stop("Wrong mode.") } grid[[mode]] <- grid[[mode]][rep(seq_len(nrow(grid[[mode]])),each=repetitions),] #grid[[mode]]$seed <- seq_len(nrow(grid[[mode]])) grid[[mode]]$seed <- rep(x=seq_len(repetitions),times=length(cand)) grid[[mode]]$family <- as.character(grid[[mode]]$family) cond <- is.na(grid[[mode]]$prob.separate) grid[[mode]]$prob.separate[cond] <- 0.5*grid[[mode]]$prob.common[cond] for(i in seq(from=1,to=nrow(grid[[mode]]))){ set.seed(seed=grid$seed[i]) cat("i=",i,"\n") if(mode %in% c("TL-source","TL-target","TL-prop")){ n0 <- rep(c(grid[[mode]]$n_source[i],grid[[mode]]$n_target[i]),times=c(2,1)) data <- sim_data_trans(prob.common=grid[[mode]]$prob.common[i],prob.separate=grid[[mode]]$prob.separate[i],family=grid[[mode]]$family[i],n0=n0) method <- c("wrap_separate","wrap_glmtrans","sparselink","wrap_xrnet") } else if(mode %in% c("MTL-size","MTL-prop")){ data <- sim_data_multi(prob.common=grid[[mode]]$prob.common[i],prob.separate=grid[[mode]]$prob.separate[i],family=grid[[mode]]$family[i],n0=grid[[mode]]$n0[i]) method <- c("wrap_separate","wrap_mgaussian","sparselink","wrap_spls") } else { stop("Wrong mode.") } result <- traintest(y_train=data$y_train,X_train=data$X_train,y_test=data$y_test,X_test=data$X_test,family=grid[[mode]]$family[i],method=method) metric[[mode]][[i]] <- result$deviance } } save(grid,metric,file=file.path(path,"results","simulation_devel.RData")) writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), sessioninfo::session_info()),con=paste0(path,"/results/info_sim_extra.txt")) ``` ```{r sim_extra_figure,eval=fig.tab} #<> load(file.path(path,"results","simulation_devel.RData")) grDevices::postscript(file=file.path(path,"manuscript","fig_sim_extra.eps"),width=6.5,height=3,horizontal=FALSE,onefile=FALSE,paper="special") cex <- 0.8 graphics::par(mfrow=c(1,3),mar=c(4.5,4.5,1.5,1),oma=c(0,0,0,0)) #graphics::layout(mat=matrix(data=c(1,1,2,2,3,3,0,4,4,0,5,5),ncol=2)) for(mode in c("MTL-size","TL-source","TL-target")){ #for(mode in c("MTL-prop","TL-prop")){ if(mode %in% c("MTL-size","MTL-prop","TL-prop")){ mse <- sapply(metric[[mode]],function(x) rowMeans(x)) } else if(mode %in% c("TL-source","TL-target")){ mse <- sapply(metric[[mode]],function(x) x[,3]) } if(mode %in% c("TL-source","TL-target","TL-prop")){ col <- c("wrap_separate"="black","wrap_glmtrans"="red","wrap_xrnet"="orange","sparselink"="blue") lty <- c("wrap_separate"=3,"wrap_glmtrans"=2,"wrap_xrnet"=2,"sparselink"=1) } else if(mode %in% c("MTL-size","MTL-prop")) { col <- c("wrap_separate"="black","wrap_mgaussian"="red","wrap_spls"="orange","sparselink"="blue") lty <- c("wrap_separate"=3,"wrap_mgaussian"=2,"wrap_spls"=2,"sparselink"=1) } if(mode=="TL-source"){ params <- grid[[mode]]$n_source } else if(mode=="TL-target"){ params <- grid[[mode]]$n_target } else if(mode=="MTL-size"){ params <- grid[[mode]]$n0 } else if(mode %in% c("TL-prop","MTL-prop")){ params <- grid[[mode]]$prob.common } unique <- unique(params) graphics::plot.new() graphics::plot.window(xlim=range(params),ylim=range(log(mse))) graphics::box() if(mode=="MTL-size"){ main <- "MTL - varying sample size" xlab <- bquote("sample size ("~n[1]~"="~n[2]~"="~n[3]~")") legend <- "" } else if(mode=="TL-source"){ main <- "TL - varying source sample size" xlab <- bquote("source sample size ("~n[1]~"="~n[2]~")") legend <- bquote("target sample size:"~n[3]==.(unique(grid[[mode]]$n_target))) } else if(mode=="TL-target"){ main <- "TL - varying target sample size" xlab <- bquote("target sample size ("~n[3]~")") legend <- bquote("source sample size:"~n[1]~"="~n[2]==.(unique(grid[[mode]]$n_source))) } else if(mode=="MTL-prop"){ xlab <- "blabla" main <- "MTL - effect proportion" legend <- "" } else if(mode=="TL-prop"){ xlab <- "blabla" main <- "TL - effect proportion" legend <- "" } graphics::title(main=main,cex.main=cex) graphics::title(ylab="log MSE",line=2.5,xlab=xlab,cex.lab=cex) graphics::legend(x="topleft",legend=legend,bty="n",cex=cex) if(mode %in% c("TL-prop","MTL-prop")){ graphics::axis(side=1,at=unique,labels=paste0(100*unique,"%"),cex.axis=cex) } else { graphics::axis(side=1,at=unique,cex.axis=cex) } graphics::axis(side=2,cex.axis=cex) for(i in names(col)){ val <- tapply(X=mse[i,],INDEX=params,FUN=function(x) mean(x)) graphics::lines(x=unique,y=log(val),col=col[i],type="o",pch=16,lty=lty[i]) } } grDevices::dev.off() ``` # Application ## Data preparation This chunk defines the references and the project identifiers for the application. - Requires: nothing - Execution time: $1$ second - Ensures: list `project` in working environment ```{r define_projects,eval=fig.tab} project <- list() project$IBD <- c("Tew (2016)"="SRP063496", "Haberman (2019)"="SRP129004", "Verstockt (2019)"="ERP113396", "Verstockt (2020)"="ERP114636", "Boyd (2018)"="SRP100787") project$RA <- c("Baker (2019)"="SRP169062", "Moncrieffe (2017)"="SRP074736", "Goldberg (2018)"="SRP155483") extra <- c("Lewis (2019)"="ERP104864") # https://doi.org/10.1016/j.celrep.2019.07.091 ``` This chunk downloads the data for the application. - Requires: execution of chunks `setup` and `define_projects` - Execution time: depends on internet speed and cached files - Ensures: files `recount3_data.RData` (data sets) and `info_data.txt` (system information) in folder `results` ```{r download_data,eval=sim.app} #<> #<> data <- list() for(i in c(unlist(project),extra)){ data[[i]] <- recount3::create_rse_manual( project=i, project_home="data_sources/sra", organism="human", annotation = "gencode_v26", type="gene") } save(data,file=file.path(path,"results/recount3_data.RData")) writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), sessioninfo::session_info()),con=paste0(path,"/results/info_data.txt")) ``` This chunk preprocesses the data. - Requires: execution of chunks `setup` and `define_projects`, file `recount3_data.RData` (generated by chunk `download_data`) - Execution time: $5$ seconds - Ensures: lists `y` (targets) and `x` (features) in working environment ```{r preprocess_data,eval=sim.app} #<> #<> load(file.path(path,"results/recount3_data.RData")) #- - - - - - - - - - - - - - - #- - - extract features - - - #- - - - - - - - - - - - - - - # extract features x <- list() for(i in c(unlist(project),extra)){ counts <- t(SummarizedExperiment::assays(data[[i]])$raw_counts) colnames(counts) <- SummarizedExperiment::rowRanges(data[[i]])$gene_name x[[i]] <- counts } # select most expressed protein-coding genes (for all TL projects together) select <- list() total <- numeric() for(i in unlist(project)){ #total <- rbind(total,Matrix::colSums(x[[i]])) # original: mean filtering total <- rbind(total,apply(X=x[[i]],MARGIN=2,FUN=stats::var)) # trial: variance filtering } type <- SummarizedExperiment::rowData(data[[i]])$gene_type cond <- type=="protein_coding" total[,!cond] <- 0 rank <- apply(X=total,MARGIN=1,FUN=rank) mean_rank <- rowMeans(rank) #temp <- cond & apply(total,2,function(x) all(x>0)) & (mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[2000]) # original: top 2000 temp <- cond & mean_rank >= sort(mean_rank[cond],decreasing=TRUE)[5000] # trial: top 5000 for(i in unlist(project)){ select[[i]] <- temp } # select most expressed protein-coding genes (for MTL project) #mean <- apply(X=x[[extra]],MARGIN=2,FUN=mean) # original var <- apply(X=x[[extra]],MARGIN=2,FUN=var) # trial #warning("change number in next line") #temp <- cond & mean >= sort(mean[cond],decreasing=TRUE)[5000] # trial: top 5000 temp <- cond & var >= sort(var[cond],decreasing=TRUE)[5000] # trial: top 5000 select[[extra]] <- temp # pre-processing for(i in c(unlist(project),extra)){ lib.size <- Matrix::rowSums(x[[i]]) x[[i]] <- x[[i]][,select[[i]],drop=FALSE] norm.factors <- edgeR::calcNormFactors(object=t(x[[i]]),lib.size=lib.size) gamma <- norm.factors*lib.size/mean(lib.size) gamma <- matrix(data=gamma,nrow=nrow(x[[i]]),ncol=ncol(x[[i]])) x[[i]] <- x[[i]]/gamma x[[i]] <- 2*sqrt(x[[i]] + 3/8) # Anscombe transform x[[i]] <- scale(x[[i]]) # scale because of different datasets!? } #- - - - - - - - - - - - - - #- - - extract targets - - - #- - - - - - - - - - - - - - # extract information on samples frame <- list() for(i in c(unlist(project),extra)){ list <- strsplit(data[[i]]$sra.sample_attributes,split="\\|") data[[i]]$sra.experiment_attributes # What about sra.experiment_attributes? n <- length(list) cols <- unique(sapply(strsplit(unlist(list),split=";;"),function(x) x[1])) ncol <- length(cols) frame[[i]] <- matrix(data=NA,nrow=n,ncol=ncol,dimnames=list(rownames(x[[i]]),cols)) for(j in seq_len(n)){ for(k in seq_len(ncol)){ vector <- list[[j]] which <- which(substring(text=vector,first=1,last=nchar(cols[k]))==cols[k]) string <- vector[which] if(length(string)==0){next} frame[[i]][j,k] <- strsplit(string,split=";;")[[1]][2] } } frame[[i]] <- as.data.frame(frame[[i]]) } # extract binary outcome y <- z <- list() for(i in unlist(project)){ # CONTINUE HERE!!! if(i=="ERP113396"){ y[[i]] <- sapply(X=frame[[i]]$`clinical history`,FUN=function(x) switch(EXPR=x,"responder"=1,"non-responder"=0,stop("invalid"))) } else if(i=="ERP114636"){ y[[i]] <- sapply(X=frame[[i]]$`clinical information`,FUN=function(x) switch(EXPR=x,"response to vedolizumab therapy"=1-1,"no response to vedolizumab therapy"=0+1,stop("invalid"))) warning("Inverting response and non-response!") } else if(i=="SRP100787"){ y[[i]] <- sapply(X=frame[[i]]$condition,FUN=function(x) switch(EXPR=x,"CD inactive"=1,"UC inactive"=1,"CD active"=0,"UC active"=0,control=NA,"NA"=NA,stop("invalid"))) } else if(i=="SRP129004"){ y[[i]] <- sapply(X=frame[[i]]$`week 4 remission`,FUN=function(x) switch(EXP=x,"Yes"=1,"No"=0,"NA"=NA,stop("invalid"))) suppressWarnings(z[[i]] <- data.frame(pucai=as.numeric(frame[[i]]$pucai),mayo=as.numeric(frame[[i]]$`total mayo score`),histology=as.numeric(frame[[i]]$`histology severity score`))) } else if(i=="SRP063496"){ y[[i]] <- sapply(X=frame[[i]]$`remission at week 10`,FUN=function(x) switch(x, "Remitter"=1,"Non-remitter"=0,"N/A"=NA,stop("invalid"))) } else if(i=="SRP169062"){ y[[i]] <- sapply(X=frame[[i]]$`flare event`,FUN=function(x) switch(x,"no flare"=1,"flare"=0,stop("invalid"))) } else if(i=="SRP155483"){ y[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=1,"Low"=0,"Moderate"=0,"High"=0,"--"=NA,stop("invalid"))) z[[i]] <- sapply(X=frame[[i]]$`disease activity`,FUN=function(x) switch(x,"remission"=0,"Low"=1,"Moderate"=2,"High"=3,"--"=NA,stop("invalid"))) } else if(i=="SRP074736"){ y[[i]] <- sapply(X=frame[[i]]$`mtx response status`,FUN=function(x) switch(x,"responder"=1,"non-responder"=0,"control"=NA,stop("invalid"))) } } # overlap for(j in unlist(project)){ is.na <- is.na(y[[j]]) if(length(is.na)!=nrow(x[[j]])){stop()} y[[j]] <- y[[j]][!is.na] if(!is.null(z[[j]])){ if(is.vector(z[[j]])){ z[[j]] <- z[[j]][!is.na] } else { z[[j]] <- z[[j]][!is.na,] } } x[[j]] <- x[[j]][!is.na,] } ``` ## Data exploration This chunk performs the exploratory data analysis. - Requires: execution of chunks `setup`, `define_projects` and `preprocess_data` - Execution time: $0.5$ minutes - Ensures: files `explore_data.RData` (results) and `info_explore.txt` (session information) in folder `results` ```{r explore_apply,eval=sim.app} #<> #<> #<> set.seed(1) alpha.holdout <- 0 alpha.crossval <- 1 family <- "binomial" nfolds <- 10 codes <- unlist(project) coef <- matrix(data=NA,nrow=ncol(x[[1]]),ncol=length(codes),dimnames=list(NULL,codes)) auc <- auc.pvalue <- matrix(data=NA,nrow=length(codes),ncol=length(codes),dimnames=list(codes,codes)) foldid <- make_folds_trans(y=y,family="binomial",nfolds=nfolds) ridge <- lasso <- list() for(i in seq_along(codes)){ ridge[[i]] <- glmnet::cv.glmnet(x=x[[codes[i]]],y=y[[codes[i]]],family=family,alpha=alpha.holdout,foldid=foldid[[i]]) coef[,i] <- stats::coef(ridge[[i]],s="lambda.min")[-1] for(j in seq_along(codes)){ if(i==j){ y_hat <- rep(x=NA,times=length(y[[i]])) for(k in seq_len(nfolds)){ holdout <- foldid[[i]]==k temp <- glmnet::cv.glmnet(x=x[[codes[i]]][!holdout,],y=y[[codes[i]]][!holdout],family=family,alpha=alpha.crossval) y_hat[holdout] <- predict(object=temp,newx=x[[codes[i]]][holdout,],s="lambda.min",type="response") } } else { y_hat <- as.numeric(predict(object=ridge[[i]],newx=x[[j]],s="lambda.min",type="response")) } auc[i,j] <- pROC::auc(response=y[[codes[j]]],predictor=y_hat,direction="<",levels=c(0,1)) auc.pvalue[i,j] <- stats::wilcox.test(rank(y_hat)~y[[codes[[j]]]],alternative="less",exact=FALSE)$p.value } } save(coef,auc,auc.pvalue,codes,file=file.path(path,"results","explore_data.RData")) writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), sessioninfo::session_info()),con=paste0(path,"/results/info_explore.txt")) ``` This chunk generates the tables for the exploratory data analysis. - Requires: execution of chunk `setup`, file `explore_data.RData` in folder `results` (generated by chunk `explore_apply`) - execution time: $1$ second - Ensures: files `tab_cor.tex` and `tab_auc.tex` in folder `manuscript` ```{r explore_tables,eval=fig.tab,results="asis"} #<> #if(any(unlist(project)!=names(refs))){stop("not compatible")} load(file.path(path,"results/explore_data.RData")) names <- gsub(pattern="IBD.|RA.",replacement="",x=names(unlist(project))) codes <- colnames(coef) cor.pvalue <- matrix(data=NA,nrow=length(codes),ncol=length(codes),dimnames=list(codes,codes)) for(i in seq_along(codes)){ for(j in seq_along(codes)){ cor.pvalue[i,j] <- stats::cor.test(x=coef[,i],y=coef[,j],method="spearman",exact=FALSE)$p.value } } diag(cor.pvalue) <- NA insert.space <- function(table,cut){ index.left <- index.top <- seq_len(cut) index.right <- index.bottom <- seq(from=cut+1,to=ncol(table)) top <- cbind(table[index.top,index.left],"",table[index.top,index.right]) bottom <- cbind(table[index.bottom,index.left],"",table[index.bottom,index.right]) out <- rbind(top,"",bottom) colnames(out)[colnames(out)==""] <- " " return(out) } table <- stats::cor(coef,method="spearman") rownames(table) <- colnames(table) <- names black <- (!is.na(cor.pvalue)) & (cor.pvalue<=0.05) star <- (!is.na(cor.pvalue)) & (cor.pvalue<=0.05/choose(n=length(codes),k=2)) nonnegative <- table>=0 table <- format(round(table,digits=2),digits=2,trim=TRUE) table[nonnegative] <- paste0("\\phantom{-}",table[nonnegative]) table[!black] <- paste0("\\textcolor{gray}{",table[!black],"}") table[star] <- paste0(table[star],"$^\\star$") table[!star] <- paste0(table[!star],"\\phantom{$^\\star$}") #table[nonnegative] <- paste0("-",table[nonnegative]) diag(table) <- "-" table <- insert.space(table=table,cut=5) xtable <- xtable::xtable(x=table,align="rccccccccc",caption="Spearman correlation coefficients between the ridge regression coefficients from different datasets. Pairwise combinations of datasets with significantly correlated regression coefficients are highlighted, with black colour for nominal significance ($p$-value $\\leq 0.05$) and stars for adjusted significance ($p$-value $\\leq 0.05/28$). We expect a correlation coefficient close to $0$ for unrelated problems and close to $1$ for identical problems.",label="tab_cor") xtable::print.xtable(x=xtable,sanitize.text.function=identity,rotate.colnames=TRUE,caption.placement="top",hline.after=c(0,nrow(table)),comment=FALSE,file=file.path(path,"manuscript","tab_cor.tex"),floating.environment="table*") #add.to.row=list(pos=list(5),command="\\hdashline \n") table <- auc rownames(table) <- colnames(table) <- names table <- format(round(table,digits=2),digits=2) black <- auc.pvalue<=0.05 star <- auc.pvalue<=0.05/(length(codes)*length(codes)) diag(table) <- paste0("(",diag(table),")") table[!black] <- paste0("\\textcolor{gray}{",table[!black],"}") table[star] <- paste0(table[star],"$^\\star$") table[!star] <- paste0(table[!star],"\\phantom{$^\\star$}") table <- insert.space(table=table,cut=5) xtable <- xtable::xtable(x=table,align="rccccccccc",caption="Out-of-sample area under the receiver operating characteristic curve (\\textsc{roc-auc}) from logistic ridge regression trained on the dataset in the row and tested on the dataset in the column (off-diagonal entries), or cross-validated \\textsc{roc-auc} from logistic lasso regression trained and tested on the same dataset by $10$-fold external cross-validation (diagonal entries, between brackets). The \\textsc{roc-auc} of a random classifier is $0.5$, while that of a perfect classifier is $1.0$. Entries on and off the diagonal are not comparable. Predictions that are significantly better than random predictions (according to the one-sided Mann-Whitney $U$ test for testing whether the ranks of the predicted probabilities are significantly higher for the cases than for the controls) are highlighted, with black colour for nominal significance ($p$-value $\\leq 0.05$) and stars for adjusted significance ($p$-value $\\leq 0.05/64$).",label="tab_auc") xtable::print.xtable(x=xtable,sanitize.text.function=identity,rotate.colnames=TRUE,caption.placement="top",hline.after=c(0,nrow(table)),comment=FALSE,file=file.path(path,"manuscript","tab_auc.tex"),floating.environment="table*") ``` ## Transfer learning This chunk performs the transfer learning analysis. - Requires: execution of chunks `setup` and `define_projects`, file `recount3_data.RData` in folder `results` (generated by chunk `download_data`), execution of chunk `preprocess_data` - Execution time: **1.5 hours** - Ensures: `application.RData` (results) and `info_app.txt` (session information) in folder `results` ```{r transfer_apply,eval=sim.app} #<> #<> #<> result <- list() for(i in names(project)){ cat("project:",i,"\n") result[[i]] <- list() for(j in seq_len(5)){ # 5 repetitions of 10-fold CV set.seed(j) codes <- project[[i]] result[[i]][[j]] <- cv_transfer(y=y[codes],X=x[codes],family="binomial",method=c("wrap_separate","wrap_glmtrans","sparselink","wrap_xrnet"),alpha.init=ifelse(i=="RA",0,0.95)) # lasso-like elastic net for IBD, ridge for RA (weak signal) } } save(result,project,file=file.path(path,"results","application.RData")) writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), sessioninfo::session_info()),con=paste0(path,"/results/info_app.txt")) # debugging if(FALSE){ set.seed(1) codes <- project[["RA"]] test <- wrap_xrnet(x=x[codes],y=y[codes],alpha.init=0.95,alpha=1) } ``` This chunk generates the figure on the predictive performance. - Requires: execution of chunk `setup`, file `application.RData` in folder `results` (generated by chunk `transfer_apply`) - Execution time: $1$ second - Ensures: file `fig_app.eps` in folder `manuscript` ```{r transfer_figure,eval=fig.tab} #<> grDevices::postscript(file=file.path(path,"manuscript","fig_app.eps"),width=6.5,height=4,horizontal=FALSE,onefile=FALSE,paper="special") graphics::par(mfrow=c(2,1),mar=c(4,2,1,1),oma=c(0,0,0,0)) load(file.path(path,paste0("results/application.RData")),verbose=TRUE) model0 <- "wrap_glmtrans" model1 <- "sparselink" model2 <- "wrap_xrnet" # predictivity metric <- lapply(result,function(x) do.call(what="rbind",args=lapply(x,function(x) x$auc))) # DEV and AUC need different directions (increase=FALSE/TRUE)! metric <- do.call(what="rbind",args=metric) metric <- metric/metric[,"wrap_separate"] #xlab <- refs[rownames(metric)] #names <- gsub(pattern="",replacement="\n",x=unlist(project)) label <- gsub(pattern="IBD.|RA.",replacement="",x=gsub(pattern=" ",replacement="\n",x=names(unlist(project)))) index <- match(x=rownames(metric),table=unlist(project)) xlab <- label[index] plot_change(x=xlab,y0=metric[,model0],y1=metric[,model1],y2=metric[,model2],main="metric",increase=TRUE,cex.main=0.8) graphics::axis(side=1,at=length(project$IBD)+0.5,labels="|",tick=FALSE,line=-0.25,font=2) graphics::abline(h=0.5,lty=2,col="grey") graphics::abline(h=1,lty=2,col="grey") # sparsity nzero <- lapply(result,function(x) lapply(x,function(x) sapply(x$refit$coef,function(x) colSums(x!=0)))) nzero <- do.call(what="rbind",args=do.call(what="c",args=nzero)) plot_change(x=xlab,y0=nzero[,model0],y1=nzero[,model1],y2=nzero[,model2],main="sparsity",increase=FALSE,cex.main=0.8) graphics::axis(side=1,at=length(project$IBD)+0.5,labels="|",tick=FALSE,line=-0.25,font=2) graphics::abline(h=0,lty=2,col="grey") grDevices::dev.off() # percentage change # (reported in section 4 "application" subsection 4.3 "transfer learning") disease <- ifelse(rownames(metric) %in% project$IBD,"IBD",ifelse(rownames(metric) %in% project$RA,"RA",NA)) #round(100*colMeans(metric)-100,digits=2) round(100*colMeans(metric[disease=="IBD",])-100,digits=2) round(100*colMeans(metric[disease=="RA",])-100,digits=2) colMeans(nzero) colMeans(nzero[disease=="IBD",]) colMeans(nzero[disease=="RA",]) #--- revision: report AUC --- Reduce(f="+",x=lapply(result$IBD,function(x) x$auc))/length(result$IBD) lapply(result,function(x) round(colMeans(Reduce(f="+",x=lapply(x,function(x) x$auc))/length(result$IBD)),digits=2)) ``` This chunk generates the figure on feature selection. - Requires: execution of chunk `setup`, file `application.RData` in folder `results` (generated by chunk `transfer_apply`) - Execution time: $1$ second - Ensures: file `fig_coef.eps` in folder `manuscript` ```{r interpret_models,eval=fig.tab} #<> load(file.path(path,"results","application.RData")) coefs <- list() for(i in seq_along(result$IBD)){ coefs[[i]] <- result$IBD[[i]]$refit$coef$sparselink colnames(coefs[[i]]) <- names(project$IBD) rownames(coefs[[i]]) <- rownames(result$IBD[[1]]$refit$coef$wrap_glmtrans) # try to avoid this } any <- rowSums(sapply(coefs,function(x) apply(x,1,function(x) any(x!=0))))!=0 for(i in seq_along(result$IBD)){ coefs[[i]] <- coefs[[i]][any,] } table <- Reduce(f="+",x=coefs)/5 cex <- 0.7 grDevices::postscript(file=file.path(path,"manuscript","fig_coef.eps"),width=7,height=4,horizontal=FALSE,onefile=FALSE,paper="special") graphics::par(mfrow=c(1,1),mar=c(2.5,4.5,0.5,1.5),oma=c(0,0,0,0)) graphics::plot.new() graphics::plot.window(xlim=c(0.6,ncol(table)+0.4),ylim=c(0.5,nrow(table)+0.5)) col <- apply(table,1,function(x) ifelse(all(x<=0),"blue",ifelse(all(x>=0),"red","black"))) colnames <- gsub(x=colnames(table),pattern=" ",replacement="\n") graphics::mtext(text=colnames,side=1,at=seq_len(ncol(table)),cex=cex,line=1) rownames <- rownames(table) graphics::mtext(text=rownames,side=2,at=seq_len(nrow(table)),las=2,cex=cex,line=0.7,col=col) star <- rowSums(table!=0)>1 graphics::mtext(text="*",side=2,at=which(star),line=-0.3) graphics::mtext(text=ifelse(col=="blue","-",ifelse(col=="red","+",".")),side=4,at=seq_len(nrow(table)),las=2,cex=cex,line=0.5,col=col) for(i in seq_len(nrow(table))){ for(j in seq_len(ncol(table))){ for(k in 1:5){ col <- ifelse(coefs[[k]][i,j]<0,"blue",ifelse(coefs[[k]][i,j]>0,"red","white")) cex <- pmax(sqrt(5*abs(coefs[[k]][i,j])),0.2) graphics::points(x=j-(-3+k)*0.17,y=i,col=col,cex=cex,pch=16) } } } graphics::abline(v=seq(from=0.5,to=5.5,by=1)) grDevices::dev.off() ``` This chunk saves the session information for generating figures and tables. ```{r session_info,echo=FALSE,eval=fig.tab} writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), sessioninfo::session_info()),con=paste0(path,"/results/info_figtab.txt")) ```