## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----echo=FALSE, fig.cap="Fig 1. BioTIP workflow with five key analytical steps. RTF: relative transcript fluctuation; PCC: Pearson correlation coefficient; DNB: Dynamic Network Biomarker, of which the score was called as Module-Criticality Index (MCI) in the R package BioTIP; Ic: index of critical transition.; Ic*: our redefined Ic, which is also referred to as the BioTIP score", fig.align='center', out.width = '60%'---- knitr::include_graphics("FigS1_BioTIP_pipeline_detailed_v7.jpg") ## ----echo=FALSE, fig.cap="Table 1. The key R functions and customaized parameters for the five analytical steps.", fig.align='center', out.width = '60%'---- knitr::include_graphics("Fig_Table1_xy.jpg") ## ----warning=FALSE, message=FALSE--------------------------------------------- # load BioTIP and dependent packages library(BioTIP) library(cluster) library(GenomicRanges) library(MASS) require(stringr) require(psych) require(igraph) ## ----------------------------------------------------------------------------- data(GSE6136_matrix) dim(GSE6136_matrix) row.names(GSE6136_matrix) = GSE6136_matrix$ID_REF # remove the downloaded first row after assigning it to be column-name of the final numeric data matrix GSE6136 = GSE6136_matrix[,-1] dim(GSE6136) ## ----------------------------------------------------------------------------- data(GSE6136_cli) ##dim(GSE6136_cli) optional: check dimension cli = t(GSE6136_cli) colnames(cli) = str_split_fixed(cli[1,],'_',2)[,2] cli = cli[-1,] cli = data.frame(cli) cli[,"cell-type:ch1"] = str_split_fixed(cli$characteristics_ch1.1,": ",2)[,2] cli[,"Ig clonality:ch1"] = str_split_fixed(cli$characteristics_ch1.3,": ",2)[,2] colnames(cli)[colnames(cli) == "cell-type:ch1"] = "group" cli$Row.names = cli[,1] head(cli[,1:3]) ## ----------------------------------------------------------------------------- dat <- GSE6136 df <- log2(dat+1) head(df[,1:6]) ## ----warning=FALSE------------------------------------------------------------ cli$group = factor(cli$group, levels = c('resting','activated','lymphoma_marginal','lymphoma_transitional','lymphoma_aggressive')) samplesL <- split(cli[,"geo_accession"],f = cli$group) lapply(samplesL, length) test <- sd_selection(df, samplesL,0.01) head(test[["activated"]]) ## ----warning=FALSE------------------------------------------------------------ igraphL <- getNetwork(test, fdr = 1) cluster <- getCluster_methods(igraphL) ## ----echo=TRUE, warning=FALSE------------------------------------------------- names(cluster) ## ----echo=TRUE, warning=FALSE------------------------------------------------- membersL_noweight <- getMCI(cluster,test) plotBar_MCI(membersL_noweight,ylim = c(0,6)) ## ----echo=TRUE, warning=FALSE------------------------------------------------- maxMCIms <- getMaxMCImember(membersL_noweight[[1]],membersL_noweight[[2]],min =10) names(maxMCIms) names(maxMCIms[[1]]) names(maxMCIms[[2]]) ## ----echo=TRUE, warning=FALSE------------------------------------------------- head(maxMCIms[['idx']]) head(maxMCIms[['members']][['lymphoma_aggressive']]) ## ----------------------------------------------------------------------------- biomodules = getMaxStats(membersL_noweight[['members']],maxMCIms[[1]]) maxMCI = getMaxStats(membersL_noweight[['MCI']],maxMCIms[[1]]) maxMCI = maxMCI[order(maxMCI,decreasing=TRUE)] head(maxMCI) topMCI = getTopMCI(membersL_noweight[[1]],membersL_noweight[[2]],membersL_noweight[['MCI']],min =10) head(topMCI) ## ----------------------------------------------------------------------------- maxSD = getMaxStats(membersL_noweight[['sd']],maxMCIms[[1]]) head(maxSD) ## ----------------------------------------------------------------------------- CTS = getCTS(topMCI, maxMCIms[[2]]) ## ----echo=TRUE, warning=FALSE------------------------------------------------- par(mar = c(10,5,0,2)) plotMaxMCI(maxMCIms,membersL_noweight[[2]],states = names(samplesL),las = 2) ## ----warning=FALSE, results=FALSE--------------------------------------------- simuMCI <- simulationMCI(3,samplesL,df, B=100) plot_MCI_Simulation(topMCI[1],simuMCI,las=2) ## ----------------------------------------------------------------------------- IC <- getIc(df,samplesL,CTS[[1]],PCC_sample.target = 'average') par(mar = c(10,5,0,2)) plotIc(IC,las = 2) ## ----warning=FALSE, results=FALSE--------------------------------------------- simuIC <- simulation_Ic(length(CTS[[1]]),samplesL,df,B=100) par(mar = c(10,5,0,2)) plot_Ic_Simulation(IC,simuIC,las = 2) ## ----------------------------------------------------------------------------- sample_Ic = simulation_Ic_sample(df, sampleNo=3, genes=CTS[[1]], plot=TRUE) ##simulated_Ic = plot_simulation_sample(df,length(samplesL[['lymphoma_aggressive']]),IC[['lym#phoma_aggressive']],CTS) ## ----echo=FALSE, fig.cap="Fig 2. BioTIP Single-Cell Expression Data-Analytical Pipeline.", fig.align='center', out.width = '60%'---- knitr::include_graphics("Fig1_scRNARNAseq_pipeline_2021_xy.jpg") ## ----echo=FALSE, out.height='60%', out.width='60%'---------------------------- knitr::include_graphics("subway_map.jpg") ## ----warning=FALSE, message=FALSE, results=FALSE------------------------------ library(BioTIP) ## ----warning=FALSE, message=FALSE--------------------------------------------- #Load stringr library library(stringr) #BioTIP dependent libraries library(cluster) library(psych) library(stringr) library(GenomicRanges) # library(Hmisc) library(MASS) library(igraph) # library(RCurl) ## ----------------------------------------------------------------------------- ## familiarize yourself with data cell_info = read.delim("cell_info.tsv", head = T, sep = '\t', row.names=1) dim(cell_info) ## in case R automatically reformated characters in cell labels, match cell labels in the data and in the infor table rownames(cell_info) = sub('LT-HSC', 'LT.HSC', rownames(cell_info)) ## focus on one branch, e.g., S4-S3-S0-S1 ## no filter because it was already 4k genes, 10% of the originally measured genes in the single cell experiment cell_info <- subset(cell_info, branch_id_alias %in% c("('S4', 'S3')", "('S3', 'S0')", "('S1', 'S0')" )) ## check cell sub-population sizes along the STREAM outputs (t = table(cell_info$label, cell_info$branch_id_alias)) ## focus on sub-populations of cells with more than 10 cells ( idx = apply(t, 2, function(x) names(which(x>10))) ) ## generate a list of samples (cells) of interest samplesL = sapply(names(idx), function(x) sapply(idx[[x]], function(y) rownames(subset(cell_info, branch_id_alias == x & label == y)))) ## check the number of sub-populations along the pseudotime trajectory (generated by STREAM for example) ## recommend to focus on each branch, respectively names(samplesL[[1]]) ## merge sub-populations at each pseudo-time 'state' samplesL = do.call(c, samplesL) lengths(samplesL) ## load normalized gene expression matrix, refer to the example below if not yet normalized ## e.g, here is a STREAM-published data matrix of normalized gene expression matrix ## when analyzing SingleCellExperiment object SCE, translate to matrix using the following code # counts <- logcounts(SCE) ## data matrix imported from our github repository due to large size githuburl = "https://github.com/ysun98/BioTIPBigData/blob/master/data_Nestorowa.tsv?raw=true" counts = read.table(url(githuburl), head=T, sep="\t", row.names=1) dim(counts) if (class(counts)=="data.frame") counts = as.matrix(counts) all(colnames(counts) %in% as.character(rownames(cell_info))) ## log2 transformation example if(max(counts)>20) counts = log2(counts) ## Check the overall distribution using histogram # hist (counts, 100) ## ----results=FALSE------------------------------------------------------------ ## First, estimate the random Ic-scores by permuting the expression values of genes RandomIc_g = list() set.seed(2020) C= 10 # for real data analysis C = at least 500 i = 1 n <- 200 # randomly pick up 200 genes RandomIc_g[[i]] <- simulation_Ic(n, samplesL, counts, B=C, fun="BioTIP") names(RandomIc_g)[i] = paste0("Simulation",i,"gene") medianIc = apply(RandomIc_g[[i]],1,median) par(mfrow = c(1,1)) plot_Ic_Simulation(medianIc, RandomIc_g[[i]], las = 2, ylab="BioTIP", main= paste("Simulation using",n,"transcripts"), fun="boxplot", ylim=c(0,0.3)) ## ----------------------------------------------------------------------------- ## setup parameters for gene preselection B=10 ##for demonstration purposes use 10, when running dataset we recommend at least 100 ##optimize and one nonoptimize for single cell use optimize with B 100 or higher for demo purpose only we run opt.cutoff = 0.2 opt.per = 0.8 ## Commented out because of calculation expense, recommend to save the calculated data file after one calculation for future use ## set.seed(100) ## subcounts = optimize.sd_selection(counts, samplesL, cutoff = opt.cutoff, percent=opt.per, B) ## save( subcounts, file='BioTIP_Output_xy/subcounts.optimize_80per.rData') data("subcounts.optimize_80per") ## ----warning=FALSE------------------------------------------------------------ ## use 0.01-0.2 to ensure gene-gene co-expression is significant ## increase fdr cutoff to obtain larger gene sets network.fdr = 0.1 min.size = 50 igraphL = getNetwork(subcounts, fdr=network.fdr) names(igraphL) ## Network partition using random walk cluster = getCluster_methods(igraphL) tmp = igraphL[["('S3', 'S0').CMP"]] E(tmp)$width <- E(tmp)$weight*3 V(tmp)$community= cluster[["('S3', 'S0').CMP"]]$membership mark.groups = table(cluster[["('S3', 'S0').CMP"]]$membership) colrs = rainbow(length(mark.groups), alpha = 0.3) V(tmp)$label <- NA plot(tmp, vertex.color=colrs[V(tmp)$community],vertex.size = 5, mark.groups=cluster[["('S3', 'S0').CMP"]]) ## ----warning=FALSE------------------------------------------------------------ fun = 'BioTIP' ## Commented out because of calculation expense, recommend to save the calculated data file after one calculation for future use ## membersL = getMCI(cluster, subcounts, adjust.size = F, fun) ## save(membersL, file="membersL.RData", compress=TRUE) data("membersL") par(mar=c(1,1,2,1)) plotBar_MCI(membersL, ylim=c(0,30), min=50) ## Decide how many states of interest, here is 4 n.state.candidate <- 4 topMCI = getTopMCI(membersL[["members"]], membersL[["MCI"]], membersL[["MCI"]], min=min.size, n=n.state.candidate) names(topMCI) ## Obtain state ID and MCI statistics for the n=3 leading MCI scores per state maxMCIms = getMaxMCImember(membersL[["members"]], membersL[["MCI"]], min =min.size, n=3) names(maxMCIms) ## list the maximum MCI score per state, for all states maxMCI = getMaxStats(membersL[['MCI']], maxMCIms[['idx']]) unlist(maxMCI) #### extract biomodule candidates in the following steps: #### ## record the gene members per toppest module for each of these states of interest CTS = getCTS(maxMCI[names(topMCI)], maxMCIms[["members"]][names(topMCI)]) ## tmp calculates the number of bars within each named state tmp = unlist(lapply(maxMCIms[['idx']][names(topMCI)], length)) ## here returns all the groups with exactly 2 bars (whoistop2nd = names(tmp[tmp==2])) ## here returns all the groups with exactly 3 bars (whoistop3rd = names(tmp[tmp==3])) ## add the gene members of the 2nd toppest biomodue in the states with exactly 2 bars if(length(whoistop2nd)>0) CTS = append(CTS, maxMCIms[["2topest.members"]][whoistop2nd]) names(CTS) ## add the gene members of the 2nd toppest biomodue in the states with exactly 3 bars if(length(whoistop3rd)>0) CTS = append(CTS, maxMCIms[["2topest.members"]][whoistop3rd]) names(CTS) ## add the gene members of the 3rd toppest biomodue in the states with exactly 3 bars if(length(whoistop3rd)>0) CTS = append(CTS, maxMCIms[["3topest.members"]][whoistop3rd]) names(CTS) ## ----results=FALSE------------------------------------------------------------ #### extract CTS scores for each biomodule candidate in the following steps: #### ## first to record the max MCI for the n.state.candidate maxMCI <- maxMCI[names(CTS)[1:n.state.candidate]] maxMCI ## then applendix the 2nd highest MCI score (if existing) for the states with exactly 2 bars if(length(whoistop2nd)>0) maxMCI <- c(maxMCI, getNextMaxStats(membersL[['MCI']], idL=maxMCIms[['idx']], whoistop2nd)) names(maxMCI) ## applendix the 2nd highest MCI score (if existing) for the states with exactly 3 bars if(length(whoistop3rd)>0) maxMCI <- c(maxMCI, getNextMaxStats(membersL[['MCI']], idL=maxMCIms[['idx']], whoistop3rd)) names(maxMCI) ## then applendix the 3rd highest MCI score (if existing) for the states with exactly 3 bars if(length(whoistop3rd)>0) maxMCI <- c(maxMCI, getNextMaxStats(membersL[['MCI']], idL=maxMCIms[['idx']], whoistop3rd, which.next=3)) maxMCI ## to ensure the same order between maxMCI and CTS all(names(CTS) == names(maxMCI)) ## estimate empritical significance from the MCI scores ## M is precalculated correlation matrix for large dataset (>2k genes), will be reused in the downstream simulation analysis #counts = read.table("data_Nestorowa.tsv") #if (class(counts)=="data.frame") counts = as.matrix(counts) #M <- cor.shrink(counts, Y = NULL, MARGIN = 1, shrink = TRUE) #save(M, file="cor.shrink_M.RData", compress=TRUE) #dim(M) ## C is the runs of permutations to estimate random scores #C = 10 # for real data analysis C = at least 200 #RandomMCI = list() #n <- length(CTS) # number of CTS candidates #set.seed(2020) #for (i in 1:n) # # i=1; par(mfrow=c(1,1)) # x <- length(CTS[[i]]) # RandomMCI[[i]] <- simulationMCI(x, samplesL, counts, B=C, fun="BioTIP", M=M) # dim(RandomMCI) # plot_MCI_Simulation(maxMCI[i], RandomMCI[[i]], las=2, # ylim=c(0, max(maxMCI[i], 2*RandomMCI[[i]])), # main=paste(names(maxMCI)[i], length(CTS[[i]]), "genes", # "\n","vs. ",C, "times of gene-permutation"), # which2point=names(maxMCI)[i]) ######## Finding Tipping Point ################# newIc_score = lapply(CTS, function(x) getIc(counts, samplesL, x, fun="BioTIP", PCC_sample.target = 'average')) names(newIc_score) <- names(CTS) ## ----echo=FALSE, fig.align='center', out.width = '60%'------------------------ knitr::include_graphics("DNB_Sim_M.jpg") ## ----results=FALSE------------------------------------------------------------ ######## verify using IC score ################# ## First, estimate the random Ic-scores by permuating the expresion values of genes RandomIc_g = list() set.seed(2020) C= 10 # for real data analysis C = at least 500 # for(i in 1:length(CTS)){ Not to run the full loop B# } i = 1 CTS <- CTS[[i]] n <- length(CTS) RandomIc_g[[i]] <- simulation_Ic(n, samplesL, counts, B=C, fun="BioTIP") names(RandomIc_g)[i] = names(CTS)[i] # par(mfrow=c(1,length(int))) # for(i in 1:length(newIc_score)){ Not to run the full loop B plotting par(mfrow = c(1,1)) n = length(CTS[[i]]) plot_Ic_Simulation(newIc_score[[i]], RandomIc_g[[i]], las = 2, ylab="BioTIP", main= paste(names(newIc_score)[i],"(",n," transcripts)"), #fun="matplot", which2point=names(newIc_score)[i]) fun="boxplot", which2point=names(newIc_score)[i], ylim=c(0,0.5)) interesting = which(names(samplesL) == names(newIc_score[i])) p = length(which(RandomIc_g[[i]][interesting,] >= newIc_score[[i]][names(newIc_score)[i]])) p = p/ncol(RandomIc_g[[i]]) # first p value (p1) calculated for exactly at tipping point p2 = length(which(RandomIc_g[[i]] >= newIc_score[[i]][names(newIc_score)[i]])) p2 = p2/ncol(RandomIc_g[[i]]) p2 = p2/nrow(RandomIc_g[[i]]) # second p value (p2) calculated across all statuses ## local Kernel Density Plot d <- density(RandomIc_g[[i]]) # returns the density data plot(d, xlim=range(c(newIc_score[[i]],RandomIc_g[[i]])), main=paste("Random genes: p.Local=",p)) # plots the results abline(v=newIc_score[[i]][names(newIc_score)[i]], col="green") ## global Kernel Density Plot d <- density(unlist(RandomIc_g)) # returns the density data plot(d, xlim=range(c(newIc_score[[i]],unlist(RandomIc_g))), main=paste("Random genes: p.Global=",p2)) # plots the results abline(v=newIc_score[[i]][names(newIc_score)[i]], col="green") # } Not to run the full loop B plotting ## Second, estimate the random Ic-scores by randomly shulffing cell labels RandomIc_s = list() set.seed(2020) # for(i in 1:length(CTS)){ Not to run the full loop C i = 1 RandomIc_s[[i]] <- matrix(nrow=length(samplesL), ncol=C) rownames(RandomIc_s[[i]]) = names(samplesL) for(j in 1:length(samplesL)) { ns <- length(samplesL[[j]]) # of cells at the state of interest RandomIc_s[[i]][j,] <- simulation_Ic_sample(counts, ns, Ic=BioTIP_score[x], genes=CTS, B=C, fun="BioTIP") } names(RandomIc_s)[i] = names(CTS)[i] # } Not to run the full loop C # par(mfrow=c(1,length(int))) # for(i in 1:length(newIc_score)){ Not to run the full loop C plotting n = length(CTS[[i]]) plot_Ic_Simulation(newIc_score[[i]], RandomIc_s[[i]], las = 2, ylab="BioTIP", main= paste(names(newIc_score)[i],"(",n," transcripts)"), fun="boxplot", which2point=names(newIc_score)[i]) interesting = which(names(samplesL) == names(newIc_score[i])) p = length(which(RandomIc_s[[i]][interesting,] >= newIc_score[[i]][names(newIc_score)[i]])) p = p/ncol(RandomIc_s[[i]]) # first p value (p1) calculated for exactly at tipping point p2 = length(which(RandomIc_s[[i]] >= newIc_score[[i]][names(newIc_score)[i]])) p2 = p2/ncol(RandomIc_s[[i]]) p2 = p2/nrow(RandomIc_s[[i]]) p p2 # } Not to run the full loop C plotting ## Alternatively, p values is estimated from delta scores P3 = plot_SS_Simulation(newIc_score[[i]], RandomIc_s[[i]], xlim=c(0, max(newIc_score[[i]], RandomIc_s[[i]])), las=2, main=paste(names(CTS)[i], length(CTS[[i]]), "genes, ", "\n", C, "Shuffling labels")) ## ----------------------------------------------------------------------------- P3 ## ----echo=FALSE, fig.align='center', out.width = '60%'------------------------ knitr::include_graphics("CTS_Id.jpg") knitr::include_graphics("delta.jpg") ## ----echo=FALSE, fig.align='center', out.width = '65%'------------------------ knitr::include_graphics("Fig2.jpg") ## ----------------------------------------------------------------------------- library(BioTIP) data(gencode) head(gencode) ## ----"python code", eval = FALSE---------------------------------------------- # gtf = ("Your/PATH/TO/THE/FILE") # outF = open("gtf_summary_transbiotype.txt","w") # # def getquote(str,f,target): # targetLen = len(target)+2 # strInd = str.find(target) # st = strInd + len(target)+2 # ed = st + str[st:].find("";") # #print(st,ed) # f.write("\t"+str[st:ed]) if strInd!= -1 else f.write("\t"+"NA.") # # with open(gtf, "r") as f: # for line in f: # if line[0] != "#": # chromosome = line.split("\t")[0] # st = line.split("\t")[3] # ed = line.split("\t")[4] # strand = line.split("\t")[6] # type = line.split("\t")[2] # outF.write(chromosome+"\t"+st+"\t"+ed+"\t"+strand+"\t"+type) # c = "transcript_id" # g = "gene_name" # t = "transcript_type" # getquote(line,outF,c) # getquote(line,outF,g) # getquote(line,outF,t) # outF.write("\n") # outF.close() ## ----------------------------------------------------------------------------- library(BioTIP) library(GenomicRanges) data(gencode) head(gencode) ## ----------------------------------------------------------------------------- query <- GRanges(c("chr1:2-10:+","chr1:6-10:-"), Row.names = c("trans1","trans2"), score = c(1,2)) head(query) ## ----------------------------------------------------------------------------- library(BioTIP) gr <- GRanges(c("chr1:1-5:+","chr1:2-3:+"),biotype = c("lincRNA","CPC")) head(gr) ## ----------------------------------------------------------------------------- intron <- GRanges("chr1:6-8:+") head(intron) ## ----------------------------------------------------------------------------- intron_trncp <- getBiotypes(query, gr, intron) intron_trncp ## ----------------------------------------------------------------------------- library(BioTIP) data("intron") data("ILEF") data("gencode") gencode_gr = GRanges(gencode) ILEF_gr = GRanges(ILEF) cod_gr = GRanges(cod) intron_gr = GRanges(intron) non_coding <- getBiotypes(ILEF_gr, gencode_gr, intron_gr) dim(non_coding) head(non_coding[,1:3]) ## ----------------------------------------------------------------------------- coding <- getBiotypes(ILEF_gr, gencode_gr) dim(coding) head(coding[,1:3]) ## ----------------------------------------------------------------------------- library(BioTIP) data(ILEF) data(cod) ILEF_gr = GRanges(ILEF) cod_gr = GRanges(cod) rdthrough <- getReadthrough(ILEF_gr, cod_gr) head(rdthrough) ## ----SessionInfo-------------------------------------------------------------- sessionInfo()