## ----vignetteSetup, echo=FALSE, message=FALSE, warning = FALSE-------------------------------------------------------- ## Track time spent on making the vignette startTime <- Sys.time() ## Bib setup library("RefManageR") ## Write bibliography information bib <- c( derfinder = citation("derfinder")[1], BiocStyle = citation("BiocStyle")[1], knitr = citation("knitr")[3], rmarkdown = citation("rmarkdown")[1], brainspan = RefManageR::BibEntry( bibtype = "Unpublished", key = "brainspan", title = "Atlas of the Developing Human Brain [Internet]. Funded by ARRA Awards 1RC2MH089921-01, 1RC2MH090047-01, and 1RC2MH089929-01.", author = "BrainSpan", year = 2011, url = "http://www.brainspan.org/" ), originalder = citation("derfinder")[2], R = citation(), IRanges = citation("IRanges")[1], sessioninfo = citation("sessioninfo")[1], testthat = citation("testthat")[1], GenomeInfoDb = RefManageR::BibEntry( bibtype = "manual", key = "GenomeInfoDb", author = "Sonali Arora and Martin Morgan and Marc Carlson and H. Pagès", title = "GenomeInfoDb: Utilities for manipulating chromosome and other 'seqname' identifiers", year = 2017, doi = "10.18129/B9.bioc.GenomeInfoDb" ), GenomicRanges = citation("GenomicRanges")[1], ggplot2 = citation("ggplot2")[1], bumphunter = citation("bumphunter")[1], TxDb.Hsapiens.UCSC.hg19.knownGene = citation("TxDb.Hsapiens.UCSC.hg19.knownGene")[1], AnnotationDbi = RefManageR::BibEntry( bibtype = "manual", key = "AnnotationDbi", author = "Hervé Pagès and Marc Carlson and Seth Falcon and Nianhua Li", title = "AnnotationDbi: Annotation Database Interface", year = 2017, doi = "10.18129/B9.bioc.AnnotationDbi" ), BiocParallel = citation("BiocParallel")[1], derfinderHelper = citation("derfinderHelper")[1], GenomicAlignments = citation("GenomicAlignments")[1], GenomicFeatures = citation("GenomicFeatures")[1], GenomicFiles = citation("GenomicFiles")[1], Hmisc = citation("Hmisc")[1], qvalue = citation("qvalue")[1], Rsamtools = citation("Rsamtools")[1], rtracklayer = citation("rtracklayer")[1], S4Vectors = RefManageR::BibEntry( bibtype = "manual", key = "S4Vectors", author = "Hervé Pagès and Michael Lawrence and Patrick Aboyoun", title = "S4Vectors: S4 implementation of vector-like and list-like objects", year = 2017, doi = "10.18129/B9.bioc.S4Vectors" ), bumphunterPaper = RefManageR::BibEntry( bibtype = "article", key = "bumphunterPaper", title = "Bump hunting to identify differentially methylated regions in epigenetic epidemiology studies", author = "Jaffe, Andrew E and Murakami, Peter and Lee, Hwajin and Leek, Jeffrey T and Fallin, M Daniele and Feinberg, Andrew P and Irizarry, Rafael A", year = 2012, journal = "International Journal of Epidemiology" ), derfinderData = citation("derfinderData")[1], RefManageR = citation("RefManageR")[1] ) ## ----'start', message=FALSE------------------------------------------------------------------------------------------- ## Load libraries library("derfinder") library("derfinderData") library("GenomicRanges") ## ----'phenoData', results = 'asis'------------------------------------------------------------------------------------ library("knitr") ## Get pheno table pheno <- subset(brainspanPheno, structure_acronym == "AMY") ## Display the main information p <- pheno[, -which(colnames(pheno) %in% c( "structure_acronym", "structure_name", "file" ))] rownames(p) <- NULL kable(p, format = "html", row.names = TRUE) ## ----'getData', eval = .Platform$OS.type != "windows"----------------------------------------------------------------- ## Determine the files to use and fix the names files <- rawFiles(system.file("extdata", "AMY", package = "derfinderData"), samplepatt = "bw", fileterm = NULL ) names(files) <- gsub(".bw", "", names(files)) ## Load the data from disk system.time(fullCov <- fullCoverage( files = files, chrs = "chr21", totalMapped = rep(1, length(files)), targetSize = 1 )) ## ----'getDataWindows', eval = .Platform$OS.type == "windows", echo = FALSE-------------------------------------------- # ## Load data in Windows case # foo <- function() { # load(system.file("extdata", "fullCov", "fullCovAMY.RData", # package = "derfinderData" # )) # return(fullCovAMY) # } # fullCov <- foo() ## ----'webData', eval = FALSE------------------------------------------------------------------------------------------ # ## Determine the files to use and fix the names # files <- pheno$file # names(files) <- gsub(".AMY", "", pheno$lab) # # ## Load the data from the web # system.time(fullCov <- fullCoverage( # files = files, chrs = "chr21", # totalMapped = rep(1, length(files)), targetSize = 1 # )) ## ----'exploreFullCov'------------------------------------------------------------------------------------------------- ## Lets explore it fullCov ## ----'filterCov'------------------------------------------------------------------------------------------------------ ## Filter coverage filteredCov <- lapply(fullCov, filterData, cutoff = 2) ## ----'exploreFilteredCov'--------------------------------------------------------------------------------------------- ## Similar to fullCov but with $position filteredCov ## ----'compareCov'----------------------------------------------------------------------------------------------------- ## Compare the size in Mb round(c( fullCov = object.size(fullCov), filteredCov = object.size(filteredCov) ) / 1024^2, 1) ## ----'regionMatrix'--------------------------------------------------------------------------------------------------- ## Use regionMatrix() system.time(regionMat <- regionMatrix(fullCov, cutoff = 30, L = 76)) ## Explore results class(regionMat) names(regionMat$chr21) ## ----'exploreRegMatRegs'---------------------------------------------------------------------------------------------- ## regions output regionMat$chr21$regions ## Number of regions length(regionMat$chr21$regions) ## ----'exploreRegMatBP'------------------------------------------------------------------------------------------------ ## Base-level coverage matrices for each of the regions ## Useful for plotting lapply(regionMat$chr21$bpCoverage[1:2], head, n = 2) ## Check dimensions. First region is 565 long, second one is 138 bp long. ## The columns match the number of samples (12 in this case). lapply(regionMat$chr21$bpCoverage[1:2], dim) ## ----'exploreRegMatrix'----------------------------------------------------------------------------------------------- ## Dimensions of the coverage matrix dim(regionMat$chr21$coverageMatrix) ## Coverage for each region. This matrix can then be used with limma or other pkgs head(regionMat$chr21$coverageMatrix) ## ----'identifyDERsDESeq2'--------------------------------------------------------------------------------------------- ## Required library("DESeq2") ## Round matrix counts <- round(regionMat$chr21$coverageMatrix) ## Round matrix and specify design dse <- DESeqDataSetFromMatrix(counts, pheno, ~ group + gender) ## Perform DE analysis dse <- DESeq(dse, test = "LRT", reduced = ~gender, fitType = "local") ## Extract results deseq <- regionMat$chr21$regions mcols(deseq) <- c(mcols(deseq), results(dse)) ## Explore the results deseq ## ----'buildLimmaModels'----------------------------------------------------------------------------------------------- ## Build models mod <- model.matrix(~ pheno$group + pheno$gender) mod0 <- model.matrix(~ pheno$gender) ## ----'transformForLimma'---------------------------------------------------------------------------------------------- ## Transform coverage transformedCov <- log2(regionMat$chr21$coverageMatrix + 32) ## ----'identifyDERsLimma'---------------------------------------------------------------------------------------------- ## Example using limma library("limma") ## Run limma fit <- lmFit(transformedCov, mod) fit0 <- lmFit(transformedCov, mod0) ## Determine DE status for the regions ## Also in https://github.com/LieberInstitute/jaffelab with help and examples getF <- function(fit, fit0, theData) { rss1 <- rowSums((fitted(fit) - theData)^2) df1 <- ncol(fit$coef) rss0 <- rowSums((fitted(fit0) - theData)^2) df0 <- ncol(fit0$coef) fstat <- ((rss0 - rss1) / (df1 - df0)) / (rss1 / (ncol(theData) - df1)) f_pval <- pf(fstat, df1 - df0, ncol(theData) - df1, lower.tail = FALSE) fout <- cbind(fstat, df1 - 1, ncol(theData) - df1, f_pval) colnames(fout)[2:3] <- c("df1", "df0") fout <- data.frame(fout) return(fout) } ff <- getF(fit, fit0, transformedCov) ## Get the p-value and assign it to the regions limma <- regionMat$chr21$regions limma$fstat <- ff$fstat limma$pvalue <- ff$f_pval limma$padj <- p.adjust(ff$f_pval, "BH") ## Explore the results limma ## ----limmaVSdeseq2---------------------------------------------------------------------------------------------------- table(limma$padj < 0.05, deseq$padj < 0.05) ## ----'createMeanBW', eval = .Platform$OS.type != "windows"------------------------------------------------------------ ## Calculate the mean: this step takes a long time with many samples meanCov <- Reduce("+", fullCov$chr21) / ncol(fullCov$chr21) ## Save it on a bigwig file called meanChr21.bw createBw(list("chr21" = DataFrame("meanChr21" = meanCov)), keepGR = FALSE ) ## ----'railMatrix', eval = .Platform$OS.type != "windows"-------------------------------------------------------------- ## Identify files to use summaryFile <- "meanChr21.bw" ## We had already found the sample BigWig files and saved it in the object 'files' ## Lets just rename it to sampleFiles for clarity. sampleFiles <- files ## Get the regions system.time( regionMat.rail <- railMatrix( chrs = "chr21", summaryFiles = summaryFile, sampleFiles = sampleFiles, L = 76, cutoff = 30, maxClusterGap = 3000L ) ) ## ----'checkDifferences', eval = .Platform$OS.type != "windows"-------------------------------------------------------- ## Overall not identical due to small rounding errors identical(regionMat, regionMat.rail) ## Actual regions are the same identical(ranges(regionMat$chr21$regions), ranges(regionMat.rail$chr21$regions)) ## When you round, the small differences go away identical( round(regionMat$chr21$regions$value, 4), round(regionMat.rail$chr21$regions$value, 4) ) identical( round(regionMat$chr21$regions$area, 4), round(regionMat.rail$chr21$regions$area, 4) ) ## ----'libSize'-------------------------------------------------------------------------------------------------------- ## Get some idea of the library sizes sampleDepths <- sampleDepth(collapseFullCoverage(fullCov), 1) sampleDepths ## ----'makeModels'----------------------------------------------------------------------------------------------------- ## Define models models <- makeModels(sampleDepths, testvars = pheno$group, adjustvars = pheno[, c("gender")] ) ## Explore the models lapply(models, head) ## ----'analyze'-------------------------------------------------------------------------------------------------------- ## Create a analysis directory dir.create("analysisResults") originalWd <- getwd() setwd(file.path(originalWd, "analysisResults")) ## Perform differential expression analysis system.time(results <- analyzeChr( chr = "chr21", filteredCov$chr21, models, groupInfo = pheno$group, writeOutput = TRUE, cutoffFstat = 5e-02, nPermute = 20, seeds = 20140923 + seq_len(20), returnOutput = TRUE )) ## ----'exploreResults'------------------------------------------------------------------------------------------------- ## Explore names(results) ## ----'exploreOptionsStats'-------------------------------------------------------------------------------------------- ## Explore optionsStats names(results$optionsStats) ## Call used results$optionsStats$analyzeCall ## ----'exploreCovPrep'------------------------------------------------------------------------------------------------- ## Explore coveragePrep names(results$coveragePrep) ## Group means results$coveragePrep$groupMeans ## ----'exploreFstats'-------------------------------------------------------------------------------------------------- ## Explore optionsStats results$fstats ## Note that the length matches the number of bases used identical(length(results$fstats), sum(results$coveragePrep$position)) ## ----'exploreRegs'---------------------------------------------------------------------------------------------------- ## Explore regions names(results$regions) ## ----'exploreRegs2'--------------------------------------------------------------------------------------------------- ## Permutation summary information results$regions[2:4] ## ----'exploreRegs3'--------------------------------------------------------------------------------------------------- ## Candidate DERs results$regions$regions ## ----'sensitivity'---------------------------------------------------------------------------------------------------- ## Width of potential DERs summary(width(results$regions$regions)) sum(width(results$regions$regions) > 50) ## Width of candidate DERs sig <- as.logical(results$regions$regions$significant) summary(width(results$regions$regions[sig])) sum(width(results$regions$regions[sig]) > 50) ## ----'exploreAnnotation'---------------------------------------------------------------------------------------------- ## Nearest annotation head(results$annotation) ## ----'exploreTime', fig.cap = "Seconds used to run each step in analyzeChr()."---------------------------------------- ## Time spent results$timeinfo ## Use this information to make a plot timed <- diff(results$timeinfo) timed.df <- data.frame(Seconds = as.numeric(timed), Step = factor(names(timed), levels = rev(names(timed)) )) library("ggplot2") ggplot(timed.df, aes(y = Step, x = Seconds)) + geom_point() ## ----'mergeResults'--------------------------------------------------------------------------------------------------- ## Go back to the original directory setwd(originalWd) ## Merge results from several chromosomes. In this case we only have one. mergeResults( chrs = "chr21", prefix = "analysisResults", genomicState = genomicState$fullGenome, optionsStats = results$optionsStats ) ## Files created by mergeResults() dir("analysisResults", pattern = ".Rdata") ## ----'optionsMerge'--------------------------------------------------------------------------------------------------- ## Options used to merge load(file.path("analysisResults", "optionsMerge.Rdata")) ## Contents names(optionsMerge) ## Merge call optionsMerge$mergeCall ## ----'exploreFullRegs'------------------------------------------------------------------------------------------------ ## Load all the regions load(file.path("analysisResults", "fullRegions.Rdata")) ## Metadata columns names(mcols(fullRegions)) ## ----'exploreFullAnnoRegs'-------------------------------------------------------------------------------------------- ## Load annotateRegions() output load(file.path("analysisResults", "fullAnnotatedRegions.Rdata")) ## Information stored names(fullAnnotatedRegions) ## Take a peak lapply(fullAnnotatedRegions, head) ## ----'extra'---------------------------------------------------------------------------------------------------------- ## Find overlaps between regions and summarized genomic annotation annoRegs <- annotateRegions(fullRegions, genomicState$fullGenome) ## Indeed, the result is the same because we only used chr21 identical(annoRegs, fullAnnotatedRegions) ## Get the region coverage regionCov <- getRegionCoverage(fullCov, fullRegions) ## Explore the result head(regionCov[[1]]) ## ----'derfinderPlot', eval = FALSE------------------------------------------------------------------------------------ # library("derfinderPlot") # # ## Overview of the candidate DERs in the genome # plotOverview( # regions = fullRegions, annotation = results$annotation, # type = "fwer" # ) # # suppressPackageStartupMessages(library("TxDb.Hsapiens.UCSC.hg19.knownGene")) # txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene # # ## Base-levle coverage plots for the first 10 regions # plotRegionCoverage( # regions = fullRegions, regionCoverage = regionCov, # groupInfo = pheno$group, nearestAnnotation = results$annotation, # annotatedRegions = annoRegs, whichRegions = 1:10, txdb = txdb, scalefac = 1, # ask = FALSE # ) # # ## Cluster plot for the first region # plotCluster( # idx = 1, regions = fullRegions, annotation = results$annotation, # coverageInfo = fullCov$chr21, txdb = txdb, groupInfo = pheno$group, # titleUse = "fwer" # ) ## ----'featureLevel'--------------------------------------------------------------------------------------------------- ## Get the exon-level matrix system.time(exonCov <- coverageToExon(fullCov, genomicState$fullGenome, L = 76)) ## Dimensions of the matrix dim(exonCov) ## Explore a little bit tail(exonCov) ## ----'regionMatAnnotate'---------------------------------------------------------------------------------------------- ## Annotate regions as exonic, intronic or intergenic system.time(annoGenome <- annotateRegions( regionMat$chr21$regions, genomicState$fullGenome )) ## Note that the genomicState object included in derfinder only has information ## for chr21 (hg19). ## Identify closest genes to regions suppressPackageStartupMessages(library("bumphunter")) suppressPackageStartupMessages(library("TxDb.Hsapiens.UCSC.hg19.knownGene")) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene genes <- annotateTranscripts(txdb) system.time(annoNear <- matchGenes(regionMat$chr21$regions, genes)) ## ----'static-vis', eval = FALSE--------------------------------------------------------------------------------------- # ## Identify the top regions by highest total coverage # top <- order(regionMat$chr21$regions$area, decreasing = TRUE)[1:100] # # ## Base-level plots for the top 100 regions with transcript information # library("derfinderPlot") # plotRegionCoverage(regionMat$chr21$regions, # regionCoverage = regionMat$chr21$bpCoverage, # groupInfo = pheno$group, nearestAnnotation = annoNear, # annotatedRegions = annoGenome, whichRegions = top, scalefac = 1, # txdb = txdb, ask = FALSE # ) ## ----'epivizr', eval = FALSE------------------------------------------------------------------------------------------ # ## Load epivizr, it's available from Bioconductor # library("epivizr") # # ## Load data to your browser # mgr <- startEpiviz() # ders_dev <- mgr$addDevice( # fullRegions[as.logical(fullRegions$significantFWER)], "Candidate DERs" # ) # ders_potential_dev <- mgr$addDevice( # fullRegions[!as.logical(fullRegions$significantFWER)], "Potential DERs" # ) # regs_dev <- mgr$addDevice(regionMat$chr21$regions, "Region Matrix") # # ## Go to a place you like in the genome # mgr$navigate( # "chr21", start(regionMat$chr21$regions[top[1]]) - 100, # end(regionMat$chr21$regions[top[1]]) + 100 # ) # # ## Stop the navigation # mgr$stopServer() ## ----'exportBigWig', eval = .Platform$OS.type != "windows"------------------------------------------------------------ ## Subset only the first sample fullCovSmall <- lapply(fullCov, "[", 1) ## Export to BigWig bw <- createBw(fullCovSmall) ## See the file. Note that the sample name is used to name the file. dir(pattern = ".bw") ## Internally createBw() coerces each sample to a GRanges object before ## exporting to a BigWig file. If more than one sample was exported, the ## GRangesList would have more elements. bw ## ----'exampleNameStyle', eval = FALSE--------------------------------------------------------------------------------- # ## Set global species and chrsStyle options # options(species = "arabidopsis_thaliana") # options(chrsStyle = "NCBI") # # ## Then proceed to load and analyze the data ## ----'analyzeNonHuman', eval = FALSE---------------------------------------------------------------------------------- # ## Load transcript database information # library("TxDb.Athaliana.BioMart.plantsmart28") # # ## Set organism options # options(species = "arabidopsis_thaliana") # options(chrsStyle = "NCBI") # # ## Run command with more arguments # analyzeChr(txdb = TxDb.Athaliana.BioMart.plantsmart28) ## ----'runExample'----------------------------------------------------------------------------------------------------- ## Find some regions to work with example("loadCoverage", "derfinder") example("getRegionCoverage", "derfinder") ## ----'loadWhich'------------------------------------------------------------------------------------------------------ ## Illustrate reading data from a set of regions test <- loadCoverage( files = files, chr = "21", cutoff = NULL, which = regions, protectWhich = 0, fileStyle = "NCBI" ) ## Some reads were ignored and thus the coverage is lower as can be seen below: sapply(test$coverage, max) - sapply(genomeDataRaw$coverage, max) ## ----'loadWhich2'----------------------------------------------------------------------------------------------------- ## Illustrate reading data from a set of regions test2 <- loadCoverage( files = files, chr = "21", cutoff = NULL, which = regions, protectWhich = 3e4, fileStyle = "NCBI" ) ## Adding some padding to the regions helps get the same coverage identical(sapply(test2$coverage, max), sapply(genomeDataRaw$coverage, max)) ## A more detailed test reveals that the coverage matches at every base all(mapply(function(x, y) { identical(x, y) }, test2$coverage, genomeDataRaw$coverage)) ## ----Figure1, out.width="100%", fig.align="center", fig.cap = "derfinder F-statistics flow chart.", echo = FALSE------ knitr::include_graphics("http://lcolladotor.github.io/derfinder/fig/DERpathway.png") ## ----Figure2, out.width="100%", fig.align="center", fig.cap = "analyzeChr() flow chart.", echo = FALSE---------------- knitr::include_graphics("http://lcolladotor.github.io/derfinder/fig/analyzeChr.png") ## ----Figure3, out.width="100%", fig.align="center", fig.cap = "regionMatrix() flow chart.", echo = FALSE-------------- knitr::include_graphics("http://lcolladotor.github.io/derfinder/fig/regionMatrix.png") ## ----'derfinder-analysis', eval = FALSE------------------------------------------------------------------------------- # ## Run derfinder's analysis steps with timing info # # ## Load libraries # library("getopt") # # ## Available at http:/bioconductor.org/packages/derfinder # library("derfinder") # # ## Specify parameters # spec <- matrix(c( # "DFfile", "d", 1, "character", "path to the .Rdata file with the results from loadCoverage()", # "chr", "c", 1, "character", "Chromosome under analysis. Use X instead of chrX.", # "mcores", "m", 1, "integer", "Number of cores", # "verbose", "v", 2, "logical", "Print status updates", # "help", "h", 0, "logical", "Display help" # ), byrow = TRUE, ncol = 5) # opt <- getopt(spec) # # ## Testing the script # test <- FALSE # if (test) { # ## Speficy it using an interactive R session and testing # test <- TRUE # } # # ## Test values # if (test) { # opt <- NULL # opt$DFfile <- "/ProjectDir/derCoverageInfo/chr21Cov.Rdata" # opt$chr <- "21" # opt$mcores <- 1 # opt$verbose <- NULL # } # # ## if help was asked for print a friendly message # ## and exit with a non-zero error code # if (!is.null(opt$help)) { # cat(getopt(spec, usage = TRUE)) # q(status = 1) # } # # ## Default value for verbose = TRUE # if (is.null(opt$verbose)) opt$verbose <- TRUE # # if (opt$verbose) message("Loading Rdata file with the output from loadCoverage()") # load(opt$DFfile) # # ## Make it easy to use the name later. Here I'm assuming the names were generated using output='auto' in loadCoverage() # eval(parse(text = paste0("data <- ", "chr", opt$chr, "CovInfo"))) # eval(parse(text = paste0("rm(chr", opt$chr, "CovInfo)"))) # # ## Just for testing purposes # if (test) { # tmp <- data # tmp$coverage <- tmp$coverage[1:1e6, ] # library("IRanges") # tmp$position[which(tmp$pos)[1e6 + 1]:length(tmp$pos)] <- FALSE # data <- tmp # } # # ## Load the models # load("models.Rdata") # # ## Load group information # load("groupInfo.Rdata") # # # ## Run the analysis with lowMemDir # analyzeChr( # chr = opt$chr, coverageInfo = data, models = models, # cutoffFstat = 1e-06, cutoffType = "theoretical", nPermute = 1000, # seeds = seq_len(1000), maxClusterGap = 3000, groupInfo = groupInfo, # subject = "hg19", mc.cores = opt$mcores, # lowMemDir = file.path(tempdir(), paste0("chr", opt$chr), "chunksDir"), # verbose = opt$verbose, chunksize = 1e5 # ) # # ## Done # if (opt$verbose) { # print(proc.time()) # print(sessionInfo(), locale = FALSE) # } ## ----createVignette, eval=FALSE--------------------------------------------------------------------------------------- # ## Create the vignette # library("rmarkdown") # system.time(render("derfinder-users-guide.Rmd", "BiocStyle::html_document")) # # ## Extract the R code # library("knitr") # knit("derfinder-users-guide.Rmd", tangle = TRUE) ## ----createVignette2-------------------------------------------------------------------------------------------------- ## Clean up file.remove("derfinderUsersGuideRef.bib") unlink("analysisResults", recursive = TRUE) file.remove("HSB113.bw") file.remove("meanChr21.bw") ## ----reproducibility1, echo=FALSE------------------------------------------------------------------------------------- ## Date the vignette was generated Sys.time() ## ----reproducibility2, echo=FALSE------------------------------------------------------------------------------------- ## Processing time in seconds totalTime <- diff(c(startTime, Sys.time())) round(totalTime, digits = 3) ## ----reproducibility3, echo=FALSE------------------------------------------------------------------------------------- ## Session info library("sessioninfo") options(width = 120) session_info() ## ----vignetteBiblio, results = 'asis', echo = FALSE, warning = FALSE, message = FALSE--------------------------------- ## Print bibliography PrintBibliography(bib, .opts = list(hyperlink = "to.doc", style = "html"))