% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/BuildRef.R
\name{Build-Reference-methods}
\alias{Build-Reference-methods}
\alias{getResources}
\alias{buildRef}
\alias{buildFullRef}
\alias{getNonPolyARef}
\alias{getAvailableGO}
\title{Builds reference files used by SpliceWiz}
\usage{
getResources(
  reference_path = "./Reference",
  fasta = "",
  gtf = "",
  overwrite = FALSE,
  force_download = FALSE,
  verbose = TRUE
)

buildRef(
  reference_path = "./Reference",
  fasta = "",
  gtf = "",
  overwrite = FALSE,
  force_download = FALSE,
  chromosome_aliases = NULL,
  genome_type = "",
  nonPolyARef = "",
  MappabilityRef = "",
  BlacklistRef = "",
  ontologySpecies = "",
  useExtendedTranscripts = TRUE,
  lowMemoryMode = TRUE,
  verbose = TRUE
)

buildFullRef(
  reference_path = "./Reference",
  fasta = "",
  gtf = "",
  use_STAR_mappability = FALSE,
  overwrite = FALSE,
  force_download = FALSE,
  chromosome_aliases = NULL,
  genome_type = "",
  nonPolyARef = "",
  MappabilityRef = "",
  BlacklistRef = "",
  ontologySpecies = "",
  useExtendedTranscripts = TRUE,
  verbose = TRUE,
  n_threads = 4,
  ...
)

getNonPolyARef(genome_type)

getAvailableGO(localHub = FALSE, ah = AnnotationHub(localHub = localHub))
}
\arguments{
\item{reference_path}{(REQUIRED) The directory path to store the generated
reference files}

\item{fasta}{The file path or web link to the user-supplied genome
FASTA file. Alternatively, the name of the AnnotationHub record containing
the genome resource. May be omitted if \code{getResources()} has already
been run using the same \code{reference_path}.}

\item{gtf}{The file path or web link to the user-supplied transcript
GTF file (or gzipped GTF file). Alternatively, the name of the
AnnotationHub record containing the transcript GTF file. May be omitted if
\code{getResources()} has already been run using the same
\code{reference_path}.}

\item{overwrite}{(default \code{FALSE}) For \code{getResources()}: if the
genome FASTA and gene annotation GTF files already exist in the \code{resource}
subdirectory, they will not be overwritten. For \code{buildRef()} and
\code{buildFullRef()}: the SpliceWiz reference will not be overwritten
if one already exists. A reference is considered to exist if
the file \code{SpliceWiz.ref.gz} is present inside \code{reference_path}.}

\item{force_download}{(default \code{FALSE}) When online resources are retrieved,
a local copy is stored in the \code{SpliceWiz} BiocFileCache. Subsequent calls
to the web resource will fetch the local copy. Setting \code{force_download} to
\code{TRUE} will force the resource to be downloaded from the web. Set this to
\code{TRUE} only if the web resource has been updated since the last retrieval.}

\item{verbose}{(default \code{TRUE}) If \code{FALSE}, will silence progress messages}

\item{chromosome_aliases}{(Highly optional) A 2-column data frame containing
chromosome name conversions. If this is set, allows \link{processBAM} to parse
BAM alignments to a genome whose chromosomes are named
differently to the reference genome. The most common scenario is where
Ensembl genomes typically use chromosomes "1", "2", ..., "X", "Y", whereas
UCSC/Gencode genomes use "chr1", "chr2", ..., "chrX", "chrY". See example
below. Refer to \url{https://github.com/dpryan79/ChromosomeMappings} for a
list of chromosome alias resources.}

\item{genome_type}{Allows \code{buildRef()} to select default
\code{nonPolyARef} and \code{MappabilityRef} for selected genomes. Allowed options
are: \code{hg38}, \code{hg19}, \code{mm10}, and \code{mm9}.}

\item{nonPolyARef}{(Optional) A BED file of regions defining known
non-polyadenylated transcripts. This file is used for QC analysis
to measure Poly-A enrichment quality of samples. An RDS file (openable
using \code{readRDS()}) of a GRanges object is acceptable.
If omitted, and \code{genome_type} is defined, the default for the specified
genome will be used.}

\item{MappabilityRef}{(Optional) A BED file of low mappability regions due to
repeat elements in the genome. If omitted, the file generated by
\code{\link[=calculateMappability]{calculateMappability()}} will be used where available, and if
this is not, the default file for the specified \code{genome_type} will be used.
If \code{genome_type} is not specified, \code{MappabilityRef} is not used.
An RDS file (openable using \code{readRDS()}) of a GRanges object is acceptable.
See details.}

\item{BlacklistRef}{A BED file of regions to be otherwise excluded from IR
analysis. If omitted, a blacklist is not used (this is the default).
An RDS file (openable using \code{readRDS()}) of a GRanges object is acceptable.}

\item{ontologySpecies}{(default \code{""}) The species for which gene ontology
classifications should be fetched from AnnotationHub. Ignored if
\code{genome_type} is set (as human or mouse GO will be used instead).}

\item{useExtendedTranscripts}{(default \code{TRUE}) Should non-protein-coding
transcripts such as anti-sense and lincRNA transcripts be included in
searching for IR / AS events? Setting \code{FALSE} (vanilla IRFinder) will
exclude transcripts other than \code{protein_coding} and
\code{processed_transcript} transcripts from IR analysis.}

\item{lowMemoryMode}{(default \code{TRUE}) By default, SpliceWiz converts FASTA
files to TwoBit, then uses the TwoBit file to fetch genome sequences. In
most cases, this method uses less memory and is faster, but can be very
slow on some systems. Set this option to \code{FALSE} (which will convert the
TwoBit file back to FASTA) if you experience
very slow genome fetching (e.g. when annotating splice motifs).}

\item{use_STAR_mappability}{(default \code{FALSE}) In \code{buildFullRef()},
whether to run \link{STAR_mappability} to calculate low-mappability regions.
We recommend setting this to \code{FALSE} for the common genomes
(human and mouse), and to \code{TRUE} for genomes not supported by
\code{genome_type}. When set to \code{FALSE}, the MappabilityExclusion default
file corresponding to \code{genome_type} will automatically be used.}

\item{n_threads}{The number of threads used to generate the STAR reference
and mappability calculations. Multi-threading is not used for SpliceWiz
reference generation (but multiple cores are utilised in data-table
and fst file processing automatically, where available). See \link{STAR-methods}}

\item{...}{For \code{buildFullRef()}, additional parameters to be passed into
\code{STAR_buildRef} which \code{buildFullRef()} runs internally. See \link{STAR_buildRef}}

\item{localHub}{(default \code{FALSE}) For \code{getAvailableGO()}, whether to use
offline mode for AnnotationHub resources. If \code{TRUE}, offline mode will be
used.}

\item{ah}{For \code{getAvailableGO()}, the AnnotationHub object. Leave as default
to use the entirety of AnnotationHub resources.}
}
\value{
For \code{getResources}: creates the following local resources:
\itemize{
\item \code{reference_path/resource/genome.2bit}: Local copy of the genome sequences
as a TwoBitFile.
\item \code{reference_path/resource/transcripts.gtf.gz}: Local copy of the gene
annotation as a gzip-compressed file.
}

For \code{buildRef()} and \code{buildFullRef()}: creates a SpliceWiz reference
which is written to the given directory specified by \code{reference_path}.
Files created include:
\itemize{
\item \code{reference_path/settings.Rds}: An RDS file containing parameters used
to generate the SpliceWiz reference
\item \code{reference_path/SpliceWiz.ref.gz}: A gzipped text file containing collated
SpliceWiz reference files. This file is used by \link{processBAM}
\item \verb{reference_path/fst/}: Contains fst files for subsequent easy access to
SpliceWiz generated references
\item \code{reference_path/cov_data.Rds}: An RDS file containing data required to
visualise genome / transcript tracks.
}

\code{buildFullRef()} also creates a \code{STAR} reference located in the \code{STAR}
subdirectory inside the designated \code{reference_path}

For \code{getNonPolyARef()}: Returns the file path to the BED file for
the nonPolyA loci for the specified genome.

For \code{getAvailableGO()}: Returns a vector containing names of species with
supported gene ontology annotations.
}
\description{
These functions build the reference required by the SpliceWiz engine, as well
as alternative splicing annotation data for SpliceWiz. See examples
below for guides to making the SpliceWiz reference.
}
\details{
\code{getResources()} processes the files, downloads resources from
web links or from \code{AnnotationHub()}, and saves a local copy in the "resource"
subdirectory within the given \code{reference_path}. Resources are retrieved via
either:
\enumerate{
\item User-supplied FASTA and GTF file. This can be a file path, or a web link
(e.g. 'http://', 'https://' or 'ftp://'). Use \code{fasta} and \code{gtf}
to specify the files or web paths to use.
\item AnnotationHub genome and gene annotation (Ensembl): supply the names of
the genome sequence and gene annotations to \code{fasta} and \code{gtf}.
}

\code{buildRef()} will first run \code{getResources()} if resources are
not yet saved locally (i.e. \code{getResources()} has not already been run).
Then, it creates the SpliceWiz references. Typical run-times are
5 to 10 minutes for human and mouse genomes (after resources are downloaded).

NB: the parameters \code{fasta} and \code{gtf} can be omitted in \code{buildRef()} if
\code{getResources()} has already been run.

\code{buildFullRef()} builds the STAR aligner reference alongside the SpliceWiz
reference. The STAR reference will be located in the \code{STAR} subdirectory
of the specified reference path. If \code{use_STAR_mappability} is set to \code{TRUE},
this function will empirically compute regions of low mappability. This
function requires \code{STAR} to be installed on the system (which only runs on
linux-based systems).

\code{getNonPolyARef()} returns the path of the non-polyA reference file for the
human and mouse genomes.

Typical usage involves running \code{buildRef()} for human and mouse genomes
and specifying the \code{genome_type} to use the default \code{MappabilityRef} and
\code{nonPolyARef} files for the specified genome. For non-human non-mouse
genomes, use one of the following alternatives:
\itemize{
\item Create the SpliceWiz reference without using Mappability Exclusion regions.
To do this, simply run \code{buildRef()} and omit \code{MappabilityRef}. This is
acceptable assuming the introns assessed are short and do not contain
intronic repeats
\item Calculating Mappability Exclusion regions using the STAR aligner,
and building the SpliceWiz reference. This can be done using the
\code{buildFullRef()} function, on systems where \code{STAR} is installed
\item Instead of using the STAR aligner, any splice-aware genome aligner could be
used. See \link{Mappability-methods} for
an example workflow using the Rsubread aligner. After producing the
\code{MappabilityExclusion.bed.gz} file (in the \code{Mappability} subfolder), run
\code{buildRef()} using this file (or simply leave it blank).
}

BED files are tab-separated text files containing 3 unnamed columns
specifying chromosome, start and end coordinates. To view an example BED
file, open the file specified in the path returned by
\code{getNonPolyARef("hg38")}.

If \code{MappabilityRef}, \code{nonPolyARef} and \code{BlacklistRef} are left blank, the
following will be used (by priority):
\enumerate{
\item The previously used Mappability, non-polyA and/or Blacklist file resource
from a previous run, if available,
\item The resource implied by the \code{genome_type} parameter, if specified,
\item No resource is used.
}

\strong{To rebuild a SpliceWiz reference using existing resources}
This is typically run when updating an old resource to a new SpliceWiz
version. Simply run \code{buildRef()}, specifying the existing reference directory,
leave the \code{fasta} and \code{gtf} parameters blank, and set \code{overwrite = TRUE}.
SpliceWiz will use the previously-used resources to re-create the reference.

See examples below for common use cases.
}
\section{Functions}{
\itemize{
\item \code{getResources()}: Processes / downloads a copy of the
genome and gene annotations and stores this in the "resource" subdirectory
of the given reference path

\item \code{buildRef()}: First calls \code{getResources()}
(if required). Afterwards creates the SpliceWiz reference in the
given reference path

\item \code{buildFullRef()}: One-step function that fetches resources,
creates a STAR reference (including mappability calculations), then
creates the SpliceWiz reference

\item \code{getNonPolyARef()}: Returns the path to the BED file
containing coordinates of known non-polyadenylated transcripts for genomes
\code{hg38}, \code{hg19}, \code{mm10} and \code{mm9}

\item \code{getAvailableGO()}: Returns available species on Bioconductor's
AnnotationHub. Currently, only Bioconductor's OrgDb/Ensembl gene ontology
annotations are supported.

}}
\examples{
# Quick runnable example: generate a reference using SpliceWiz's example genome

example_ref <- file.path(tempdir(), "Reference")
getResources(
    reference_path = example_ref,
    fasta = chrZ_genome(),
    gtf = chrZ_gtf()
)
buildRef(
    reference_path = example_ref
)

# NB: the above is equivalent to:

example_ref <- file.path(tempdir(), "Reference")
buildRef(
    reference_path = example_ref,
    fasta = chrZ_genome(),
    gtf = chrZ_gtf()
)

# Get the path to the Non-PolyA BED file for hg19

getNonPolyARef("hg19")

# View available species for AnnotationHub's Ensembl/orgDB-based GO resources

availSpecies <- getAvailableGO()

# Build example reference with `Homo sapiens` Ens/orgDB gene ontology

ont_ref <- file.path(tempdir(), "Reference_withGO")
buildRef(
    reference_path = ont_ref,
    fasta = chrZ_genome(),
    gtf = chrZ_gtf(),
    ontologySpecies = "Homo sapiens"
)

\dontrun{

### Long examples ###

# Generate a SpliceWiz reference from user supplied FASTA and GTF files for a
# hg38-based genome:

buildRef(
    reference_path = "./Reference_user",
    fasta = "genome.fa", gtf = "transcripts.gtf",
    genome_type = "hg38"
)

# NB: Setting `genome_type = "hg38"` will automatically use default
# nonPolyARef and MappabilityRef for `hg38`

# Reference generation from Ensembl's FTP links:

FTP <- "ftp://ftp.ensembl.org/pub/release-94/"
buildRef(
    reference_path = "./Reference_FTP",
    fasta = paste0(FTP, "fasta/homo_sapiens/dna/",
        "Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz"),
    gtf = paste0(FTP, "gtf/homo_sapiens/",
        "Homo_sapiens.GRCh38.94.chr.gtf.gz"),
    genome_type = "hg38"
)

# Get AnnotationHub record names for Ensembl release-94:

# First, search for the relevant AnnotationHub record names:

ah <- AnnotationHub::AnnotationHub()
AnnotationHub::query(ah, c("Homo Sapiens", "release-94"))

buildRef(
    reference_path = "./Reference_AH",
    fasta = "AH65745",
    gtf = "AH64631",
    genome_type = "hg38"
)

# Build a SpliceWiz reference, setting chromosome aliases to allow
# this reference to process BAM files aligned to UCSC-style genomes:

chrom.df <- GenomeInfoDb::genomeStyles()$Homo_sapiens

buildRef(
    reference_path = "./Reference_UCSC",
    fasta = "AH65745",
    gtf = "AH64631",
    genome_type = "hg38",
    chromosome_aliases = chrom.df[, c("Ensembl", "UCSC")]
)

# One-step generation of SpliceWiz and STAR references, using 4 threads.
# NB1: requires a linux-based system with STAR installed.
# NB2: A STAR reference genome will be generated in the `STAR` subfolder
#      inside the given `reference_path`.
# NB3: A custom Mappability Exclusion file will be calculated using STAR
#      and will be used to generate the SpliceWiz reference.

buildFullRef(
    reference_path = "./Reference_with_STAR",
    fasta = "genome.fa", gtf = "transcripts.gtf",
    genome_type = "hg38",
    use_STAR_mappability = TRUE,
    n_threads = 4
)

# NB: the above is equivalent to running the following in sequence:

getResources(
    reference_path = "./Reference_with_STAR",
    fasta = "genome.fa", gtf = "transcripts.gtf"
)
STAR_buildRef(
    reference_path = "./Reference_with_STAR",
    also_generate_mappability = TRUE,
    n_threads = 4
)
buildRef(
    reference_path = "./Reference_with_STAR",
    genome_type = ""
)
}
}
\seealso{
\link{Mappability-methods} for methods to calculate low mappability regions\cr\cr
\link{STAR-methods} for a list of STAR wrapper functions\cr\cr
\link[AnnotationHub]{AnnotationHub}\cr\cr
\url{https://github.com/alexchwong/SpliceWizResources} for RDS files of
Mappability Exclusion GRanges objects (for hg38, hg19, mm10 and mm9)
that can be used as input files for \code{MappabilityRef} in \code{buildRef()}.
These resources are intended for SpliceWiz users on older Bioconductor
versions (3.13 or earlier).
}
