% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/motif_enrichment_monaLisa.R
\name{calcBinnedMotifEnrR}
\alias{calcBinnedMotifEnrR}
\title{Binned Motif Enrichment Analysis with \code{monaLisa}}
\usage{
calcBinnedMotifEnrR(
  seqs,
  bins = NULL,
  pwmL = NULL,
  background = c("otherBins", "allBins", "zeroBin", "genome"),
  test = c("fisher", "binomial"),
  maxFracN = 0.7,
  maxKmerSize = 3L,
  min.score = 10,
  matchMethod = "matchPWM",
  GCbreaks = c(0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8),
  pseudocount.log2enr = 8,
  p.adjust.method = "BH",
  genome = NULL,
  genome.regions = NULL,
  genome.oversample = 2,
  BPPARAM = SerialParam(),
  verbose = FALSE,
  ...
)
}
\arguments{
\item{seqs}{\code{\link[Biostrings]{DNAStringSet}} object with sequences to
test}

\item{bins}{Factor of the same length and order as \code{seqs}, indicating
the bin for each sequence. Typically the return value of
\code{\link[monaLisa]{bin}}. For \code{background = "genome"}, \code{bins}
can be omitted.}

\item{pwmL}{\code{PWMatrixList} with motifs for which to calculate 
enrichments.}

\item{background}{A \code{character} scalar specifying the background
sequences to use. One of \code{"otherBins"} (default), \code{"allBins"},
\code{"zeroBin"} or \code{"genome"} (see "Details").}

\item{test}{A \code{character} scalar specifying the type of enrichment test
to perform. One of \code{"fisher"} (default) or \code{"binomial"}. The
enrichment test is one-sided (enriched in foreground).}

\item{maxFracN}{A numeric scalar with the maximal fraction of N bases allowed
in a sequence (defaults to 0.7). Sequences with higher fractions are
excluded from the analysis.}

\item{maxKmerSize}{The maximum k-mer size to consider, when adjusting
background sequence weights for k-mer composition compared to the
foreground sequences. The default value (3) will correct for mono-, di-
and tri-mer composition.}

\item{min.score}{The minimal score for motif hits, used in
\code{\link[monaLisa]{findMotifHits}}.}

\item{matchMethod}{The method used to scan for motif hits, passed to the
\code{method} parameter in \code{\link[monaLisa]{findMotifHits}}.}

\item{GCbreaks}{The breaks between GC bins. The default value is based on
the hard-coded bins used in Homer.}

\item{pseudocount.log2enr}{A numeric scalar with the pseudocount to add to
foreground and background counts when calculating log2 motif enrichments.}

\item{p.adjust.method}{A character scalar selecting the p value adjustment
method (used in \code{\link[stats]{p.adjust}}).}

\item{genome}{A \code{BSgenome} or \code{DNAStringSet} object with the
genome sequence. Only used for \code{background = "genome"} for extracting
background sequences.}

\item{genome.regions}{An optional \code{\link[GenomicRanges]{GRanges}} object
defining the intervals in \code{genome} from which background sequences are
sampled for \code{background = "genome"}. If \code{NULL}, background
sequences are sampled randomly from \code{genome}.}

\item{genome.oversample}{A \code{numeric} scalar of at least 1.0 defining how
many background sequences will be sampled per foreground sequence for
\code{background = "genome"}. Larger values will take longer but improve
the sequence composition similarity between foreground and background
(see "Details").}

\item{BPPARAM}{An optional \code{\link[BiocParallel]{BiocParallelParam}}
instance determining the parallel back-end to be used during evaluation.}

\item{verbose}{A logical scalar. If \code{TRUE}, print progress messages.}

\item{...}{Additional arguments for \code{\link[monaLisa]{findMotifHits}}.}
}
\value{
A \code{\link[SummarizedExperiment]{SummarizedExperiment}} object
  with motifs in rows and bins in columns, containing seven assays: \describe{
  \item{negLog10P}{: -log10 P values}
  \item{negLog10Padj}{: -log10 adjusted P values}
  \item{pearsonResid}{: motif enrichments as Pearson residuals}
  \item{expForegroundWgtWithHits}{: expected number of foreground
    sequences with motif hits}
  \item{log2enr}{: motif enrichments as log2 ratios}
  \item{sumForegroundWgtWithHits}{: sum of foreground sequence weights
    in a bin that have motif hits}
  \item{sumBackgroundWgtWithHits}{: sum of background sequence weights
    in a bin that have motif hits}
}
The \code{rowData} of the object contains annotations (name, PFMs, PWMs
and GC fraction) for the motifs, while the \code{colData} slot contains
summary information about the bins.
}
\description{
This function performs a motif enrichment analysis on bins of
  sequences. By default, for each bin the sequences in all other bins are
  used as background (see the \code{background} argument for alternatives).
}
\details{
This function implements a binned motif enrichment analysis. In each
  enrichment analysis, the sequences in a specific bin are used as foreground
  sequences to test for motif enrichments compared to background sequences
  (defined by \code{background}, see below). The logic follows the
  \code{findMotifsGenome.pl} tool from \code{Homer} version 4.11, with
  \code{-size given -nomotif -mknown} and additionally \code{-h} if using
  \code{test = "fisher"}, and gives very similar results. As in the
  \code{Homer} tool, sequences are weighted to correct for GC and k-mer
  composition differences between fore- and background sets.

  The background sequences are defined according to the value of the
  \code{background} argument:
  \describe{
    \item{otherBins}{: sequences from all other bins (excluding the current
      bin)}
    \item{allBins}{: sequences from all bins (including the current bin)}
    \item{zeroBin}{: sequences from the "zero bin", defined by the
      \code{maxAbsX} argument of \code{\link[monaLisa]{bin}}. If \code{bins}
      does not define a "zero bin", for example because it was created by
      \code{bin(..., maxAbsX = NULL)}, selecting this background definition
      will abort with an error.}
    \item{genome}{: sequences randomly sampled from the genome (or the
      intervals defined in \code{genome.regions} if given). For each
      foreground sequence, \code{genome.oversample} background sequences
      of the same size are sampled (on average). From these, one per
      foreground sequence is selected trying to match the G+C composition.
      In order to make the sampling deterministic, a seed number needs to be
      provided to the \code{RNGseed} parameter in
      \code{\link[BiocParallel]{SerialParam}}
      or \code{\link[BiocParallel]{MulticoreParam}} when creating the
      \code{BiocParallelParam} instance in \code{BPPARAM}.}
  }

  Motif hits are predicted using \code{\link[monaLisa]{findMotifHits}} and
  multiple hits per sequence are counted as just one hit (ZOOPS mode). For
  each motif, the weights of sequences that have a hit are summed separately
  for foreground (\code{sumForegroundWgtWithHits}) and background
  (\code{sumBackgroundWgtWithHits}). The total foreground
  (\code{totalWgtForeground}) and background (\code{totalWgtBackground})
  sum of sequence weights is also calculated. If a motif has zero
  \code{sumForegroundWgtWithHits} and \code{sumBackgroundWgtWithHits},
  then any values (p-values and enrichment) that are calculated using
  these two numbers are set to NA.

  Two statistical tests for the calculation of enrichment log p-values are
  available: \code{test = "fisher"} (default) to perform Fisher's exact
  tests, or \code{test = "binomial"} to perform binomial tests
  (default in \code{Homer}), using:
  \describe{
    \item{fisher}{: \code{fisher.test(x = tab, alternative =
      "greater")}, where \code{tab} is the contingency table with the summed
      weights of sequences in foreground or background sets (rows), and with
      or without a hit for a particular motif (columns).}
    \item{binomial}{: \code{pbinom(q = sumForegroundWgtWithHits - 1, size =
      totalWgtForeground,
      prob = sumBackgroundWgtWithHits / totalWgtBackground,
      lower.tail = FALSE, log.p = TRUE)}}
  }
}
\examples{
seqs <- Biostrings::DNAStringSet(c("GTCAGTCGATC", "CAGTCTAGCTG",
                                   "CGATCGTCAGT", "AGCTGCAGTCT"))
bins <- factor(rep(1:2, each = 2))
m <- rbind(A = c(2, 0, 0),
           C = c(1, 1, 0),
           G = c(0, 2, 0),
           T = c(0, 0, 3))
pwms <- TFBSTools::PWMatrixList(
    TFBSTools::PWMatrix(ID = "m1", profileMatrix = m),
    TFBSTools::PWMatrix(ID = "m2", profileMatrix = m[, 3:1])
)
calcBinnedMotifEnrR(seqs = seqs, bins = bins, pwmL = pwms,
                    min.score = 3)

}
