% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/COTAN-getters.R, R/COTAN-modifiers.R,
%   R/ECD-plot.R, R/clean-method.R, R/clean-plot.R, R/librarySize-plot.R,
%   R/mitochondrialPercentage-plot.R, R/scatter-plot.R
\name{RawDataCleaning}
\alias{RawDataCleaning}
\alias{flagNotFullyExpressedGenes,COTAN-method}
\alias{flagNotFullyExpressedGenes}
\alias{flagNotFullyExpressingCells,COTAN-method}
\alias{flagNotFullyExpressingCells}
\alias{getFullyExpressedGenes,COTAN-method}
\alias{getFullyExpressedGenes}
\alias{getFullyExpressingCells,COTAN-method}
\alias{getFullyExpressingCells}
\alias{findFullyExpressedGenes,COTAN-method}
\alias{findFullyExpressedGenes}
\alias{findFullyExpressingCells,COTAN-method}
\alias{findFullyExpressingCells}
\alias{setLambda,COTAN-method}
\alias{setLambda}
\alias{setDispersion,COTAN-method}
\alias{setDispersion}
\alias{setNu,COTAN-method}
\alias{setNu}
\alias{dropGenesCells,COTAN-method}
\alias{dropGenesCells}
\alias{ECDPlot}
\alias{clean,COTAN-method}
\alias{clean}
\alias{cleanPlots}
\alias{screePlot}
\alias{cellSizePlot}
\alias{genesSizePlot}
\alias{mitochondrialPercentagePlot}
\alias{scatterPlot}
\title{Raw data cleaning}
\usage{
\S4method{flagNotFullyExpressedGenes}{COTAN}(objCOTAN)

\S4method{flagNotFullyExpressingCells}{COTAN}(objCOTAN)

\S4method{getFullyExpressedGenes}{COTAN}(objCOTAN)

\S4method{getFullyExpressingCells}{COTAN}(objCOTAN)

\S4method{findFullyExpressedGenes}{COTAN}(objCOTAN, cellsThreshold = 0.99)

\S4method{findFullyExpressingCells}{COTAN}(objCOTAN, genesThreshold = 0.99)

\S4method{setLambda}{COTAN}(objCOTAN, lambda)

\S4method{setDispersion}{COTAN}(objCOTAN, dispersion)

\S4method{setNu}{COTAN}(objCOTAN, nu)

\S4method{dropGenesCells}{COTAN}(
  objCOTAN,
  genes = vector(mode = "character"),
  cells = vector(mode = "character")
)

ECDPlot(objCOTAN, yCut = NaN, condName = "", conditions = NULL)

\S4method{clean}{COTAN}(
  objCOTAN,
  cellsCutoff = 0.003,
  genesCutoff = 0.002,
  cellsThreshold = 0.99,
  genesThreshold = 0.99
)

cleanPlots(objCOTAN, includePCA = TRUE)

screePlot(pcaStdDev)

cellSizePlot(objCOTAN, condName = "", conditions = NULL)

genesSizePlot(objCOTAN, condName = "", conditions = NULL)

mitochondrialPercentagePlot(
  objCOTAN,
  genePrefix = "^MT-",
  condName = "",
  conditions = NULL
)

scatterPlot(objCOTAN, condName = "", conditions = NULL, splitSamples = TRUE)
}
\arguments{
\item{objCOTAN}{a \code{COTAN} object}

\item{cellsThreshold}{any gene that is expressed in more cells than threshold
times the total number of cells will be marked as \strong{fully-expressed}.
Default threshold is \eqn{0.99 \; (99.0\%)}}

\item{genesThreshold}{any cell that is expressing more genes than threshold
times the total number of genes will be marked as \strong{fully-expressing}.
Default threshold is \eqn{0.99 \; (99.0\%)}}

\item{lambda}{a named array that gives the values for lambda}

\item{dispersion}{a named array that gives the values for the dispersion}

\item{nu}{a named array that gives the values for nu}

\item{genes}{an array of gene names}

\item{cells}{an array of cell names}

\item{yCut}{y threshold of library size to drop. Default is \code{NaN}}

\item{condName}{The name of a condition in the \code{COTAN} object to further
separate the cells in more sub-groups. When no condition is given it is
assumed to be the same for all cells (no further sub-divisions)}

\item{conditions}{The \emph{conditions} to use. If given it will take precedence
over the one indicated by \code{condName}, which will then only indicate the
relevant column name in the returned \code{data.frame}}

\item{cellsCutoff}{\code{clean()} will delete from the \code{raw} data any gene that is
expressed in fewer cells than the cutoff times the total number of cells.
Default cutoff is \eqn{0.003 \; (0.3\%)}}

\item{genesCutoff}{\code{clean()} will delete from the \code{raw} data any cell that is
expressing fewer genes than the cutoff times the total number of genes.
Default cutoff is \eqn{0.002 \; (0.2\%)}}

\item{includePCA}{a \code{Boolean} flag to determine whether to calculate the
\emph{PCA} associated with the normalized matrix. When \code{FALSE} the first four
elements of the returned list will be \code{NULL}}

\item{pcaStdDev}{a \code{vector} with the standard deviations of the various
components}

\item{genePrefix}{Prefix for the mitochondrial genes (default "^MT-" for
Human, mouse "^mt-")}

\item{splitSamples}{Boolean. Whether to plot each sample in a different panel
(default \code{TRUE})}
}
\value{
\code{flagNotFullyExpressedGenes()} returns an array of Booleans with TRUE
for genes that are not fully-expressed

\code{flagNotFullyExpressingCells()} returns an array of Booleans with
TRUE for cells that are not expressing all genes

\code{getFullyExpressedGenes()} returns an array containing all genes
that are expressed in all cells

\code{getFullyExpressingCells()} returns an array containing all cells
that express all genes

\code{findFullyExpressedGenes()} returns the given \code{COTAN} object with
updated \strong{fully-expressed} genes' information

\code{findFullyExpressingCells()} returns the given \code{COTAN} object with
updated \strong{fully-expressing} cells' information

\code{setLambda()} returns the updated \code{COTAN} object

\code{setDispersion()} returns the updated \code{COTAN} object

\code{setNu()} returns the updated \code{COTAN} object

\code{dropGenesCells()} returns a completely new \code{COTAN} object with the
new raw data obtained after the indicated genes/cells were expunged. All
remaining data is dropped too, as it is no longer relevant to the restricted
matrix. Exceptions are:
\itemize{
\item the meta-data for the data-set that gets kept unchanged
\item the meta-data of genes/cells that gets restricted to the remaining
elements. The columns calculated via \code{estimate} and \code{find} methods are
dropped too
}

\code{ECDPlot()} returns an \code{ECD} plot

\code{clean()} returns the updated \code{COTAN} object

\code{cleanPlots()} returns a \code{list} of \code{ggplot2} plots:
\itemize{
\item \code{"pcaCells"} is for \code{PCA} cells
\item \code{"pcaCellsData"} is the data of the \code{PCA} cells (can be plotted)
\item \code{"genes"} is for \code{B} group cells' genes
\item \code{"UDE"} is for cells' \code{UDE} against their \code{PCA}
\item \code{"nu"} is for cell \code{nu}
\item \code{"zoomedNu"} is the same but zoomed on the left and with an estimate
for the low \code{nu} threshold that defines problematic cells
}

\code{screePlot()} returns a \code{ggplot2} plot for the explained variances

\code{cellSizePlot()} returns a \code{half-violin-boxplot} object

\code{genesSizePlot()} returns a \code{half-violin-boxplot} object

\code{mitochondrialPercentagePlot()} returns a \code{list} with:
\itemize{
\item \code{"plot"} a \code{half-violin-boxplot} object
\item \code{"sizes"} a sizes \code{data.frame}
}

\code{scatterPlot()} returns the scatter plot
}
\description{
These methods are to be used to clean the raw data; that is, to drop
any number of genes/cells that are too sparse or too present to allow
proper calibration of the \code{COTAN} model.

We call genes that are expressed in all cells \emph{Fully-Expressed} while cells
that express all genes in the data are called \emph{Fully-Expressing}. It has
been made quite easy to exclude the flagged genes/cells from the user
calculations when needed.
}
\details{
\code{flagNotFullyExpressedGenes()} returns a Boolean array with TRUE for
those genes that are not fully-expressed.

\code{flagNotFullyExpressingCells()} returns a Boolean vector with TRUE
for those cells that are not expressing all genes

\code{getFullyExpressedGenes()} returns the genes expressed in all cells
of the dataset

\code{getFullyExpressingCells()} returns the cells that did express
all genes of the dataset

\code{findFullyExpressedGenes()} determines the fully-expressed genes
inside the raw data

\code{findFullyExpressingCells()} determines the cells that are
expressing all genes in the dataset

\code{setLambda()} adds a column to the genes' metadata with the lambda
(genes' counts averages) for the given batch

\code{setDispersion()} adds a column to the genes' metadata with the
negative binomial dispersion factor for the given batch

\code{setNu()} stores the given \code{nu} values in the \code{COTAN} object

\code{dropGenesCells()} removes an array of genes and/or cells from the
current \code{COTAN} object.

\code{ECDPlot()} plots the \emph{Empirical Cumulative Distribution} function
of library sizes (\code{UMI} number). It helps to define where to drop "cells"
that are simple background signal.

\code{clean()} is the main method that can be used to check and clean the
dataset. With the default cutoffs it will discard any gene that has less than
3 non-zero counts per thousand cells and all cells expressing less than 2 per
thousand genes. It also produces and stores the estimators for \code{nu}

\code{cleanPlots()} creates the plots associated to the output of the
\code{\link[=clean]{clean()}} method.

\code{screePlot()} creates a plot showing the explained variance of the
components of a PCA

\code{cellSizePlot()} plots the raw library size for each cell and
sample.

\code{genesSizePlot()} plots the raw gene number (reads > 0) for each
cell and sample

\code{mitochondrialPercentagePlot()} plots the percentage of reads coming
from the mitochondrial genes (those matching \code{genePrefix}) for each cell
and sample.

\code{scatterPlot()} creates a plot that checks the relation between the
library size and the number of genes detected.
}
\examples{
## Example walk-through of the raw data cleaning helpers.
## NOTE(review): `zeallot` is attached here but none of the calls below use
## its destructuring assignment - confirm whether it is still needed
library(zeallot)

## Build a `COTAN` object from the `test.dataset` raw counts
data("test.dataset")
objCOTAN <- COTAN(raw = test.dataset)

## Drop the genes whose name starts with "MT" and the cells whose size is zero
genes.to.rem <- getGenes(objCOTAN)[grep('^MT', getGenes(objCOTAN))]
cells.to.rem <- getCells(objCOTAN)[which(getCellsSize(objCOTAN) == 0)]
objCOTAN <- dropGenesCells(objCOTAN, genes.to.rem, cells.to.rem)

## Run the main cleaning step with its default cutoffs and thresholds
objCOTAN <- clean(objCOTAN)

## Determine the fully-expressed genes, then retrieve the flags (TRUE marks
## the genes that are not fully-expressed)
objCOTAN <- findFullyExpressedGenes(objCOTAN)
goodPos <- flagNotFullyExpressedGenes(objCOTAN)

## Same for the fully-expressing cells; note that `goodPos` is overwritten
objCOTAN <- findFullyExpressingCells(objCOTAN)
goodPos <- flagNotFullyExpressingCells(objCOTAN)

## Names of the fully-expressed genes
feGenes <- getFullyExpressedGenes(objCOTAN)

## Names of the fully-expressing cells
feCells <- getFullyExpressingCells(objCOTAN)

## These plots might help to identify genes/cells that need to be dropped
ecdPlot <- ECDPlot(objCOTAN, yCut = 100.0)
plot(ecdPlot)

# This creates many informative plots useful to determine whether
# there is still something to drop...
# The result is a named list: each plot is retrieved by its name
clPlots <- cleanPlots(objCOTAN)
plot(clPlots[["pcaCells"]])
plot(clPlots[["UDE"]])
plot(clPlots[["zoomedNu"]])

## Raw library size per cell
lsPlot <- cellSizePlot(objCOTAN)
plot(lsPlot)

## Number of detected genes per cell
gsPlot <- genesSizePlot(objCOTAN)
plot(gsPlot)

## Only the "plot" element of the returned list is kept here.
## NOTE(review): the "g-0000" prefix presumably stands in for real
## mitochondrial gene names, which the test dataset may lack - confirm
mitPercPlot <-
  mitochondrialPercentagePlot(objCOTAN, genePrefix = "g-0000")[["plot"]]
plot(mitPercPlot)

## Library size against the number of detected genes
scPlot <- scatterPlot(objCOTAN)
plot(scPlot)

}
