% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/simplify_reanno.R
\name{simplify_reanno}
\alias{simplify_reanno}
\title{Generates one-hit hierarchial biotype annotation}
\usage{
simplify_reanno(
  input,
  hierarchy,
  mismatches = 2,
  bio_name = "Biotypes",
  merge_pac = FALSE,
  target_columns = NULL
)
}
\arguments{
\item{input}{Data.frame with biotype/mismatch matrix generated by
\code{add_reanno}. A PAC-list object can also be provided here where the
Anno data.frame will be used as input. Important, PAC objects need biotype
columns generated by \code{\link{add_reanno}}.}

\item{hierarchy}{List with character vectors. The vectors search terms
constructed as regular expressions that will be parsed to grepl() and
searched for in the mismatch biotype reannotation matrix (colnames:
mis0_bio, mis1_bio, mis2_bio, mis3_bio). Important \emph{hierarchy} is order
sensitive. For example if \emph{hierarchy} is defined as list(rRNA="rRNA",
miRNA="miRNA", piRNA="piRNA") a sequence annotated with 0 mismatches for all
three biotypes will only be reported as rRNA since this biotype was put
first in the list. Sequences annotated to neither of the biotypes listed in
\emph{hierarchy}, that is still annotated to something will be assigned as
"other". Sequences without an annotation down to the specified mismatch
level (\code{mismatches} will be assigned as "no_anno".}

\item{mismatches}{Integer indicating the number of allowed mismatches. Note
that this value can never be larger than the maximum number of mismatches
used in the reannotation workflow (default=0).}

\item{bio_name}{Character naming the final biotype column (default="Biotype")}

\item{merge_pac}{Logical whether the simplified annotation column should
automatically be added to the Anno object if a PAC list object were given as
input (default=FALSE).}

\item{target_columns}{Character vector naming the target columns for the
reannotation matrix. When \code{target_columns=NULL}(default), the
function assumes that one (and only one) reannotation matrix for biotypes
generatated by \code{\link{add_reanno}} is present in anno(PAC) (colnames =
mis0_bio, mis1_bio, mis2_bio etc). With \code{target_columns} two
alternative reannotation matrices can be discriminated.}
}
\value{
Character vector with single best-hit biotypes with fewest mismatches.
}
\description{
\code{simplify_reanno} adds hierarchical classification to PAC.
}
\details{
This function deals with and can be used to explores the effect of
multimapping. Given a biotype reannotation matrix (colnames = mis0_bio,
mis1_bio, mis2_bio etc) created by \code{\link{add_reanno}} and usually added
to the Anno data.frame of a PAC object, the function will generated a one-hit
biotype vector according to a user-defined biotype hierarchy.
}
\examples{
#' ######################################################### 
##### hierarchical classification using simplify_reanno
closeAllConnections()
######################################################### 
##### Create an reanno object

### First, if you haven't already generated Bowtie indexes for the included
# fasta references you need to do so. If you are having problem see the small
# RNA guide (vignette) for more info.
                                                             
 ## tRNA:
 trna_file <- system.file("extdata/trna", "tRNA.fa", 
                          package = "seqpac", mustWork = TRUE)
 trna_dir <- dirname(trna_file)
 
 if(!sum(stringr::str_count(list.files(trna_dir), ".ebwt")) ==6){     
     Rbowtie::bowtie_build(trna_file, 
                           outdir=trna_dir, 
                           prefix="tRNA", force=TRUE)
                           }
 ## rRNA:
 rrna_file <- system.file("extdata/rrna", "rRNA.fa", 
                          package = "seqpac", mustWork = TRUE)
 rrna_dir <- dirname(rrna_file)
 
 if(!sum(stringr::str_count(list.files(rrna_dir), ".ebwt")) ==6){
     Rbowtie::bowtie_build(rrna_file, 
                           outdir=rrna_dir, 
                           prefix="rRNA", force=TRUE)
                           }
                           
                           
##  Then load a PAC-object and remove previous mapping from anno:
 load(system.file("extdata", "drosophila_sRNA_pac_filt_anno.Rdata", 
                   package = "seqpac", mustWork = TRUE))
 anno(pac) <- anno(pac)[,1, drop = FALSE]

                                    
##  You may add an output path of your choice, but here we use a temp folder:
 output <- paste0(tempdir(),"/seqpac/test")
 ref_paths <- list(trna= trna_file, rrna= rrna_file)
 
##  Then map the PAC-object against the fasta references. 
# Warning: if you use your own data, you may want to use override=FALSE, to avoid
# deleting previous mapping by mistake.

map_reanno(pac, input=ref_paths, output=output,
               type="internal", mismatches=1,  import="biotype", 
               threads=2, keep_temp=FALSE, override=TRUE)
 
##  Then import and generate a reanno-object of the temporary bowtie-files
reanno_biotype <- make_reanno(output, PAC=pac, mis_fasta_check = TRUE)
 
                                    
## Now make some search terms against reference names to create shorter names
# Theses can be used to create factors in downstream analysis
# One search hit (regular expressions) gives one new short name 

bio_search <- list(
              rrna=c("5S", "5.8S", "12S", "16S", "18S", "28S", "pre_S45"),
              trna =c("_tRNA", "mt:tRNA"))
 
 
## You can merge directly with your PAC-object by adding your original 
# PAC-object, that you used with map_reanno, to merge_pac option.

pac <- add_reanno(reanno_biotype, bio_search=bio_search, 
                       type="biotype", bio_perfect=FALSE, 
                       mismatches = 1, merge_pac=pac)

## Hierarchical classification with simplify_reanno
table(anno(pac)$mis0_bio)
table(anno(pac)$mis1_bio)

# Setup hierarchy(rRNA prioritized before tRNA) 
hierarchy_1 <- list(rRNA="rrna_",
                  tRNA="trna_")

# Setup hierarchy(specifies the hierarchy among rRNA)
hierarchy_2 <- list(r18S="rrna_18S",
                    r28S="rrna_28S",
                    r5.8S="rrna_5.8S",
                    r5S="rrna_5S",
                    pre45S="pre_S45",
                    other_rrna="rrna_other") 

# No mistmach for hierarchy_1:
test1 <- simplify_reanno(input=pac, hierarchy=hierarchy_1, mismatches=0, 
                      bio_name="Biotypes_mis0", merge_pac=FALSE)
                      
# Up to 2 mistmaches for rRNA hierarchy_2                      
test2 <- simplify_reanno(input=pac, hierarchy=hierarchy_2, mismatches=1, 
                      bio_name="rRNA_biotypes_mis2", merge_pac=FALSE)


table(test1$Biotypes_mis0) 
table(test2$rRNA_biotypes_mis2)
                     
# You can add the new column to your PAC-object using merge_pac=TRUE:
pac <- simplify_reanno(input=pac, hierarchy=hierarchy_2, mismatches=1, 
                      bio_name="rRNA_biotypes_mis1", merge_pac=TRUE)

table(anno(pac)$rRNA_biotypes_mis1)

}
\seealso{
\url{http://bowtie-bio.sourceforge.net/index.shtml} for information about
 Bowtie \url{https://github.com/OestLab/seqpac} for updates on the current package.

Other PAC reannotation: 
\code{\link{PAC_annotate}()},
\code{\link{add_reanno}()},
\code{\link{as.reanno}()},
\code{\link{import_reanno}()},
\code{\link{make_conv}()},
\code{\link{make_reanno}()},
\code{\link{map_reanno}()}
}
\concept{PAC reannotation}
