% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/compare2Sequences.R
\name{compare2Sequences}
\alias{compare2Sequences}
\title{Compare two input sequences/sequence sets for possible guide RNAs (gRNAs)}
\usage{
compare2Sequences(
  inputFile1Path = NULL,
  inputFile2Path = NULL,
  inputNames = c("Seq1", "Seq2"),
  format = c("fasta", "fasta"),
  header = FALSE,
  findgRNAsWithREcutOnly = FALSE,
  searchDirection = c("both", "1to2", "2to1"),
  BSgenomeName = NULL,
  baseEditing = FALSE,
  targetBase = "C",
  editingWindow = 4:8,
  editingWindow.offtargets = 4:8,
  REpatternFile = REpatternFile_default(),
  minREpatternSize = 6,
  findgRNAs = c(TRUE, TRUE),
  removegRNADetails = c(FALSE, FALSE),
  exportAllgRNAs = c("no", "all", "fasta", "genbank"),
  annotatePaired = FALSE,
  overlap.gRNA.positions = c(17, 18),
  findPairedgRNAOnly = FALSE,
  min.gap = 0,
  max.gap = 20,
  gRNA.name.prefix = "_gR",
  PAM.size = 3,
  gRNA.size = 20,
  PAM = "NGG",
  PAM.pattern = "NNG$|NGN$",
  allowed.mismatch.PAM = 1,
  max.mismatch = 3,
  outputDir = NULL,
  upstream = 0,
  downstream = 0,
  weights = weights_default,
  overwrite = FALSE,
  baseBeforegRNA = 4,
  baseAfterPAM = 3,
  featureWeightMatrixFile = featureWeightMatrixFile_default(),
  foldgRNAs = FALSE,
  gRNA.backbone = gRNA.backbone_default,
  temperature = 37,
  scoring.method = c("Hsu-Zhang", "CFDscore"),
  subPAM.activity = subPAM.activity_default,
  subPAM.position = c(22, 23),
  PAM.location = "3prime",
  rule.set = c("Root_RuleSet1_2014", "Root_RuleSet2_2016", "CRISPRscan", "DeepCpf1"),
  mismatch.activity.file = mismatch.activity.file_default()
)
}
\arguments{
\item{inputFile1Path}{Sequence input file 1 path that contains one of the
two sequences to be searched for potential gRNAs. It can also be a
DNAStringSet object with names field set. Please see examples below.}

\item{inputFile2Path}{Sequence input file 2 path that contains one of the
two sequences to be searched for potential gRNAs. It can also be a
DNAStringSet object with names field set. Please see examples below.}

\item{inputNames}{Name of the input sequences when inputFile1Path and
inputFile2Path are DNAStringSet instead of file path}

\item{format}{Format of the input files, fasta, fastq and bed format are
supported, default fasta}

\item{header}{Indicate whether the input file contains header, default
FALSE, only applies to bed format}

\item{findgRNAsWithREcutOnly}{Indicate whether to find gRNAs overlap with
restriction enzyme recognition pattern}

\item{searchDirection}{Indicate whether perfrom gRNA in both sequences and
off-target search against each other (both) or search gRNA in input1 and
off-target analysis in input2 (1to2), or vice versa (2to1)}

\item{BSgenomeName}{BSgenome object. Please refer to available.genomes in
BSgenome package. For example, BSgenome.Hsapiens.UCSC.hg19 for hg19,
BSgenome.Mmusculus.UCSC.mm10 for mm10, BSgenome.Celegans.UCSC.ce6 for ce6,
BSgenome.Rnorvegicus.UCSC.rn5 for rn5, BSgenome.Drerio.UCSC.danRer7 for Zv9,
and BSgenome.Dmelanogaster.UCSC.dm3 for dm3}

\item{baseEditing}{Indicate whether to design gRNAs for base editing.
Default to FALSE If TRUE, please set baseEditing = TRUE, targetBase and
editingWidow accordingly.}

\item{targetBase}{Applicable only when baseEditing is set to TRUE. It is
used to indicate the target base for base editing systems, default to C for
converting C to T in the CBE system. Please change it to A if you intend to
use the ABE system.}

\item{editingWindow}{Applicable only when baseEditing is set to TRUE. It is
used to indicate the effective editing window, default to 4 to 8 which is
for the original CBE system. Please change it accordingly if the system you
use have a different editing window.}

\item{editingWindow.offtargets}{Applicable only when baseEditing is set to
TRUE. It is used to indicate the effective editing window to consider for
the offtargets search only, default to 4 to 8 which is for the original CBE
system. Please change it accordingly if the system you use have a different
editing window, or you would like to include offtargets with the target base
in a larger editing window.}

\item{REpatternFile}{File path containing restriction enzyme cut patters}

\item{minREpatternSize}{Minimum restriction enzyme recognition pattern
length required for the enzyme pattern to be searched for, default 6}

\item{findgRNAs}{Indicate whether to find gRNAs from the sequences in the
input file or skip the step of finding gRNAs, default TRUE for both input
sequences.  Set it to FALSE if the input file contains user selected gRNAs
plus PAM already.}

\item{removegRNADetails}{Indicate whether to remove the detailed gRNA
information such as efficacy file and restriction enzyme cut sites, default
false for both input sequences. Set it to TRUE if the input file contains
the user selected gRNAs plus PAM already.}

\item{exportAllgRNAs}{Indicate whether to output all potential gRNAs to a
file in fasta format, genbank format or both. Default to no.}

\item{annotatePaired}{Indicate whether to output paired information, default
to FALSE}

\item{overlap.gRNA.positions}{The required overlap positions of gRNA and
restriction enzyme cut site, default 17 and 18}

\item{findPairedgRNAOnly}{Choose whether to only search for paired gRNAs in
such an orientation that the first one is on minus strand called reverse
gRNA and the second one is on plus strand called forward gRNA. TRUE or
FALSE, default FALSE}

\item{min.gap}{Minimum distance between two oppositely oriented gRNAs to be
valid paired gRNAs. Default 0}

\item{max.gap}{Maximum distance between two oppositely oriented gRNAs to be
valid paired gRNAs. Default 20}

\item{gRNA.name.prefix}{The prefix used when assign name to found gRNAs,
default _gR, short for guided RNA.}

\item{PAM.size}{PAM length, default 3}

\item{gRNA.size}{The size of the gRNA, default 20}

\item{PAM}{PAM sequence after the gRNA, default NGG}

\item{PAM.pattern}{Regular expression of PAM, default NNG or NGN for spCas9.
For cpf1, ^TTTN since it is a 5 prime PAM sequence}

\item{allowed.mismatch.PAM}{Maximum number of mismatches allowed to the PAM
sequence, default to 1 for PAM.pattern NNG or NGN PAM}

\item{max.mismatch}{Maximum mismatch allowed to search the off targets in
the other sequence, default 3}

\item{outputDir}{the directory where the sequence comparison results will be
written to}

\item{upstream}{upstream offset from the bed input starts to search for gRNA
and/or offtargets, default 0}

\item{downstream}{downstream offset from the bed input ends to search for
gRNA and/or offtargets, default 0}

\item{weights}{numeric vector size of gRNA length, default c(0, 0, 0.014, 0,
0, 0.395, 0.317, 0, 0.389, 0.079, 0.445, 0.508, 0.613, 0.851, 0.732, 0.828,
0.615, 0.804, 0.685, 0.583) which is used in Hsu et al., 2013 cited in the
reference section}

\item{overwrite}{overwrite the existing files in the output directory or
not, default TRUE}

\item{baseBeforegRNA}{Number of bases before gRNA used for calculating gRNA
efficiency, default 4 Please note, for PAM located on the 5 prime, need to
specify the number of bases before the PAM sequence plus PAM size.}

\item{baseAfterPAM}{Number of bases after PAM used for calculating gRNA
efficiency, default 3 for spCas9 Please note, for PAM located on the 5
prime, need to include the length of the gRNA plus the extended sequence on
the 3 prime}

\item{featureWeightMatrixFile}{Feature weight matrix file used for
calculating gRNA efficiency. By default DoenchNBT2014 weight matrix is used.
To use alternative weight matrix file, please input a csv file with first
column containing significant features and the second column containing the
corresponding weights for the features. Please see Doench et al., 2014 for
details.}

\item{foldgRNAs}{Default FALSE. If set to TRUE, summary file will contain
minimum free energy of the secondary structure of gRNA with gRNA backbone
from GeneRfold package provided that GeneRfold package has been installed.}

\item{gRNA.backbone}{gRNA backbone constant region sequence. Default to the
sequence in Sp gRNA backbone.}

\item{temperature}{temperature in celsius. Default to 37 celsius.}

\item{scoring.method}{Indicates which method to use for offtarget cleavage
rate estimation, currently two methods are supported, Hsu-Zhang and CFDscore}

\item{subPAM.activity}{Applicable only when scoring.method is set to
CFDscore A hash to represent the cleavage rate for each alternative sub PAM
sequence relative to preferred PAM sequence}

\item{subPAM.position}{Applicable only when scoring.method is set to
CFDscore The start and end positions of the sub PAM. Default to 22 and 23
for SP with 20bp gRNA and NGG as preferred PAM}

\item{PAM.location}{PAM location relative to gRNA. For example, spCas9 PAM
is located on the 3 prime (3prime) while cpf1 PAM is located on the 5 prime
(5prime)}

\item{rule.set}{Specify a rule set scoring system for calculating gRNA
efficacy. Please note that Root_RuleSet2_2016 requires the following python
packages with specified verion and python 2.7.  1. scikit-learn 0.16.1 2.
pickle 3. pandas 4. numpy 5. scipy}

\item{mismatch.activity.file}{Applicable only when scoring.method is set to
CFDscore A comma separated (csv) file containing the cleavage rates for all
possible types of single nucleotide mismatche at each position of the gRNA.
By default, using the supplemental Table 19 from Doench et al., Nature
Biotechnology 2016}
}
\value{
Return a data frame with all potential gRNAs from both sequences. In
addition, a tab-delimited file `scoresFor2InputSequences.xlsx` is also saved in
the `outputDir`, sorted by `scoreDiff` descending.
\itemize{
 \item name - name of the gRNA
 \item gRNAPlusPAM - gRNA plus PAM sequence
 \item targetInSeq1 - target/off-target sequence including PAM in the 1st
 input sequence file
 \item targetInSeq2 - target/off-target sequence including PAM in the 2nd 
 input sequence file
 \item guideAlignment2Offtarget - alignment of gRNA to the other input 
 sequence (off-target sequence)
 \item offTargetStrand - strand of the other sequence (off-target sequence) 
 the gRNA aligns to
 \item scoreForSeq1 - score for the target sequence in the 1st input 
 sequence file
 \item scoreForSeq2 - score for the target sequence in the 2nd input 
 sequence file
 \item mismatch.distance2PAM - distances of mismatch to PAM, e.g., 14 
 means the mismatch is 14 bp away from PAM
 \item n.mismatch - number of mismatches between the off-target and the gRNA
 \item targetSeqName - the name of the input sequence where the target 
 sequence is located
 \item scoreDiff - scoreForSeq1 - scoreForSeq2
 \item bracket.notation - folded gRNA in bracket notation
 \item mfe.sgRNA - minimum free energy of sgRNA
 \item mfe.diff - mfe.sgRNA - mfe.backbone
 \item mfe.backbone - minimum free energy of the gRNA backbone by itself
}
}
\description{
Generate all possible guide RNAs (gRNAs) for two input sequences, or two
sets of sequences, and generate scores for potential off-targets in the other
sequence.
}
\examples{

    library(CRISPRseek)
    inputFile1Path <- system.file("extdata", "rs362331T.fa",
            package = "CRISPRseek")
    inputFile2Path <- system.file("extdata", "rs362331C.fa",
            package = "CRISPRseek")
    REpatternFile <- system.file("extdata", "NEBenzymes.fa", 
            package = "CRISPRseek")
    outputDir <- tempdir()
    seqs <- compare2Sequences(inputFile1Path, inputFile2Path,
        outputDir = outputDir, 
        REpatternFile = REpatternFile, overwrite = TRUE)

    seqs2 <- compare2Sequences(inputFile1Path, inputFile2Path,
               inputNames=c("Seq1", "Seq2"),
               scoring.method = "CFDscore",
               outputDir = outputDir, 
               overwrite = TRUE, baseEditing = TRUE)

    inputFile1Path <- 
DNAStringSet(
"TAATATTTTAAAATCGGTGACGTGGGCCCAAAACGAGTGCAGTTCCAAAGGCACCCACCTGTGGCAG"
)
    ## when set inputFile1Path to a DNAStringSet object, it is important
    ## to call names
    names(inputFile1Path) <- "seq1"
    
    inputFile2Path <- 
DNAStringSet(
"TAATATTTTAAAATCGGTGACGTGGGCCCAAAACGAGTGCAGTTCCAAAGGCACCCACCTGTGGCAG"
)
     ## when set inputFile2Path to a DNAStringSet object, it is important 
    ## to call names

    names(inputFile2Path) <- "seq2"

    seqs <- compare2Sequences(inputFile1Path, inputFile2Path,
          inputNames=c("Seq1", "Seq2"),
          scoring.method = "CFDscore",
          outputDir = outputDir, 
          overwrite = TRUE)

    seqs2 <- compare2Sequences(inputFile1Path, inputFile2Path,
               inputNames=c("Seq1", "Seq2"),
               scoring.method = "CFDscore",
               outputDir = outputDir, 
               overwrite = TRUE, baseEditing = TRUE)
}
\references{
Patrick D Hsu, David A Scott, Joshua A Weinstein, F Ann Ran,
Silvana Konermann, Vineeta Agarwala, Yinqing Li, Eli J Fine, Xuebing Wu,
Ophir Shalem, Thomas J Cradick, Luciano A Marraffini, Gang Bao & Feng Zhang
(2013) DNA targeting specificity of rNA-guided Cas9 nucleases. Nature
Biotechnology 31:827-834
}
\seealso{
CRISPRseek
}
\author{
Lihua Julie Zhu
}
\keyword{misc}
