#' filter off-targets and generate reports.
#'
#' Filter off-targets that meet the criteria set by users, such as minimum score
#' and topN. In addition, off-targets are annotated with flank sequence, gRNA
#' cleavage efficiency and whether they are inside an exon or not if
#' fetchSequence is set to TRUE and annotateExon is set to TRUE.
#'
#' @param scores a data frame output from getOfftargetScore. It contains
#' details of off-targets for the given gRNA, with the following columns:
#' \itemize{
#'  \item strand - strand of the off-target, + for plus and - for minus strand
#'  \item chrom - chromosome of the off-target
#'  \item chromStart - start position of the off-target
#'  \item chromEnd - end position of the off-target
#'  \item name - gRNA name
#'  \item gRNAPlusPAM - gRNA sequence with PAM sequence concatenated
#'  \item OffTargetSequence - the genomic sequence of the off-target
#'  \item n.mismatch - number of mismatches between the off-target and the gRNA
#'  \item forViewInUCSC - string for viewing in UCSC genome browser, e.g., 
#'  chr14:31665685-31665707
#'  \item score - score of the off-target
#'  \item mismatch.distance2PAM - comma-separated distances of all mismatches to 
#'  PAM, e.g., 14,11 means one mismatch is 14 bp away from PAM and the other 
#'  mismatch is 11 bp away from PAM
#'  \item alignment - alignment between gRNA and off-target, 
#'  e.g., ......G..C.......... means that this off-target aligns with gRNA 
#'  except that G and C are mismatches
#'  \item NGG - whether this off-target contains canonical PAM (1 for yes, 
#'  0 for no)
#'  \item mean.neighbor.distance.mismatch - mean distance between neighboring 
#'  mismatches
#' }
#' @param min.score minimum score of an off target to be included in the final
#' output, default 0.01
#' @param topN top N off targets to be included in the final output, default
#' 200
#' @param topN.OfftargetTotalScore top N off target used to calculate the total
#' off target score, default 10
#' @param annotateExon Choose whether or not to indicate whether the off target
#' is inside an exon or not, default TRUE
#' @param txdb TxDb object, for creating and using TxDb object, please refer to
#' GenomicFeatures package. For a list of existing TxDb object, please search
#' for annotation package starting with Txdb at
#' http://www.bioconductor.org/packages/release/BiocViews.html#___AnnotationData,
#' such as TxDb.Rnorvegicus.UCSC.rn5.refGene for rat,
#' TxDb.Mmusculus.UCSC.mm10.knownGene for mouse,
#' TxDb.Hsapiens.UCSC.hg19.knownGene for human,
#' TxDb.Dmelanogaster.UCSC.dm3.ensGene for Drosophila and
#' TxDb.Celegans.UCSC.ce6.ensGene for C.elegans
#' @param orgAnn organism annotation mapping such as org.Hs.egSYMBOL in
#' org.Hs.eg.db package for human
#' @param ignore.strand default to TRUE
#' @param outputDir the directory where the off target analysis and reports
#' will be written to
#' @param oneFilePergRNA write to one file for each gRNA or not, default to
#' FALSE
#' @param fetchSequence Fetch flank sequence of off target or not, default TRUE
#' @param upstream upstream offset from the off target start, default 200
#' @param downstream downstream offset from the off target end, default 200
#' @param BSgenomeName BSgenome object. Please refer to available.genomes in
#' BSgenome package. For example,
#' \itemize{
#'  \item BSgenome.Hsapiens.UCSC.hg19 - for hg19
#'  \item BSgenome.Mmusculus.UCSC.mm10 - for mm10
#'  \item BSgenome.Celegans.UCSC.ce6 - for ce6
#'  \item BSgenome.Rnorvegicus.UCSC.rn5 - for rn5
#'  \item BSgenome.Dmelanogaster.UCSC.dm3 - for dm3
#' }
#' @param genomeSeqFile Other than BSgenomeName, a custom FASTA file can be
#' supplied; if set, it overrides BSgenomeName.
#' @param baseBeforegRNA Number of bases before gRNA used for calculating gRNA
#' efficiency, default 4
#' @param baseAfterPAM Number of bases after PAM used for calculating gRNA
#' efficiency, default 3
#' @param gRNA.size The size of the gRNA, default 20 for spCas9
#' @param PAM.location PAM location relative to gRNA. For example, spCas9 PAM
#' is located on the 3 prime while cpf1 PAM is located on the 5 prime
#' @param PAM.size PAM length, default 3 for spCas9
#' @param featureWeightMatrixFile Feature weight matrix file used for
#' calculating gRNA efficiency. By default DoenchNBT2014 weight matrix is used.
#' To use alternative weight matrix file, please input a csv file with first
#' column containing significant features and the second column containing the
#' corresponding weights for the features. Please see Doench et al., 2014 for
#' details.
#' @param rule.set Specify a rule set scoring system for calculating gRNA
#' efficacy.
#' @param chrom_acc Optional binary variable indicating chromatin accessibility
#' information with 1 indicating accessible and 0 not accessible.
#' @param calculategRNAefficacyForOfftargets Default to TRUE to output gRNA
#' efficacy for offtargets as well as ontargets. Set it to FALSE if only need
#' gRNA efficacy calculated for ontargets only to speed up the analysis. Please
#' refer to https://support.bioconductor.org/p/133538/#133661 for potential use
#' cases of offtarget efficacies.
#' @return A list with two elements:
#' \itemize{
#'  \item offtargets - a data frame with off-target analysis results
#'  \item summary - a data frame with summary of the off-target analysis results
#' }
#' @author Lihua Julie Zhu
#' @seealso offTargetAnalysis
#' @references Doench JG, Hartenian E, Graham DB, Tothova Z, Hegde M, Smith I,
#' Sullender M, Ebert BL, Xavier RJ, Root DE. Rational design of highly active
#' sgRNAs for CRISPR-Cas9-mediated gene inactivation. Nat Biotechnol. 2014 Sep
#' 3. doi: 10.1038 nbt.3026 Lihua Julie Zhu, Benjamin R. Holmes, Neil Aronin
#' and Michael Brodsky. CRISPRseek: a Bioconductor package to identify
#' target-specific guide RNAs for CRISPR-Cas9 genome-editing systems. Plos One
#' Sept 23rd 2014
#' @keywords misc
#' @examples
#' 
#' library(CRISPRseek)
#' library(BSgenome.Hsapiens.UCSC.hg19)
#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' library(org.Hs.eg.db)
#' 
#' hitsFile <-  system.file("extdata", "hits.txt", package = "CRISPRseek")
#' hits <- read.table(hitsFile, sep = "\t", 
#'                    header = TRUE, 
#'                    stringsAsFactors = FALSE)
#' featureVectors <- buildFeatureVectorForScoring(hits)
#' scores <- getOfftargetScore(featureVectors)
#' 
#' outputDir <- tempdir()
#' results <- filterOffTarget(scores, 
#'                            BSgenomeName = Hsapiens, 
#'                            txdb = TxDb.Hsapiens.UCSC.hg19.knownGene,
#'                            orgAnn = org.Hs.egSYMBOL, 
#'                            outputDir = outputDir,
#'                            min.score = 0.1, 
#'                            topN = 10, 
#'                            topN.OfftargetTotalScore = 5)
#' results$offtargets
#' results$summary
#' 
#' @importFrom utils read.csv
#' @importFrom BiocGenerics unlist cbind
#' @importFrom BSgenome getSeq
#' @importFrom S4Vectors merge
#' @importFrom Seqinfo seqlengths
#' @importFrom openxlsx write.xlsx read.xlsx
#' @importFrom rlang warn
#' @export
filterOffTarget <- function(scores = NULL, 
                            min.score = 0.01, 
                            topN = 200, 
                            topN.OfftargetTotalScore = 10,
                            annotateExon = TRUE, 
                            txdb = NULL, 
                            orgAnn = NULL, 
                            ignore.strand = TRUE, 
                            outputDir = NULL, 
                            oneFilePergRNA = FALSE,
                            fetchSequence = TRUE, 
                            upstream = 200, 
                            downstream = 200, 
                            BSgenomeName = NULL,
                            genomeSeqFile = NULL,
                            baseBeforegRNA = 4, 
                            baseAfterPAM = 3, 
                            gRNA.size = 20, 
                            PAM.location = "3prime", 
                            PAM.size = 3, 
                            featureWeightMatrixFile = featureWeightMatrixFile_default(),
                            rule.set = c("Root_RuleSet1_2014", "Root_RuleSet2_2016", "CRISPRscan", "DeepCpf1"),
                            chrom_acc = NULL,
                            calculategRNAefficacyForOfftargets = TRUE) {
  # Workflow:
  #   1. validate the arguments and filter `scores` by `min.score`;
  #   2. per gRNA, keep the top hits, accumulate the summary scores and
  #      optionally write one workbook per gRNA;
  #   3. write Summary.xlsx;
  #   4. optionally annotate hits, compute gRNA efficacy, fetch flank sequence;
  #   5. write OfftargetAnalysis.xlsx and return both tables in a list.

  # Params validations:
  rule.set <- match.arg(rule.set)

  if (is.null(scores)) {
    stop("scores is required: please supply the data frame returned by getOfftargetScore!")
  }
  if (!all(c("name", "score") %in% colnames(scores))) {
    stop("scores must contain the columns 'name' and 'score'!")
  }

  # The weight matrix is only needed by some rule sets, so a missing file is
  # tolerated here and reported at the point of use instead.
  featureWeightMatrix <- NULL
  if (is.character(featureWeightMatrixFile) &&
      length(featureWeightMatrixFile) == 1 &&
      !is.na(featureWeightMatrixFile) &&
      featureWeightMatrixFile != "" &&
      file.exists(featureWeightMatrixFile)) {
    featureWeightMatrix <- read.csv(featureWeightMatrixFile, header = TRUE)
  }

  # A BSgenome object is only mandatory when no custom FASTA file is given,
  # because genomeSeqFile takes precedence over BSgenomeName below.
  if (fetchSequence && is.null(genomeSeqFile) && !inherits(BSgenomeName, "BSgenome")) {
    stop("To fetch sequences, BSgenomeName is required as BSgenome object!")
  }

  if (annotateExon && !inherits(txdb, c("TxDb", "TranscriptDb"))) {
    stop("To indicate whether an offtarget is inside an exon, txdb is required as TxDb object!")
  }

  if (annotateExon && is.null(orgAnn)) {
    warn("The orgAnn parameter was not specified. Please refer to the manual for instructions on using orgAnn to generate gene identifiers in the offTarget output file.")
  }

  # Filtering offTargets:
  # Rows with a missing score are dropped explicitly; a bare logical subset
  # would turn them into all-NA rows.
  keep <- !is.na(scores$score) & scores$score >= min.score
  scores <- scores[keep, , drop = FALSE]
  if (nrow(scores) == 0) {
    stop("No off-target in scores has a score >= min.score (", min.score, "); nothing to report.")
  }
  # Per-position indicator columns are not part of the report.
  isPositionFlag <- grepl("IsMismatch.pos|IsDeletion.pos|IsInsertion.pos", colnames(scores))
  scores <- scores[, !isPositionFlag, drop = FALSE]

  # Only touch the output directory once the inputs are known to be usable.
  prepOutputDir(outputDir, overwrite = TRUE)

  OfftargetFile <- file.path(outputDir, "OfftargetAnalysis.xlsx")
  OfftargetSummary <- file.path(outputDir, "Summary.xlsx")

  gRNAsPlusPAM <- unique(scores$name)
  names <- gRNAsPlusPAM

  top5OfftargetTotalScore <- numeric(length(names))
  topNOfftargetTotalScore <- top5OfftargetTotalScore

  # Character matrix; the column names come from the variable names used here
  # and are relied upon when building the summary data frame below.
  temp <- cbind(names, gRNAsPlusPAM, top5OfftargetTotalScore, topNOfftargetTotalScore)
  mismatch.distance2PAM <- matrix(ncol = 11, nrow = length(names))

  # Columns kept in the per-hit report (only those present in `scores`).
  col_all <- c("name", 
               "gRNAPlusPAM",
               "OffTargetSequence",
               "score",
               "n.mismatch",
               "mismatch.distance2PAM",
               "alignment",
               "NGG",
               "forViewInUCSC",
               "strand",
               "chrom",
               "chromStart",
               "chromEnd",
               "gRNAPlusPAM_bulge",
               "OffTargetSequence_bulge",
               "n.RNABulge",
               "n.DNABulge",
               "gRNA.insertion",
               "gRNA.deletion")

  # Collect per-gRNA tables and bind once instead of growing a data frame.
  perGuide <- vector("list", length(gRNAsPlusPAM))
  for (i in seq_along(gRNAsPlusPAM)) {
    this.score <- scores[scores$name == gRNAsPlusPAM[i], ]
    this.score <- this.score[order(this.score$score, this.score$n.mismatch, decreasing = c(TRUE, FALSE)), ]

    # Fill up top5OfftargetTotalScore, topNOfftargetTotalScore:
    # topN + 1 rows are kept so that the on-target (if any) does not use up
    # one of the topN off-target slots.
    maxN <- min(topN + 1, dim(this.score)[1])
    this.score <- this.score[seq_len(maxN), ]
    maxN.totalScore <- min(maxN, (topN.OfftargetTotalScore + 1))

    if (this.score$n.mismatch[1] == 0 && as.numeric(as.character(this.score$NGG[1])) == 1) {
      # The first row is the on-target: skip it when summing off-target scores.
      start.ind <- 2
      end.ind <- min(maxN, 6)
      end.forSummary <- 11
    } else {
      # NOTE(review): the decrements below look like they drop the last real
      # off-target from the totals when fewer than topN + 1 rows exist.
      # Behaviour is kept as is, since changing it alters reported scores.
      start.ind <- 1
      maxN <- maxN - 1
      maxN.totalScore <- maxN.totalScore - 1
      end.forSummary <- 10
      end.ind <- min(maxN, 5)
    }
    temp[i, 3] <- sum(this.score$score[start.ind:end.ind])
    temp[i, 4] <- sum(this.score$score[start.ind:min(maxN, maxN.totalScore)])

    # Fill up mismatch.distance2PAM:
    temp[i, 2] <- unique(this.score$gRNAPlusPAM)
    # Column 1 flags whether the top hit is a perfect match; the whole row is
    # filled first and columns 2:11 are overwritten just below.
    mismatch.distance2PAM[i, ] <- ifelse(as.character(this.score$mismatch.distance2PAM[1]) == "", "NMM", "perfect match not found")
    # end.forSummary is 10 if no on-target found, otherwise 11

    forSummary <- this.score[start.ind:end.forSummary, ]
    forSummary <- forSummary[order(forSummary$score, decreasing = TRUE), ]
    mismatch.distance2PAM[i, 2:11] <- as.character(forSummary$mismatch.distance2PAM)
    if (dim(forSummary)[1] < 10) {
      mismatch.distance2PAM[i, (dim(forSummary)[1] + 1):11] <- "NA"
    }

    # Output OfftargetAnalysis_per_gRNA.xlsx:
    this.score <- this.score[, intersect(col_all, names(this.score))]
    this.score$mismatch.distance2PAM <- as.character(this.score$mismatch.distance2PAM)
    this.score$NGG <- as.character(this.score$NGG)

    if (oneFilePergRNA && dim(this.score)[1] > 0) {
      write.xlsx(this.score[!is.na(this.score[, grep("score", colnames(this.score))]), ],
                 file = file.path(outputDir, paste0("OfftargetAnalysis-", as.character(temp[i, 1]), ".xlsx")),
                 rowNames = FALSE)
    }

    if (dim(this.score)[1] > 0) {
      perGuide[[i]] <- this.score
    }
  }
  perGuide <- perGuide[!vapply(perGuide, is.null, logical(1))]
  Offtargets <- if (length(perGuide) > 0) do.call(rbind, perGuide) else data.frame()

  # Output Summary.xlsx:
  temp <- cbind(temp, mismatch.distance2PAM)
  colnames(temp)[5] <- "top1Hit.onTarget.MMdistance2PAM"
  colnames(temp)[4] <- paste0("top", topN.OfftargetTotalScore, "OfftargetTotalScore")
  colnames(temp)[6:15] <- paste0("topOfftarget", 1:10, "MMdistance2PAM")
  temp <- as.data.frame(temp)
  temp$top5OfftargetTotalScore <- as.numeric(temp$top5OfftargetTotalScore)
  temp[, colnames(temp)[4]] <- as.numeric(temp[, colnames(temp)[4]])
  write.xlsx(temp, file = OfftargetSummary, rowNames = FALSE)

  # Annotate offTargets:
  if (annotateExon) {
    Offtargets <- annotateOffTargets(Offtargets, txdb, orgAnn, ignore.strand)
  }

  # Calculate gRNA efficiency:
  ## Calculate for ontargets and offtargets depending on user selection:
  ontargets <- Offtargets[grepl("^\\.+$", Offtargets$alignment), ]
  ontargets <- unique(subset(ontargets, ontargets$n.mismatch == 0))
  targets <- data.frame()
  if (!calculategRNAefficacyForOfftargets && dim(ontargets)[1] > 0) {
    targets <- ontargets
  } else if (calculategRNAefficacyForOfftargets && dim(Offtargets)[1] > 0) {
    targets <- Offtargets
  }

  # Read the custom FASTA once, and only if a sequence lookup will happen.
  genomeSeq <- NULL
  if (!is.null(genomeSeqFile) && (dim(targets)[1] > 0 || fetchSequence)) {
    genomeSeq <- readDNAStringSet(genomeSeqFile)
  }

  # Fetch the sequence for each (chr, Start, End, strand) window after
  # clipping it to [1, chromosome length]. Shared by the efficacy and the
  # flank-sequence steps so that both look up chromosome lengths per row.
  fetchClippedSeq <- function(chr, Start, End, strand) {
    starts <- pmax(Start, 1)
    if (is.null(genomeSeqFile)) {
      ends <- pmin(End, unname(seqlengths(BSgenomeName)[chr]))
      getSeq(BSgenomeName, names = chr, start = starts, end = ends, strand = strand, width = NA, as.character = TRUE)
    } else {
      # match() gives one length per row; `%in%` would give one per FASTA
      # record and misalign the window ends.
      chrLen <- width(genomeSeq)[match(chr, names(genomeSeq))]
      ends <- pmin(End, chrLen)
      extended.info <- data.frame(chrom = chr, start = starts, end = ends, strand = strand)
      getSeq(genomeSeq, as(extended.info, "GRanges"))
    }
  }

  if (dim(targets)[1] > 0) {
    if (rule.set %in% c("Root_RuleSet1_2014", "CRISPRscan") && is.null(featureWeightMatrix)) {
      stop("featureWeightMatrixFile could not be read; a feature weight matrix is required for rule.set = ", rule.set)
    }

    chr <- as.character(targets$chrom)
    strand <- as.character(targets$strand)
    chromStart <- as.numeric(as.character(targets$chromStart))
    chromEnd <- as.numeric(as.character(targets$chromEnd))
    onMinus <- strand == "-"
    # Extend the protospacer + PAM window by the bases needed for scoring;
    # the extension sides swap on the minus strand.
    if (PAM.location == "3prime") {
      Start <- ifelse(onMinus, chromStart - baseAfterPAM, chromStart - baseBeforegRNA)
      End <- ifelse(onMinus,
                    chromEnd + as.numeric(baseBeforegRNA),
                    chromEnd + as.numeric(baseAfterPAM))
    } else {
      Start <- ifelse(onMinus,
                      chromStart - baseAfterPAM + gRNA.size,
                      chromStart - baseBeforegRNA + PAM.size)
      End <- ifelse(onMinus,
                    chromEnd + as.numeric(baseBeforegRNA) - PAM.size,
                    chromEnd + as.numeric(baseAfterPAM) - gRNA.size)
    }

    extendedSequence <- fetchClippedSeq(chr, Start, End, strand)

    if (rule.set == "Root_RuleSet1_2014") {
      gRNAefficiency <- calculategRNAEfficiency(extendedSequence, baseBeforegRNA = baseBeforegRNA, featureWeightMatrix = featureWeightMatrix)
    } else if (rule.set == "Root_RuleSet2_2016") {
      gRNAefficiency <- calculategRNAEfficiency2(extendedSequence)
    } else if (rule.set == "CRISPRscan") {
      gRNAefficiency <- calculategRNAEfficiencyCRISPRscan(extendedSequence, featureWeightMatrix = featureWeightMatrix)
    } else if (rule.set == "DeepCpf1") {
      gRNAefficiency <- round(deepCpf1(extendedSequence = extendedSequence, chrom_acc = chrom_acc), 3)
    }

    # Windows clipped at a chromosome boundary are too short to score; they
    # are labelled here and become NA in the numeric conversion further down.
    if (PAM.location == "3prime") {
      gRNAefficiency[nchar(extendedSequence) < baseBeforegRNA + gRNA.size + PAM.size + baseAfterPAM] <- "extended sequence too short"
    } else {
      gRNAefficiency[nchar(extendedSequence) < baseBeforegRNA + baseAfterPAM] <- "extended sequence too short"
    }

    if (!calculategRNAefficacyForOfftargets && dim(ontargets)[1] > 0) { 
      ontargets <- cbind(ontargets, extendedSequence = extendedSequence, gRNAefficacy = gRNAefficiency)
      Offtargets <- merge(Offtargets, ontargets, all = TRUE)
    } else {
      Offtargets <- cbind(Offtargets, extendedSequence = extendedSequence, gRNAefficacy = gRNAefficiency)
    }
  }

  # When no efficacy was computed (e.g. on-target-only mode with no on-target
  # present) keep the output schema stable instead of failing later on.
  if (!"gRNAefficacy" %in% colnames(Offtargets)) {
    Offtargets$extendedSequence <- rep(NA_character_, nrow(Offtargets))
    Offtargets$gRNAefficacy <- rep(NA_real_, nrow(Offtargets))
  }

  # Fetch flankSequence:
  if (fetchSequence) {
    strand <- as.character(Offtargets$strand)
    chr <- as.character(Offtargets$chrom)
    chromStart <- as.numeric(as.character(Offtargets$chromStart))
    chromEnd <- as.numeric(as.character(Offtargets$chromEnd))
    onMinus <- strand == "-"
    # upstream/downstream are relative to the hit's strand.
    Start <- ifelse(onMinus, chromStart - as.numeric(downstream), chromStart - as.numeric(upstream))
    End <- ifelse(onMinus, chromEnd + as.numeric(upstream), chromEnd + as.numeric(downstream))

    flankSeq <- fetchClippedSeq(chr, Start, End, strand)
    Offtargets <- cbind(Offtargets, flankSequence = flankSeq)
  }

  colnames(Offtargets)[colnames(Offtargets) == "NGG"] <- "isCanonicalPAM"
  Offtargets$gRNAefficacy <- as.numeric(Offtargets$gRNAefficacy)
  Offtargets$isCanonicalPAM <- as.numeric(Offtargets$isCanonicalPAM)

  if ("inExon" %in% names(Offtargets)) {
    # Empty strings carry no annotation: map them to NA before the logical
    # conversion.
    blankToNALogical <- function(x) {
      x[!is.na(x) & x == ""] <- NA
      as.logical(x)
    }
    Offtargets$inExon <- blankToNALogical(Offtargets$inExon)
    Offtargets$inIntron <- blankToNALogical(Offtargets$inIntron)
    Offtargets$entrez_id <- as.numeric(Offtargets$entrez_id)
  }

  write.xlsx(Offtargets[order(as.character(Offtargets$name), 
                              -as.numeric(as.character(Offtargets$score)), 
                              as.character(Offtargets$OffTargetSequence)), ],
             file = OfftargetFile, rowNames = FALSE)
  list(offtargets = unique(Offtargets), summary = unique(temp))
}
