#' Summarise the distribution of sRNA clusters across genomic features
#'
#' @description Calculates the number of genomic features within the supplied 
#' annotations and calculates the number of sRNA clusters which overlap with 
#' these genomic features. Features include promoter regions, repeat regions, 
#' exons, introns, and untranslated regions. This can be summarised as the 
#' absolute or relative values. 
#'
#'
#' @details
#' `RNAfeatures` calculates the number or percentage of sRNA clusters which 
#' overlap with genomic features based on their genomic coordinates. 
#'
#' 
#' @seealso [mobileRNA::RNAmergeAnnotations()] to merge 2 GFF files into 1. 
#'
#' @param data data.frame; generated by [mobileRNA::RNAimport()]
#'
#' @param annotation path; URL or connection to a GFFFile object. A genome
#' reference annotation file (.gff/.gff1/.gff2/.gff3). Can be in compressed 
#' format (gzip).
#'
#' @param repeats path; URL or connection to a GFFFile object. A genome
#' reference annotation file, which only contains information on repeat
#' sequences in the genome (.gff/.gff1/.gff2/.gff3). By default, this is not
#' required, however if there is a specific repeats annotation file for the
#' genome it is suggested to supply it. Can be in compressed format (gzip).
#' 
#' @param promoterRegions numeric; defines the upstream promoter region of 
#' genes. Default is 1000, which refers to promoters set at 1Kb upstream of
#' genes
#' 
#' @param percentage logical; define whether to return the results as a 
#' percentage of the total or returned as a count value representing the 
#' number of sRNA clusters that overlap with a given genomic feature. Default is
#' `TRUE`. 
#'
#'@param repeat.type character; features type in `annotation` file to represent
#'repeats or transposable elements when `repeats` not supplied. Default is 
#'`c("transposable_element", transposable_element_gene")` which represent the 
#'transposable element features in the TAIR10 genome annotation. 
#'
#' @return Returns a table containing the number or percentage of overlaps in
#' the supplied sRNA data set with specific regions in the genome annotation 
#' such as genes, repeats, introns, exons.
#'
#' @examples
#' data("sRNA_data")
#' features <- RNAfeatures(data = sRNA_data,
#'                        annotation = system.file("extdata",
#'                        "reduced_chr2_Tomato.gff.gz", package="mobileRNA"))
#'                        
#'@importFrom rtracklayer import
#'@importFrom GenomicRanges setdiff
#'@importFrom GenomicRanges findOverlaps
#'@importFrom S4Vectors queryHits
#'@importFrom stats start
#'@importFrom stats end
#'@importFrom BiocGenerics strand
#'@importFrom dplyr select
#'@importFrom dplyr mutate
#'@importFrom dplyr filter
#'@importFrom GenomicRanges makeGRangesFromDataFrame
#'@importFrom scales label_percent
#'@importFrom dplyr %>%
#'@importFrom IRanges overlapsAny
#'@export
RNAfeatures <- function(data, annotation,
                        repeats = NULL,
                        promoterRegions = 1000,
                        percentage = TRUE, 
                        repeat.type = NULL) {
  if (base::missing(data)) {
    stop("data is missing. data must be an object of class matrix, data.frame, 
         DataFrame. ")
  }
  if (!base::inherits(data, c("matrix", "data.frame", "DataFrame"))) {
    stop("data must be an object of class matrix, data.frame, DataFrame.")
  }
  if (missing(annotation) || is.null(annotation) || annotation == "" || 
      !file.exists(annotation)) {
    stop("annotation parameter is missing or empty.")
  }
  annotation_info <-rtracklayer::import(annotation)
  
  if(is.null(repeats)) {
    # features
    if(is.null(repeat.type)){
      repeat.type <- c("transposable_element", 
                       "transposable_element_gene")
    }
    repeats <-subset(annotation_info, type==repeat.type)
    } else {
      repeats <- rtracklayer::import(repeats)
    }
  
  if(length(repeats) == 0 ){
    
    genes<-subset(annotation_info, type=="gene")
    five_UTR<-subset(annotation_info,type=="five_prime_UTR")
    three_UTR<-subset(annotation_info, type=="three_prime_UTR")
    three_UTR<-GenomicRanges::setdiff(three_UTR, c(five_UTR), 
                                      ignore.strand=TRUE)
    
    exons<-subset(annotation_info[IRanges::overlapsAny(annotation_info,
                                                       genes,
                                                       ignore.strand=TRUE)],
                  type=="exon")
    exons<-GenomicRanges::setdiff(exons, c(five_UTR, three_UTR),
                                  ignore.strand=TRUE)
    
    introns <- GenomicRanges::setdiff(genes, c(exons, five_UTR, three_UTR),
                                      ignore.strand=TRUE)
    
    # define promoter regions
    gene_promoters <-as.data.frame(genes)
    colnames(gene_promoters)[1] <- "chr"
    if('*' %in% gene_promoters$strand){
      gene_promoters <- gene_promoters[, -match("strand", 
                                                colnames(gene_promoters))]
    }
    pos_strand_promoter <- gene_promoters %>%
      dplyr::filter(strand == "+") %>% 
      dplyr::mutate(end=start) %>%
      dplyr::mutate(start=start-promoterRegions)
    
    neg_strand_promoter <- gene_promoters %>%
      dplyr::filter(strand == "-") %>% 
      dplyr::mutate(end=start) %>%
      dplyr::mutate(start=start-promoterRegions)
    
    promoters <- rbind(pos_strand_promoter, neg_strand_promoter)
    promoters <- GenomicRanges::makeGRangesFromDataFrame(promoters)
    promoters <-   BiocGenerics::setdiff(promoters, c( exons,five_UTR,
                                                      three_UTR,introns),
                                         ignore.strand=TRUE)
    #get others
    others <-   BiocGenerics::setdiff(subset(annotation_info, type=="chromosome"),
                                      c( exons,five_UTR,three_UTR,introns,
                                        promoters), ignore.strand=TRUE)
    
  } else {
    genes<-subset(annotation_info[!IRanges::overlapsAny(annotation_info,
                                                        repeats,
                                                        ignore.strand=TRUE)],
                  type=="gene")
    five_UTR<-subset(annotation_info[IRanges::overlapsAny(annotation_info,
                                                          genes,
                                                          ignore.strand=TRUE)],
                     type=="five_prime_UTR")
    five_UTR<-GenomicRanges::setdiff(five_UTR, c(repeats), ignore.strand=TRUE)
    three_UTR<-subset(annotation_info[IRanges::overlapsAny(annotation_info,
                                                           genes,
                                                           ignore.strand=TRUE)],
                      type=="three_prime_UTR")
    three_UTR<-GenomicRanges::setdiff(three_UTR, c(five_UTR, repeats),
                                      ignore.strand=TRUE)
    
    exons<-subset(annotation_info[IRanges::overlapsAny(annotation_info,
                                                       genes,
                                                       ignore.strand=TRUE)],
                  type=="exon")
    exons<-GenomicRanges::setdiff(exons, c(five_UTR, three_UTR, repeats),
                                  ignore.strand=TRUE)
    
    introns <- GenomicRanges::setdiff(genes, c(exons, five_UTR, three_UTR),
                                      ignore.strand=TRUE)
    
    # define promoter regions
    gene_promoters <-as.data.frame(genes)
    colnames(gene_promoters)[1] <- "chr"
    if('*' %in% gene_promoters$strand){
      gene_promoters <- gene_promoters[, -match("strand", 
                                                colnames(gene_promoters))]
    }
    pos_strand_promoter <- gene_promoters %>%
      dplyr::filter(strand == "+") %>% dplyr::mutate(end=start) %>%
      dplyr::filter(strand == "+") %>% dplyr::mutate(start=start-
                                                       promoterRegions)
    
    neg_strand_promoter <- gene_promoters %>%
      dplyr::filter(strand == "-") %>% dplyr::mutate(end=start) %>%
      dplyr::filter(strand == "-") %>% dplyr::mutate(start=start-
                                                       promoterRegions)
    
    promoters <- rbind(pos_strand_promoter, neg_strand_promoter)
    promoters <- GenomicRanges::makeGRangesFromDataFrame(promoters)
    promoters <-   BiocGenerics::setdiff(promoters, c(repeats, exons,five_UTR,
                                                      three_UTR,introns),
                                         ignore.strand=TRUE)
    #get others
    others <-   BiocGenerics::setdiff(subset(annotation_info, 
                                             type=="chromosome"),
                                      c(repeats, exons,five_UTR,
                                        three_UTR,introns,
                                        promoters), ignore.strand=TRUE)
  }
  
  # data frame
  sRNA_features_df <- matrix(0, ncol=7, nrow = 2)
  colnames(sRNA_features_df) <- c("promoters","exons", "introns", "5'UTR",
                                  "3'UTR", "repeats", "others")
  rownames(sRNA_features_df) <- c("Genome", "Dataset")
  
    # genome
    sRNA_features_df[1,1] <- length(promoters)
    sRNA_features_df[1,2] <- length(exons)
    sRNA_features_df[1,3] <- length(introns)
    sRNA_features_df[1,4] <- length(five_UTR)
    sRNA_features_df[1,5] <- length(three_UTR)
    sRNA_features_df[1,6] <- length(repeats)
    sRNA_features_df[1,7] <- length(others)
   
    overlapFUN <- function(x,y){
      overlaps <- suppressWarnings(
        GenomicRanges::findOverlaps(x, y,ignore.strand=TRUE))
      
      # Extract unique hits
      unique_overlaps <- unique(S4Vectors::queryHits(overlaps))
      
      # Number of ranges in y that overlap with a range in x
      num_overlapping_ranges <- length(unique_overlaps)
      
      return(num_overlapping_ranges)
    }
    
    # select sample
    sRNA_df <-  data %>% dplyr::select(chr, start, end)
    sRNA_df <-  GenomicRanges::makeGRangesFromDataFrame(sRNA_df)
    
    
    sRNA_features_df[2,1] <- overlapFUN(promoters,sRNA_df)
    sRNA_features_df[2,2] <- overlapFUN(exons,sRNA_df)
    sRNA_features_df[2,3] <- overlapFUN(introns,sRNA_df)
    sRNA_features_df[2,4] <- overlapFUN(five_UTR,sRNA_df)
    sRNA_features_df[2,5] <- overlapFUN(three_UTR,sRNA_df)
    sRNA_features_df[2,6] <- overlapFUN(repeats,sRNA_df)
    sRNA_features_df[2,7] <- overlapFUN(others,sRNA_df)

  
  if(percentage == TRUE){
    # convert to percentage
    sRNA_features_df <- data.frame(t(sRNA_features_df)) %>%
      dplyr::mutate(Genome = scales::label_percent()(Genome / sum(Genome)))%>%
      dplyr::mutate(Dataset = scales::label_percent()(Dataset / sum(Dataset)))
    # if NA, conevrt to 0 assuming all columsn are character. 
    sRNA_features_df[] <- lapply(sRNA_features_df, function(x) 
      ifelse(is.na(x), "0%", x))
    
    return(sRNA_features_df)
    
  } else
  return(t(sRNA_features_df))
    
}

