#' Prepare input data for peakCombiner package
#'
#' @description
#' [peakCombiner::prepareInputRegions] prepares the input data in the format
#' needed for all of the following steps within peakCombiner. It accepts the
#' following formats:
#'
#' * in memory data frame listing each sample's peak file location,
#' * in memory data frame listing the peaks themselves that are found in each
#'   sample, or
#' * in memory GRanges object listing the peaks themselves that are found in
#'   each sample.
#'
#' @details
#' Accepted inputs are one of the three following options:
#'
#' 1. In memory data frame listing each sample's peak file location
#'    * `sample_name` -  Unique name for each sample
#'                       (required).
#'    * `file_path` -   Path to the file in which the genomic regions are
#'                       stored. For example, the path to a bed file or
#'                       `.narrowPeak` file (required).
#'    * `file_format` -  The expected file format. Needed to correctly label the
#'                       columns of the input. Acceptable values are:
#'                       `bed`, `narrowPeak`, and `broadPeak` (required).
#'    * `score_colname` - Either column name or number of the column having the
#'                       metric used to rank peak importance, where bigger
#'                       values are more important. Entries have to be identical,
#'                       multiple entries are not supported. If not provided,
#'                       column 9 will be used for `.narrowPeak` or
#'                       `.broadPeak` file formats. Column 9 corresponds to
#'                       the `qValue` as described in the UCSC documentation
#'                  [here](https://genome.ucsc.edu/FAQ/FAQformat.html#format12).
#'                       Other alternatives for `narrowPeak` or `broadPeak`
#'                       could be columns 7 or 8, which correspond to
#'                       `signalValue` or `pValue` (optional).
#'
#' 2. In memory data frame listing the peaks themselves that are found in each
#'    sample. The columns can be provided in any order and have the following
#'    names. Note that additional columns will be dropped.
#'    * `chrom` - chromosome name (required).
#'    * `start` - start coordinate of range (1-based coordinate system,
#'                NOT like bed files which are 0-based) (required).
#'    * `end` -   end coordinate of range (required).
#'    * `sample_name` - unique identifier for a sample. No restrictions on
#'                  characters (required).
#'    * `score` - the metric used to rank peak importance, where bigger values
#'                  are more important. For example, qValue from Macs2,
#'                  -log10FDR from another method, or fold enrichment over
#'                  background computed from your favorite method. If not
#'                  provided, defaults to 0 (optional).
#'    * `strand` - values are '+', '-', or '.'. If not provided, defaults to '.'
#'                 (optional).
#'    * `summit` - distance of the strongest signal ("summit") of the peak
#'                 region from the start coordinate (optional).
#'
#' 3. In memory GRanges object listing the peaks themselves that are found in
#'    each sample. This object is very similar to the data frame above,
#'    except that `chrom`, `start`, and `end` are instead described using
#'    the `GRanges` nomenclature. Note that additional columns will be dropped.
#'
#' This function parses the inputs provided and returns a data frame having the
#' columns listed below.
#'
#' * `chrom` -        chromosome name
#' * `start` -        start coordinate of range (1-based coordinate system,
#'                    NOT like bed files which are 0-based)
#' * `end` -          end coordinate of range
#' * `name` -         unique identifier for a region. auto-generated by this
#'                    function
#' * `score` -        the metric used to rank peak importance, where bigger
#'                    values are more important. For example, qValue from MACS2,
#'                    -log10FDR from another method, or fold enrichment over
#'                    background computed from your favorite method
#' * `strand` -       values are '+', '-', or '.'. Chromatin data are typically
#'                    non-stranded so will have a '.'.
#' * `center` -       absolute genomic coordinate of the nucleotide at the
#'                    center of the peak region, or alternatively the strongest
#'                    signal ("summit") of the peak region. If no value is
#'                    provided by the user, `center` defaults to the arithmetic
#'                    center of the peak region.
#' * `sample_name` -  unique identifier for a sample. No restrictions on
#'                    characters
#'
#' In addition, input data is checked for multiple entries of the same genomic
#' region. This can occur when using called peak files, as multiple summits can
#' be annotated within the same genomic region (defined by `chrom`, `start`
#' and `end`). To avoid multiple entries, this function checks the input for
#' multiple summits within the same region and keeps only the most strongly
#' enriched one (based on the values in the column `score`). This step is
#' mandatory to guarantee an optimal result.
#' 
#' An additional option is to provide already here a genome (details see below) 
#' and maintain this information for the function 
#' [peakCombiner::centerExpandRegions()].
#'
#'
#' @param data  Data frame or GRanges object with the input data. Several
#' formats are accepted, which are described in full in the Details below.
#' * in memory data frame listing each sample's peak file location,
#' * in memory data frame listing the peaks themselves that are found in each
#'   sample, or
#' * in memory GRanges object listing the peaks themselves that are found in
#'   each sample.
#'
#' @param genome      Character value to define the matching genome reference to 
#'                      the input data. Default value is NA. Allowed values are 
#'                      based on GenomicRanges supported genomes like "GRCh38", 
#'                      "GRCh38.p13", "Amel_HAv3.1", "WBcel235", "TAIR10.1", 
#'                      "hg38", "mm10", "rn6", "bosTau9", "canFam3", "musFur1", 
#'                      "galGal6","dm6", "ce11", and "sacCer3". Please see also
#'                      help for [Seqinfo::seqinfo()] for more details. 
#'
#' @param startsAreBased Either 0, 1 (Default), or NA. Defines whether the
#'                        provided input data is 0-based or 1-based. Only if
#'                        the parameter is NA are GenomicRanges objects,
#'                        tibbles and data frames considered 1-based, while
#'                        data loaded from a sample sheet is considered
#'                        0-based (expected to load a BED file).
#'
#'
#' @param outputFormat Character value to define format of output object. 
#'                      Accepted values are "GenomicRanges" (default), "tibble" 
#'                      or "data.frame".  
#'
#' @param showMessages Logical value of TRUE (default) or FALSE. Defines if
#'                      info messages are displayed or not.
#'
#' @return Depending on `outputFormat`, either a `GRanges` object (default) or
#' a tibble holding the columns `chrom`, `start`, `end`, `name`, `score`,
#' `strand`, `center`, `sample_name`. The definitions of these columns are
#' described in full in the Details section. Use as input for functions
#' [peakCombiner::centerExpandRegions()], [peakCombiner::filterRegions()] and
#' [peakCombiner::combineRegions()].
#'
#' @export
#'
#' @importFrom rlang .data
#' @import tidyr
#' @import here
#'
#'
#' @examples
#' # Load in and prepare an accepted tibble
#' utils::data(syn_data_tibble)
#'
#' data_prepared <- prepareInputRegions(
#'   data = syn_data_tibble,
#'   outputFormat = "tibble",
#'   showMessages = TRUE
#' )
#' data_prepared
#'
#' # Or a pre-loaded tibble with genomic regions and named columns.
#'
#' utils::data(syn_data_control01)
#' utils::data(syn_data_treatment01)
#'
#' combined_input <- syn_data_control01 |>
#'   dplyr::mutate(sample_name = "control-rep1") |>
#'   rbind(syn_data_treatment01 |>
#'     dplyr::mutate(sample_name = "treatment-rep1"))
#'
#' prepareInputRegions(
#'   data = combined_input,
#'   outputFormat = "tibble",
#'   showMessages = FALSE
#' )
#'
prepareInputRegions <- function(
    data, 
    outputFormat = "GenomicRanges",
    genome = NA, 
    startsAreBased = 1, 
    showMessages = TRUE
    ) {
  # Workflow:
  #   1. validate arguments and set message verbosity,
  #   2. load / convert `data` into a tibble (`data_prepared`),
  #   3. add the optional columns (`name`, `score`, `strand`, `center`),
  #   4. run structural checks and collapse duplicated regions,
  #   5. convert to the requested output format.
  #
  # NOTE(review): `genome` is accepted for interface compatibility but is not
  # read anywhere in this function body - confirm whether it should be
  # attached to the output.

  ### -----------------------------------------------------------------------###
  ### Show or hide messages
  ### -----------------------------------------------------------------------###
  
  if (!is.logical(showMessages)) {
    cli::cli_abort(c(
      "x" = "Argument {.arg showMessages} has to be {.cls logical}."
    ))
  }

  # Rejects NA and vectors of length != 1.
  if (!isTRUE(showMessages) && !isFALSE(showMessages)) {
    cli::cli_abort(c(
      "x" = "Argument {.arg showMessages} is a non-accepted {.cls logical}
      value.",
      "i" = "Argument {.arg showMessages} is {.val {showMessages}}."
    ))
  }

  # Change the global verbosity only for the duration of this call. The
  # previous value is restored on every exit path (including errors), so the
  # caller's own setting is never clobbered.
  message_verbosity <- if (showMessages) "default" else "quiet"
  previous_options <- options(rlib_message_verbosity = message_verbosity)
  on.exit(options(previous_options), add = TRUE)
  
  ### -----------------------------------------------------------------------###
  ### Check if required input parameters were provided
  ### -----------------------------------------------------------------------###

  # `exists("data")` is always TRUE for a formal argument; `missing()` is the
  # correct test and has to run before `data` is evaluated for the first time.
  if (missing(data)) {
    cli::cli_abort(c(
      "x" = "Argument {.arg data} does not exist."
    ))
  }

  ### -----------------------------------------------------------------------###
  ### Define variables
  ### -----------------------------------------------------------------------###

  required_samplesheet_colnames <- c(
    "sample_name", "file_path", "file_format"
  )

  output_colnames <- c(
    "chrom", "start", "end", "name", "score", "strand", "center",
    "sample_name"
  )

  required_colnames <- c(
    "chrom", "start", "end", "sample_name"
  )

  accepted_output_formats <- c(
    "GenomicRanges", "GRanges", "tibble", "data.frame", "data.table"
  )

  # Accept `seqnames` or `chr` as alias for `chrom` in data frame input. The
  # renamed object has to be assigned back, and the rename is skipped when a
  # `chrom` column is already present to avoid a duplicated column name.
  if (is.data.frame(data) && !("chrom" %in% colnames(data))) {
    if ("seqnames" %in% colnames(data)) {
      data <- dplyr::rename(data, chrom = "seqnames")
    } else if ("chr" %in% colnames(data)) {
      data <- dplyr::rename(data, chrom = "chr")
    }
  }

  input_colnames <- colnames(data)
  
  ### -----------------------------------------------------------------------###
  ### Check if GenomicRanges object contains only one genome
  ### -----------------------------------------------------------------------###
  
  if (inherits(data, "GRanges")) {
    cli::cli_inform(c(
      "i" = "Input data {.arg data} is a class {.cls GRanges}."
    ))    
    
    input_file_genome <- unique(Seqinfo::genome(data))
    
    if (length(input_file_genome) > 1) {
      cli::cli_abort(c(
        "i" = "Input data {.arg data} is a class {.cls GRanges}.",
        "x" = "Input data {.arg data} has multiple assigned genomes.
        Input data has to have be from the same genome.",
        "i" = "Values of assigned genomes are: {.val {input_file_genome}}."
      ))
    }
    cli::cli_inform(c(
      "i" = "Input data {.arg data} assigned genomes is 
      {.val {input_file_genome}}."
    ))  
  }
  
  ### -----------------------------------------------------------------------###
  ### Check if output format is valid
  ### -----------------------------------------------------------------------###
  
  # Scalar guards first so that NULL / vectors give the intended error
  # instead of a failing `if ()` condition.
  if (!is.character(outputFormat) ||
      length(outputFormat) != 1 ||
      !(outputFormat %in% accepted_output_formats)) {
    cli::cli_abort(c(
      "x" = "Argument {.arg outputFormat} has to be one of the following
      values: {.val GenomicRanges}, {.val tibble}, or {.val data.frame}.",
      "i" = "Provided value is {.val {outputFormat}}."
    ))
  }

  cli::cli_inform(c(
    "i" = "Argument {.arg outputFormat} is set to {.val {outputFormat}}."
  ))
  
  ### -----------------------------------------------------------------------###
  ### Check if input is 0 or 1 based
  ### -----------------------------------------------------------------------###
  
  if (length(startsAreBased) != 1 ||
      !(is.na(startsAreBased) || is.numeric(startsAreBased))) {
    cli::cli_abort(c(
      "x" = "Argument {.arg startsAreBased} has to be either numeric {.val 0}, 
      numeric {.num 1} or {.val NA}.",
      "i" = "Argument {.arg startsAreBased} is {.val {startsAreBased}}."
    ))
  }
  
  if (is.na(startsAreBased)) {
    cli::cli_inform(c(
      "i" = "Argument {.arg startsAreBased} is {.val {startsAreBased}}."
    ))
    
    # Without an explicit choice the coordinate system is derived from the
    # kind of input: sample sheets (files on disk) are treated as 0-based,
    # in-memory regions as 1-based.
    if (all(required_samplesheet_colnames %in% input_colnames)) {
      cli::cli_inform(c(
        "i" = "Provide input {.arg data} is a {.cls data.frame} with three or four
      columns and paths to existing files.",
        "i" = "Data is considered to be 0-based."
      ))
      starts.in.df.are.0based <- TRUE
    } else {
      cli::cli_inform(c(
        "i" = "Provide input {.arg data} is a {.cls GRanges}, 
        {.cls data.frame}, or {.cls tibble}.",
        "i" = "Data is considered to be 1-based."
      ))
      starts.in.df.are.0based <- FALSE
    }
  } else if (startsAreBased %in% c(0, 1)) {
    cli::cli_inform(c(
      "i" = "Argument {.arg startsAreBased} is {.val {startsAreBased}}."
    ))
    
    starts.in.df.are.0based <- startsAreBased == 0

    cli::cli_inform(c(
      "i" = "Argument {.arg starts.in.df.are.0based} is 
      {.val {starts.in.df.are.0based}}."
    ))
  } else {
    cli::cli_abort(c(
      "x" = "Argument {.arg startsAreBased} has to be either {.val 0}, {.val 1} 
      or {.val NA}.",
      "i" = "Argument {.arg startsAreBased} is {.val {startsAreBased}}."
    ))
  }
  
  ### -----------------------------------------------------------------------###
  ### Figure out what kind of input data was entered by the user and
  ### load the initial data for follow-up quality checks
  ### -----------------------------------------------------------------------###

  # Stays NULL unless the input is a GRanges object. A local sentinel is used
  # instead of `exists()`, which would also find an unrelated object of the
  # same name in an enclosing or the global environment.
  input_seqinfo <- NULL

  if (inherits(data, "GRanges")) {
    cli::cli_inform(c(
      "!" = "Provided input {.arg data} is a class {.cls GRanges} and will be
      converted to class {.cls tibble}.",
      ">" = "Start converting and preparing data."
    ))
    
    input_seqinfo <- Seqinfo::seqinfo(data)
    
    data_prepared <-
      tibble::as_tibble(data) |>
      dplyr::rename(chrom = "seqnames") |>
      dplyr::mutate(
        start = as.numeric(.data$start),
        end = as.numeric(.data$end),
        strand = as.character(.data$strand)
      ) |>
      # GRanges encodes "no strand" as "*", this package uses ".".
      dplyr::mutate(strand = ifelse(.data$strand == "*", ".", .data$strand))
  } else if (all(required_samplesheet_colnames %in% input_colnames)) {
    cli::cli_inform(c(
      "i" = "Provide input {.arg data} is a {.cls data.frame} with three or four
      columns and paths to existing files.",
      ">" = "Start loading and preparing data."
    ))

    data_prepared <-
      loadInputRegions(
        data = data
      )
  } else if (all(required_colnames %in% input_colnames)) {
    # Coordinates are left untouched here; any 0-based to 1-based conversion
    # happens when the output object is built.
    data_prepared <- data
    
    cli::cli_inform(c(
      "i" = "Provide input {.arg data} is a pre-loaded {.cls data.frame}  with
      the required column names.",
      ">" = "Start preparing data."
    ))
  } else {
    cli::cli_abort(c(
      "x" = "Provide input {.arg data} does not have the required format.",
      "!" = "Please check your column names in {.arg data}."
    ))
  }

  ### -----------------------------------------------------------------------###
  ### Quality control of input data. Populate optional (aka missing) columns
  ### with default values.
  ### -----------------------------------------------------------------------###

  cli::cli_inform(c(
    "i" = "Required columns will be added if missing."
  ))

  ## Check and update name
  if ("name" %in% colnames(data_prepared)) {
    cli::cli_inform(c(
      "!" = "Column {.field 'name'} from {.arg data} will be
      overwritten.",
      ">" = "Column {.field 'name'} is a computed column from
      {.pkg peakCombiner} and
      therefore pre-exisiting data in a column {.field 'name'} will not
      be retained."
    ))
  }

  # `name` is "<sample_name>|<row index within the sample>"; `paste()` always
  # returns a character vector, so no further type check is needed.
  data_prepared <-
    data_prepared |>
    dplyr::group_by(.data$sample_name) |>
    dplyr::mutate(
      name = paste(.data$sample_name, dplyr::row_number(), sep = "|")
    ) |>
    dplyr::ungroup()

  ### -----------------------------------------------------------------------###
  ## If score column does not exist, create it
  if (!("score" %in% colnames(data_prepared))) {
    cli::cli_inform(c(
      "!" = "Column {.field 'score'} does not exist in {.arg data_prepared}.",
      ">" = "Column {.field 'score'} is added and filled with {.val 0}."
    ))

    data_prepared <- data_prepared |>
      dplyr::mutate(score = 0)
  }

  ### -----------------------------------------------------------------------###
  ## If strand column does not exist, create it
  if (!("strand" %in% colnames(data_prepared))) {
    cli::cli_inform(c(
      "!" = "Column {.field 'strand'} does not exist in {.arg data_prepared}.",
      ">" = "Column {.field 'strand'} is added and filled with {.val .}."
    ))

    data_prepared <-
      data_prepared |>
      dplyr::mutate(strand = ".")
  }

  ### -----------------------------------------------------------------------###
  ## Define the center column: an existing `center` wins, otherwise it is
  ## derived from `summit`, otherwise from the region coordinates.

  if ("center" %in% colnames(data_prepared)) {
    cli::cli_inform(c(
      "!" = "Column {.field 'center'} does exist in
      {.arg data_prepared}.",
      ">" = "The column {.field 'center'} is taken to define output column
      {.field 'center'}."
    ))
  } else if (!("summit" %in% colnames(data_prepared))) {
    cli::cli_inform(c(
      "!" = "Column {.field 'summit'} does not exist in
      {.arg data_prepared}.",
      ">" = "As no input column {.field 'summit'} is found, the output column
      {.field 'center'} has to be filled with arithmetic center of peak."
    ))

    # NOTE(review): `round()` wraps the width, not the half-width, so regions
    # with an odd width get a center ending in .5. Left as is because
    # downstream code may depend on it - confirm the intended rounding.
    data_prepared <-
      data_prepared |>
      dplyr::mutate(center = .data$start + (round(.data$end - .data$start) / 2))
  } else {
    if (!is.numeric(data_prepared$summit)) {
      # `cli_inform()` (not `cli_alert()`) so that `showMessages = FALSE`
      # silences this message like all the others.
      cli::cli_inform(c(
        "!" = "Column {.field 'summit'} is not class {.cls numeric}. It will be
        converted to class {.cls numeric}."
      ))

      data_prepared <-
        data_prepared |>
        dplyr::mutate(summit = as.numeric(.data$summit))
    }

    ## Check for wrong summit annotations.
    ## summit has to be a distance relative to start, so it can never exceed
    ## the region width. Rows with NA are ignored here.
    n_bad_summits <- sum(
      data_prepared$summit > (data_prepared$end - data_prepared$start),
      na.rm = TRUE
    )

    if (n_bad_summits > 0) {
      cli::cli_abort(c(
        "x" = "Column {.field 'summit'} exceeds the calculated peak width
        ({.field 'end'} - {.field 'start'}). Input values have to be smaller
        then width.",
        ">" = "Is your {.field 'summit'} value reported as absolute genomic
        position rather than relative to the start coordinate, as expected?",
        "i" = "Please doublecheck the data stored in the input column
        {.field 'summit'}."
      ))
    }

    # Summits look good so let's create our center
    data_prepared <-
      data_prepared |>
      dplyr::mutate(center = .data$start + .data$summit)
  }

  ### -----------------------------------------------------------------------###
  ## Select required columns (this also fixes the column order)
  data_prepared <-
    data_prepared |>
    dplyr::select(tidyselect::all_of(output_colnames))

  ### -----------------------------------------------------------------------###
  ## Check for NAs in data

  data_prepared <- checkDataStructure(
    data = data_prepared,
    showMessages = showMessages
  )

  # Re-apply the verbosity for this call: the helper above receives
  # `showMessages` itself and may have changed the option
  # (presumably it resets it on return - TODO confirm).
  options(rlib_message_verbosity = message_verbosity)
  
  ### -----------------------------------------------------------------------###
  ### Collapse duplicated regions within each sample to unique coordinates
  ### -----------------------------------------------------------------------###

  data_prepared <- collapseSummits(
    data = data_prepared
  ) |> dplyr::arrange(.data$sample_name, .data$chrom, .data$start, .data$end)

  ### -----------------------------------------------------------------------###
  ### Return prepared input data
  ### -----------------------------------------------------------------------###

  cli::cli_inform(c(
    "v" = "Preparation of data finished successfully."
  ))

  ### -----------------------------------------------------------------------###
  ### Adjust output format
  ### -----------------------------------------------------------------------###
  
  # `outputFormat` was validated above, so only two families remain.
  if (outputFormat %in% c("GenomicRanges", "GRanges")) {
    if (!is.null(input_seqinfo)) {
      cli::cli_inform(c(
        "i" = "Output format is set to {.val {outputFormat}}.",
        "i" = "Assigning input genome annotation to ouutput. ")
      )
    } else {
      cli::cli_inform(c(
        "i" = "Output format is set to {.val {outputFormat}}.",
        "i" = "No input genome annotation assigned to ouutput. ")
      )
    }

    # `seqinfo = NULL` is the default of makeGRangesFromDataFrame(), so a
    # single call covers input with and without genome annotation.
    data_prepared <- 
      data_prepared |>
      GenomicRanges::makeGRangesFromDataFrame(
        keep.extra.columns = TRUE,
        seqinfo = input_seqinfo,
        starts.in.df.are.0based = starts.in.df.are.0based
      )
  } else {
    cli::cli_inform(c(
      "i" = "Output format is set to {.val tibble}."
    ))
    
    # Shift 0-based starts to 1-based for data frame input. The decision uses
    # the flag resolved above, so `startsAreBased = NA` treats pre-loaded
    # regions as 1-based, exactly like the GRanges output path does.
    # GRanges input is returned unchanged.
    if (is.data.frame(data) && starts.in.df.are.0based) {
      data_prepared <- data_prepared |>
        dplyr::mutate(start = .data$start + 1)
    }
  } 
  
  # The previous message verbosity is restored by the on.exit() handler.
  data_prepared
}
