#' Generate synthetic sequence reads
#'
#' This function generates synthetic sequence reads to mimic RNA-seq reads
#' sequenced from organelles or organisms with circular genome sequences
#' in FASTQ format file.
#'
#' @param n Number of reads should be generated.
#' @param seq A file path to a genome sequence in FASTA format file or
#'            a string of genome sequence.
#' @param output A file path to store the synthetic reads in FASTQ format file.
#'     The extension should be one of `.fq`, `.fastq`.
#'     Note that to compress the FASTQ format file, add `.gz` or `.gzip` to the
#'     extension (e.g., `.fq.gz`, `.fq.gzip`).
#' @param adapter A path to a FASTA format file containing a string of adapter
#'     sequence.
#'     If `NULL` is specified, the sequence "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"
#'     is used as the adapter sequence.
#'     If `NA` is specified, the adapter sequence is not included in the
#'     synthetic reads.
#' @param srna_length A data frame to specify the lengths of sequence reads
#'     sampled from the genome sequence.
#'     The data frame should contain two columns named as `length` and `prob`.
#'     The values in the `length` column is used to specify
#'     the lengths of sequence reads;
#'     the values in the `prob` column is used to specify the probability
#'     that reads with specified length among all reads.
#'     If the argument is not given (i.e., `srna_length = NULL`),
#'     a data frame is randomly generated before sampling the reads.
#' @param read_length The length of synthetic reads.
#'      If `adapter` is specified, the reads are generated by concatenating
#'      sequence reads and adapter sequences until the specified length.
#'      If `adapter = None`, ignore this argument.
#' @param mismatch_prob A vector to specify probabilities
#'     of mismatches occurring in the reads.
#'     In order not to allow any mismatches in the reads,
#'     set the argument to `0`.
#'     To allow multiple mismatches in the reads, set multiple probabilities
#'     (e.g., `c(0.05, 0.01)`).
#' @param peaks A data frame to specify the peaks of the alignment coverage.
#'     The data frame should contain four columns named as `mean`, `std`,
#'     `strand`, and `prob`.
#'     The values in the `mean` and `std` columns are used to sample the start
#'     position of sequence reads from the genome sequence given by `seq`.
#'     The values in the `strand` column should be `+` or `-` to specify
#'     which read strand generates the peak.
#'     The values in the `prob` column should be probabilities to use the
#'     `mean`, `std`, and `strand` of the same row for read generation.
#'     If the argument is not given (i.e., `peaks = NULL`),
#'     a data frame is randomly generated before sampling the reads.
#' @param read_name_prefix The prefix of read name in FASTQ format file.
#'     If `NULL`, generate the prefix randomly.
#' @return A \code{\link{CircSeqAlignTkSim-class}} object containing parameters
#'     for read generation.
#' @seealso \code{\link{CircSeqAlignTkSim-class}}
#' @examples
#' output_dpath <- tempdir()
#'
#' sim <- generate_reads(output = file.path(output_dpath, 'sample1.fq.gz'))
#'
#' srna_length <- data.frame(length = c(21, 22, 23, 24),
#'                           prob = c(0.5, 0.3, 0.1, 0.1))
#' sim <- generate_reads(output = file.path(output_dpath, 'sample2.fq.gz'),
#'                       srna_length = srna_length)
#'
#'
#' sim <- generate_reads(output = file.path(output_dpath, 'sample3.fq.gz'),
#'                       mismatch_prob = c(0.1, 0.1))
#'
#'
#' peaks <- data.frame(mean =   c( 50, 100, 150),
#'                     std =    c(  3,   5,   5),
#'                     strand = c('+', '-', '+'),
#'                     prob =   c(0.4, 0.4, 0.2))
#' sim <- generate_reads(output = file.path(output_dpath, 'sample4.fq.gz'),
#'                       peaks = peaks)
#' @importFrom stats rbinom runif
#' @importFrom Biostrings readDNAStringSet reverseComplement DNAStringSet
#' @export
generate_reads <- function(n = 1e4, seq = NULL, output = NULL, adapter = NULL,
                           srna_length =NULL, read_length = 150,
                           mismatch_prob = 0,
                           peaks = NULL, read_name_prefix = NULL) {
    if (is.null(output)) stop('Specify a file path to save FASTQ.')

    message('Initialize parameters for generating synthetic reads')
    seq <- .syn_set_seq(seq)
    adapter <- .syn_set_adapter(adapter)
    srna_length <- .syn_set_srna_length(srna_length)
    peaks <- .syn_set_peaks(peaks, seq)

    message('Started generating reads ...')
    seq_length <- nchar(seq)
    seq_ext <- paste0(seq, seq, seq)
    peaks$mean <- peaks$mean + seq_length
    # sampling start position
    read_seeds <- peaks[sample(seq(1, nrow(peaks)), n, replace = TRUE,
                               prob = peaks$prob), ]
    read_seeds$start <- apply(read_seeds[, c(1, 2)], 1, function(x) {
        b_size <- (x[1] * x[1]) / (x[1] - x[2])
        nn <- rbinom(1, size = round(b_size), prob = x[1] / b_size)})
    read_seeds$length <- sample(srna_length$length, n, replace = TRUE,
                                prob = srna_length$prob)
    # calculate start and end position considering strand and circular sequence
    srna_pos <- .syn_calc_position(read_seeds, seq_length, NULL, FALSE)
    # sampling sequence reads
    srna <- substring(seq_ext, srna_pos$start, srna_pos$end)
    srna <- .syn_introduce_mismatches(srna, mismatch_prob)
    srna[srna_pos$strand == '-'] <- as.character(
        reverseComplement(DNAStringSet(srna[srna_pos$strand == '-'])))
    # calculate coverage
    alncov <- .syn_calc_coverage(srna_pos, seq_length)
    # restore the start and end position (to fit the linear sequence)
    srna_pos <- .syn_calc_position(read_seeds, seq_length, srna_pos, FALSE)
    read_seeds <- .syn_arrange_readseeds(read_seeds, seq_length, srna_pos, srna)
    peaks$mean <- peaks$mean - seq_length
    # add adapter sequences
    reads <- .syn_add_adapter(read_seeds$sRNA, adapter, read_length)

    message('Saving reads into FASTQ file and finilize the process ...')
    .syn_save_reads(reads, output, read_name_prefix)
    new('CircSeqAlignTkSim',
        seq = seq, adapter = adapter, read_info = read_seeds, peak = peaks,
        coverage = alncov, fastq = output)
}


.syn_set_seq <- function(seq) {
    if (is.null(seq)) seq <- system.file(package="CircSeqAlignTk",
                                         "extdata", "FR851463.fa")
    if (file.exists(seq)) seq <- read_single_fasta(seq)$seq
    seq
}


#' @importFrom methods is
.syn_set_adapter <- function(adapter) {
    if (is.null(adapter)) {
        adapter <- 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'
    } else if (is.na(adapter)) {
        adapter <- ''
    } else if (file.exists(adapter)) {
        adapter <- read_single_fasta(adapter)$seq
    } else if (is(adapter, 'character')) {
        adapter <- adapter
    } else {
        stop('File is not found or the sequence type is unknown.')
    }
    adapter
}


.syn_set_peaks <- function(peaks, seq, n_peaks = 8) {
    if (is.null(peaks)) {
        peaks <- data.frame(
            mean = round(runif(n_peaks, 1, nchar(seq))),
            std = round(runif(n_peaks, 1, 5)),
            strand = sample(c('+', '-'), n_peaks, replace = TRUE),
            prob = runif(n_peaks)
        )
    }
    # check peaks info
    if (!all(colnames(peaks) %in% c('mean', 'std', 'strand', 'prob'))) {
        stop('The `peaks` should contain four columns which are named as',
             '`mean`, `std`, `strand`, and `prob`.')
    }
    peaks$prob <- peaks$prob / sum(peaks$prob)
    peaks
}


.syn_set_srna_length <- function(srna_length) {
    if (is.null(srna_length)) {
        srna_length <- data.frame(
            length = c(21, 22, 23, 24),
            prob = runif(4)
        )
    }
    # check seeds
    if (!all(colnames(srna_length) %in% c('length', 'prob'))) {
        stop('The `srna_length` should contain two columns which are named as',
             '`length` and `prob`.')
    }
    srna_length$prob <- srna_length$prob / sum(srna_length$prob)
    srna_length
}


.syn_calc_position <- function(read_seeds, seq_length, pos = NULL,
                               restore = FALSE) {
    s <- e <- rep(NA, length = nrow(read_seeds))

    if (!restore) {
        ps <- read_seeds$strand == '+'
        s[ps] <- read_seeds[ps, ]$start
        e[ps] <- read_seeds[ps, ]$start + read_seeds[ps, ]$length - 1
        ms <- read_seeds$strand == '-'
        s[ms] <- read_seeds[ms, ]$start - read_seeds[ms, ]$length + 1
        e[ms] <- read_seeds[ms, ]$start
    } else {
        s <- pos$start
        e <- pos$end
        s <- s - seq_length
        s[s < 1] <- seq_length + s[s < 1]
        e <- seq_length - e
        e[e < 0] <- - e[e < 0]
    }

    data.frame(start = s, end = e, strand = read_seeds$strand)
}


#' @importFrom methods getPackageName
#' @importFrom utils packageVersion
#' @importFrom tools file_ext
#' @importFrom Biostrings BStringSet DNAStringSet
#' @importFrom ShortRead ShortReadQ writeFastq
.syn_save_reads <- function(read_seq, fastq_file, read_name_prefix) {
    remove_files(fastq_file)
    read_qual <- apply(as.matrix(read_seq), 1, function(x) {
        paste0(rep('I', length = nchar(x)), collapse = '')
    })
    n_digits <- paste0('%0', floor(log10(length(read_seq)) + 1), 'd')


    read_name <- NULL
    if (is.null(read_name_prefix)) {
        pkg <- getPackageName()
        if (pkg == '' || pkg == '.GlobalEnv') {
            pkg <- 'CircSeqAlignTkSim'
            pkg_version <- '0.0.0'
        } else {
            pkg_version <- packageVersion(pkg)
        }
        read_name <- paste0('SYN:', pkg, ':', pkg_version, ':',
            paste0(sample(c(letters, LETTERS), 10, replace = TRUE), collapse = ""),
            ':', sprintf(n_digits, seq(1, length(read_seq))))
    } else {
        read_name <- paste0(read_name_prefix, ':',
                            sprintf(n_digits, seq(1, length(read_seq))))
    }
    reads <- ShortReadQ(DNAStringSet(read_seq),
                        BStringSet(read_qual),
                        BStringSet(read_name))

    writeFastq(reads, fastq_file, mode = 'w', full = FALSE,
               compress = ifelse (file_ext(fastq_file) %in% c('gz', 'gzip'), TRUE, FALSE))
    invisible(fastq_file)
}




.syn_add_adapter <- function(srna, adapter, read_length) {
    if (is.na(adapter) || adapter == '') {
        reads <- srna
    } else {
        # add random bases after adapter
        rb <- paste0(sample(c('A', 'C', 'G', 'T'), read_length, replace = TRUE),
                     collapse = '')
        rb_len <- read_length -
            as.numeric(vapply(srna, nchar, 1)) - nchar(adapter)
        rb_len[rb_len < 0] <- 0

        # add adapter and random bases
        reads <- paste0(srna, adapter, substring(rb, 1, rb_len))
    }
    reads
}


.syn_introduce_mismatches <- function(srna, mismatch_prob) {
    if (is.null(mismatch_prob) ||
        (length(mismatch_prob) == 1 && mismatch_prob == 0))
        return(srna)

    message('Introducing mismatches into reads ...')

    # sample which reads should contain mismatches
    bb <- matrix(FALSE, nrow = length(srna), ncol = length(mismatch_prob))
    for (j in seq(ncol(bb))) {
        if (j == 1) {
            .bb <- rep(TRUE, length(srna))
        } else {
            .bb <- bb[, j - 1]
        }
        .bb[.bb] <- (runif(length(.bb[.bb])) < mismatch_prob[j])
        bb[, j] <- .bb
    }

    # introduce mismatches
    srna <- apply(as.matrix(seq(1, length(srna))), 1, function(i, srna, bb) {
        s <- srna[i]
        b <- bb[i, ]
        if (any(b)) {
            pos <- sample(seq(1, nchar(s)), length(b[b]), replace = TRUE)
            for (p in pos) {
                substr(s, p, p) <- sample(setdiff(c('A', 'C', 'G', 'T'),
                                                  substr(s, p, p)), 1)
            }
        }
        s
    }, srna = srna, bb = bb)

    srna
}



#' @importFrom IRanges IRanges coverage
.syn_calc_coverage <- function(srna_pos, seq_length) {
    srna_pos$length <- srna_pos$end - srna_pos$start + 1
    read_length <- sort(unique(srna_pos$length))
    max_pos <- max(c(srna_pos$start + srna_pos$length, seq_length))
    mat_fwd <- matrix(0, nrow = max_pos, ncol = length(read_length))
    mat_rev <- matrix(0, nrow = max_pos, ncol = length(read_length))
    colnames(mat_fwd) <- paste0('L', read_length)
    colnames(mat_rev) <- paste0('L', read_length)

    for (i in seq(read_length)) {
        # forward strand
        k <- ((srna_pos$length == read_length[i]) &
                  (srna_pos$strand == '+'))
        mat_fwd[, i] <- as.integer(
            coverage(IRanges(srna_pos$start[k], width = srna_pos$length[k]),
                     width = max_pos))
        # reverse strand
        k <- ((srna_pos$length == read_length[i]) &
                  (srna_pos$strand == '-'))
        mat_rev[, i] <- as.integer(
            coverage(IRanges(srna_pos$start[k], width = srna_pos$length[k]),
                     width = max_pos))
    }

    for (i in seq(max_pos %/% seq_length)) {
        rowf <- seq_length * i + 1
        rowt <- ifelse(max_pos < seq_length * (i + 1),
                       max_pos, seq_length * (i + 1))
        mat_fwd[seq(1, rowt - rowf + 1), ] <- mat_fwd[seq(rowf, rowt), ] +
            mat_fwd[seq(1, rowt - rowf + 1), ]
        mat_rev[seq(1, rowt - rowf + 1), ] <- mat_rev[seq(rowf, rowt), ] +
            mat_rev[seq(1, rowt - rowf + 1), ]
    }
    mat_fwd <- mat_fwd[seq(1, seq_length), ]
    mat_rev <- mat_rev[seq(1, seq_length), ]

    new('CircSeqAlignTkCoverage',
        forward = mat_fwd,
        reverse = mat_rev,
        .figdata = rbind(reshape_coverage_df(mat_fwd, '+'),
                         reshape_coverage_df(mat_rev, '-')))
}


.syn_arrange_readseeds <- function(read_seeds, seq_length, srna_pos, srna) {
    read_seeds$start <- srna_pos$start
    read_seeds$mean <- read_seeds$mean - seq_length
    read_seeds <- data.frame(read_seeds, end = srna_pos$end, sRNA = srna,
                             row.names=NULL)
    read_seeds <- read_seeds[, c('mean', 'std', 'strand', 'prob',
                                 'start', 'end', 'sRNA', 'length')]
    read_seeds
}


#' Merge multiple synthetic datasets
#'
#' Merge multiple synthetic datasets generated by \code{\link{generate_reads}}.
#'
#' Merge multiple synthetic datasets generated by \code{\link{generate_reads}}
#' into one dataset.
#'
#' @param ... CircSeqAlignTkSim class objects.
#' @param output A file path to store the synthetic reads in FASTQ format file.
#'     The extension should be one of `.fq`, `.fastq`.
#'     Note that to compress the FASTQ format file, add `.gz` or `.gzip` to the
#'     extension (e.g., `.fq.gz`, `.fq.gzip`).
#' @param overwrite Overwrite the existing files if `TRUE`.
#' @return A \code{\link{CircSeqAlignTkSim-class}} object.
#' @seealso \code{\link{CircSeqAlignTkSim-class}}, \code{\link{generate_reads}}
#' @examples
#' output_dpath <- tempdir()
#'
#' sim_params_1 <- data.frame(length = c(21, 22), prob = c(0.5, 0.4))
#' sim_1 <- generate_reads(n = 5e2,
#'                         output = file.path(output_dpath, 'sample1.fq.gz'),
#'                         srna_length = sim_params_1)
#'
#' sim_params_2 <- data.frame(length = c(19, 20, 23), prob = c(0.2, 0.7, 0.1))
#' sim_2 <- generate_reads(n = 5e2,
#'                         output = file.path(output_dpath, 'sample2.fq.gz'),
#'                         srna_length = sim_params_2)
#'
#' sim <- merge(sim_1, sim_2, output = file.path(output_dpath, 'sample.fq.gz'))
#' @importFrom Biostrings readDNAStringSet reverseComplement DNAStringSet
#' @export
merge.CircSeqAlignTkSim <- function(..., output = NULL, overwrite = TRUE) {
    if (is.null(output)) stop('Specify a file path to save the merged data.')
    check_overwrite(output, overwrite)

    objs <- list(...)

    genome_seq <- unique(vapply(objs, function(x) x@seq, character(1)))
    if (length(genome_seq) > 1) stop('The synthtetic reads should be generated from a single sequence.')

    merged_obj <- objs[[1]]
    merged_obj@fastq  <- output
    .merge_fastq(objs[[1]]@fastq, merged_obj@fastq)
    for (i in seq(2, length(objs))) {
        merged_obj@adapter <- c(merged_obj@adapter, objs[[i]]@adapter)
        merged_obj@read_info <- rbind(merged_obj@read_info, objs[[i]]@read_info)
        merged_obj@peak <- rbind(merged_obj@peak, objs[[i]]@peak)
        merged_obj@coverage <- .merge_coverage(merged_obj@coverage, objs[[i]]@coverage)
        .merge_fastq(objs[[i]]@fastq, merged_obj@fastq)
    }

    merged_obj
}


.merge_coverage <- function(x, y) {
    .merge_coverage_matrix <- function(x, y) {
        if (nrow(x) != nrow(y)) stop('The alignment coverages may not have been calculated from the same reference sequence.')
        srna_len <- sort(unique(c(colnames(x), colnames(y))))
        merged_coverage <- matrix(0, ncol = length(srna_len), nrow = nrow(x))
        colnames(merged_coverage) <- srna_len
        merged_coverage[, colnames(x)] <- x
        merged_coverage[, colnames(y)] <- merged_coverage[, colnames(y)] + y
        merged_coverage
    }

    mat_fwd <- .merge_coverage_matrix(x@forward, y@forward)
    mat_rev <- .merge_coverage_matrix(x@reverse, y@reverse)

    new('CircSeqAlignTkCoverage',
        forward = mat_fwd,
        reverse = mat_rev,
        .figdata = rbind(reshape_coverage_df(mat_fwd, '+'),
                         reshape_coverage_df(mat_rev, '-')))
}


.merge_fastq <- function(input, output) {
    c_mode <- ifelse (file_ext(output) %in% c('gz', 'gzip'), TRUE, FALSE)
    fh <- FastqStreamer(input, n = 1e6)
    while (length(fq <- yield(fh))) {
        writeFastq(fq, output, mode = 'a', full = FALSE, compress = c_mode)
    }
    close(fh)
}



