#' @title  Spaced Words Projection
#' @name SWeeP
#' @description 
#'
#' Spaced Words Projection version (SWeeP) is an alignment-free method for the vector representation of the 
#' biological sequences (amino acid and nucleotide). The ´SWeeP´ is an R implementation of the SWeeP method 
#' (De Pierri, 2020). Each sequence provided is represented by a compact numerical vector which is easy to analyze. 
#' The method is based on k-mers counting and random projection. For the analysis of biological sequences, this 
#' function requires you to supply the orthonormal matrix, which can be obtained by the 'orthBase' function 
#' as in the example. Details of the methodology can be found in the reference (De Pierri, 2020). 
#' The function allows general dimensionality reduction of RNAseq data and generic matrices.
#'
#' @param input There are four input formats available:
#'              (a) `BStringSet' (variants: `AAStringSet', `RNAStringSet', `DNAStringSet'). Biological sequence format loaded in memory;
#'              (b) `character'. String containing a path to a folder with FASTA files;
#'              (c) `dgCMatrix'. Expression matrix loaded with Seurat package (mtx pattern).
#'              (d) `matrix' (variants: `array',`integer'). Generic matrix.
#' @param orthbase   the orthonormal projection matrix generated by the orthBase() function. 
#' @param extension  extension of files desired to concatenate (Optional).  Available only for input type path to folder with FASTA files.
#' @param mask		reading mask. Available only for inputs in biological sequence format or path for FASTA files. 
#'                  Default for amino acids is `c(2,1,2)` and for nucleotides c(5,5,5)
#' @param bin       binary mode (TRUE), or count mode (FALSE) for HDV construction. Default is FALSE.
#' @param seqtype   type of data: ´AA´ for amino acid, ´NT´ for nucleotide. Available only for inputs in biological sequence format or path for FASTA files. The default is AA
#' @param lowRAMmode   lowRAMmode is suitable for reading large files individually, such as complete genomes, when the machine's memory is limited. 
#'                     read one FASTA at a time, recommended for large files such as complete eukaryotic genomes or proteomes. The default is FALSE
#' @param transpose   If the rows correspond to the samples and the columns correspond to the genes (mtx pattern), 
#'                    use transpose=FALSE. If the columns correspond to the samples, use transpose=TRUE. 
#'                    Available only for inputs of the expression matrix or generic matrix type. 
#'                    The default setting is FALSE
#' @param concatenate   defines whether to treat each sequence individually or to concatenate them into a single sequence.
#'                      Available only for inputs in biological sequence format. The default is FALSE.
#' @param norm   normalization of HDV. This must be one of 'none', 'log' or 'logNeg'. 'none' applies no normalization, 
#'               'log' applies a simple logarithm, and ´logNeg´ applies the logarithm and also converts zeros into -1 
#'               (the ´logNeg´ option is indicated for analyzing genes and short sequences).
#'               Default is ´none´.
#' @param RNAseqdata  For RNAseq data use 'TRUE' or apply the parameter `transpose=TRUE'. Default is FALSE.
#' @param ncores    Number of CPU cores used for parallel processing. Default is 2.
#' @param verbose   verbose mode. The default is TRUE
#' @param ...       other arguments of the function itself
#' 
#' @return 
#' `SWeeP' returns a `list' containing the following components:
#' \itemize{
#'   \item proj: a `numeric` matrix with `m` columns and one line per sequence, each row corresponding to a compact vector
#'   \item info: additional information about the process. This object is subdivided into: 
#'   \itemize{
#'   \item ProjectionSize: a `integer` corresponding to `psz`
#'   \item bin: a `boolean` indicating binary (TRUE) or counting (FALSE) mode
#'   \item mask: a `vector` containing the mask used
#'   \item SequenceType: a `character' containing the type of the sequence (amino acid: AA, or nucleotide: NT)
#'   \item concatenate : a `boolean` corresponding to the concatenation of sequences
#'   \item version : a `character` corresponding to the version of the package
#'   \item norm : a `character` containing the normalization used
#'   \item extension: a `character` containing the list of extensions considered
#'   \item timeElapsed: a `double` containing the elapsed time in seconds
#'   \item headers : `list` of headers for each analyzed sequence
#' } 
#' } 
#' 
#' @author Camila P. Perico
#' 
#' 
#' @examples
#' 
#' # get the path to the folder containing the FASTA files
#' path = paste (system.file("examples/aaMitochondrial/",package = "rSWeeP"),'/', sep = '')
#' 
#' # define the parameters
#' mask = c(2,1,2)
#' psz = 500
#' 
#' # get the basis matrix to projection
#' base160k = orthBase(160000,psz)
#' 
#' # get the vectors that represent the sequences
#' LDV = SWeeP(input=path,orthbase=base160k,extension=c('.faa','.fas','.fasta'),
#'             mask=mask,seqtype='AA',ncores=2)
#' 
#' @details
#' 
#' The normalization option 'logNeg' applies a simple logarithm to the HDV matrix. 
#' Its difference from 'log' is the conversion of zeros to -1 in HDV.
#' 
#' @references
#' De Pierri, C. R., et al. (2020). SWeeP: representing large biological sequences datasets in compact vectors. 
#' Scientific reports, 10(1):1–10.
#' 
#' @import foreach doParallel methods parallel utils
#' @importFrom Biostrings readBStringSet
#' @importFrom Biostrings BStringSet
#' @export
setGeneric(
  # S4 generic for SWeeP. The implementation that runs is chosen by the
  # setMethod() registrations made later in this file.
  "SWeeP",
  function(input,orthbase,bin=FALSE,...) standardGeneric("SWeeP"), # the `...` stands for the other, method-specific parameters
    # dispatch only on the class of `input`
    signature = "input"
)


# Project biological sequences that are already loaded in memory onto the
# basis stored in `orthbase`, producing one row of `proj` per sequence.
#
# Arguments:
#   input        sequences in memory (this function is registered below for
#                AAStringSet, DNAStringSet, RNAStringSet, BStringSet, BString).
#   orthbase     list whose `mat` element is the projection matrix.
#   bin          binary (TRUE) or count (FALSE) mode; forwarded to HDVparallel().
#   ncores       forwarded to HDVparallel().
#   norm         normalization option; forwarded to HDVparallel().
#   mask         reading mask; Defmask() resolves it together with `seqtype`.
#   seqtype      'AA' or 'NT'. Overridden by the class of `input` when that
#                class is AAStringSet, DNAStringSet or RNAStringSet.
#   concatenate  if TRUE, all sequences are joined into one, separated by a
#                run of '*' as long as the mask.
#   verbose      accepted for interface compatibility; not read by this function.
#
# Returns the object built by createSWobj(), with the rows of `proj` named
# after `info$headers`.
.sweepFASTAinRAM <- function(input,orthbase,bin=FALSE,ncores=NULL,norm='none',mask=NULL,seqtype=NULL,
                                concatenate=FALSE,verbose=TRUE) {

    start_time = proc.time()

    # Settle the sequence type FIRST: Defmask() below receives `seqtype`, so it
    # must not be called while `seqtype` is still the NULL default.
    if (inherits(input,"AAStringSet")) {
        seqtype = 'AA'
    } else if (inherits(input,"RNAStringSet") || inherits(input,"DNAStringSet")) {
        seqtype = 'NT'
    } else if (is.null(seqtype)) {
        stop("Please provide the type of data. Use seqtype='AA' for amino acids and seqtype='NT' for nucleotides. ")
    }

    SW.checks('mask',mask)
    mask = Defmask(mask,seqtype)

    ## input checks ---------------------- vv
    SW.checks('norm',norm)
    SW.checks('seqtype',seqtype)
    SW.checks('bin',bin)
    SW.checks('concatenate',concatenate)
    ## input checks ---------------------- ^^

    # size of the high dimensional vector: alphabet size ^ sum(mask)
    if (seqtype=='AA'){
        lenmax = 20^sum(mask)
    } else if(seqtype=='NT'){
        lenmax = 4^sum(mask)
    }

    mask = as.integer(mask)
    # separator placed between concatenated sequences: one '*' per mask position
    spacer = paste(rep("*",length(mask)),collapse='')

    if(concatenate){ # concatenate with spacer
        input = stringi::stri_join_list(list(paste(input)),sep = spacer,collapse = NULL) # require stringi
        cat("Note that the 'concatenate' option will concatenate all the samples in your input.\n")
    }

    # number of sequences
    N = length(input)

    cat('Starting projection. Please wait.\n')

    hdv_vec = HDVparallel(N,ncores,input,seqtype,mask,bin,norm,lenmax)

    output = NULL

    cat('Projecting ...\n')

    output$proj = hdv_vec %*% orthbase$mat

    end_time = proc.time()
    output = createSWobj(output,dim(orthbase$mat)[2],names(input),mask,bin,seqtype,NULL,
                            concatenate,norm,(end_time - start_time)[[3]])

    rownames(output$proj) = output$info$headers
    return(output)

} # end .sweepFASTAinRAM










# Project every FASTA file found (recursively) under the folder `input` onto
# the basis stored in `orthbase`; each file yields one row of `proj`.
#
# Arguments:
#   input       path to the folder that is searched for FASTA files.
#   orthbase    list whose `mat` element is the projection matrix.
#   bin, norm   forwarded to the HDV construction.
#   ncores      forwarded to SWeeP() when lowRAMmode is FALSE.
#   extension   one or more regular expressions used to select file names;
#                a file is kept when it matches any of them.
#   mask        reading mask; Defmask() resolves it together with `seqtype`.
#   seqtype     'AA' or 'NT'.
#   lowRAMmode  FALSE: load all files and delegate to SWeeP();
#               TRUE: read, vectorise and project one file at a time.
#   verbose     print per-file progress in lowRAMmode.
#
# Returns the SWeeP result list; `info$headers` and the row names of `proj`
# are the file names without their extension.
.sweepFASTAfromFolder <- function(input,orthbase, bin=FALSE,norm='none',ncores=NULL,extension='',mask=NULL,seqtype='AA',
                                    lowRAMmode=FALSE,verbose=TRUE) {
    start_time = proc.time()

    mask = Defmask(mask,seqtype)

    ## Create the lists of files and names
    # list.files() reads only the FIRST element of `pattern`, so a vector of
    # extensions is folded into one alternation; a length-1 `extension` yields
    # exactly the pattern that was used before.
    filepattern = paste(extension,collapse='|')
    fastalist = list.files(input,pattern=filepattern,full.names=TRUE,recursive=TRUE)
    namesfasta = list.files(input,pattern=filepattern,full.names=FALSE,recursive=TRUE)
    # remove extension from filename
    namesfasta = tools::file_path_sans_ext(namesfasta)

    # size of the high dimensional vector: alphabet size ^ sum(mask)
    if (seqtype=='AA'){
        lenmax = 20^sum(mask)
    } else if(seqtype=='NT'){
        lenmax = 4^sum(mask)
    }
    # number of sequences (one per file)
    N = length(fastalist)
    N_ = as.character(N)

    mask = as.integer(mask)
    # separator: one '*' per mask position
    spacer = paste(rep("*",length(mask)),collapse='')

    ## input checks ---------------------- vv
    SW.checks('norm',norm)
    SW.checks('extension',extension)
    SW.checks('mask',mask)
    SW.checks('seqtype',seqtype)
    SW.checks('bin',bin)
    SW.checks('lowRAMmode',lowRAMmode)
    SW.checks('fastalist',fastalist)
    ## input checks ---------------------- ^^

    if (!lowRAMmode) {
        allFastas = concatenaAll(fastalist,spacer,N)
        # hand the loaded sequences to the SWeeP generic
        output = SWeeP(allFastas,orthbase,mask=mask,seqtype=seqtype,ncores=ncores,concatenate=FALSE,
                        norm=norm,bin=bin,verbose=verbose)

    } else {

        output = NULL
        output$proj    = matrix(nrow=N,ncol=dim(orthbase$mat)[2]) # psz == dim(orthbase)[2]

        # for each fasta file, sweep; seq_len() keeps the loop empty when N is 0
        for (k in seq_len(N)){
            if(verbose){
                cat(paste('starting sequence ',as.character(k),'of', N_))
            }

            actualfasta = concatenaEach(fastalist[k],spacer)
            seq=NULL # necessary
            seq = seq2num(actualfasta,seqtype)
            hdv_vec = readmaskVEC(seq, mask,seqtype,bin,norm,lenmax)

            cat('Projecting ...\n')
            output$proj[k,] = hdv_vec %*% orthbase$mat

            # current sequence done
            if(verbose){
                cat(paste(' - complete\n'))
            }

        } # end for k in N

        end_time = proc.time()
        # names(input) is the names attribute of the path string; the headers
        # are replaced by the file names right below
        output = createSWobj(output,dim(orthbase$mat)[2],names(input),mask,bin,seqtype,extension,
                        FALSE,norm,(end_time - start_time)[[3]])

    }

    # in both modes the rows are labelled with the file names
    output$info$headers = namesfasta

    rownames(output$proj) = output$info$headers
    return(output)

} # end .sweepFASTAfromFolder










# Project a generic or expression matrix onto the basis in `orthbase`.
#
# `input` is used with samples in rows; when `transpose` or `RNAseqdata` is
# TRUE it is converted with as.matrix() and transposed first. The matrix then
# goes through SW.checkORTHBASE() and MakeLOG() before being multiplied by
# `orthbase$mat`. Returns a list with `info` (run metadata) and `proj`.
.sweepGeneric <- function(input,orthbase,bin=FALSE,transpose=FALSE,RNAseqdata=FALSE, norm='none',verbose=TRUE){

    t_begin <- proc.time()

    # a sparse expression matrix usually stores samples in columns: warn the
    # user when it is passed without transpose
    sparse_input <- inherits(input,'dgCMatrix')
    if (transpose==FALSE & sparse_input) {
        cat(paste0("Caution! \nIf your input is of the RNAseq type, probably each column contains a sample, and each row a gene. \n",
                   "In this case, use the 'transpose=TRUE' option.\n\n"))
        Sys.sleep(2)
    }

    ## validate the scalar options
    SW.checks('norm',norm)
    SW.checks('bin',bin)
    SW.checks('RNAseqdata',RNAseqdata)
    SW.checks('transpose',transpose)

    # metadata is collected in a local list and attached to the result at the end
    meta <- NULL
    meta$ProjectionSize <- dim(orthbase$mat)[2]
    meta$transpose <- transpose
    meta$RNAseqdata <- RNAseqdata

    # bring the samples to the rows when they come in the columns
    if (transpose | RNAseqdata) {
        input <- t(as.matrix(input))
    }
    meta$features <- colnames(input)
    meta$samples <- rownames(input)
    meta$norm <- norm

    ## the basis must be compatible with the (possibly transposed) input
    SW.checkORTHBASE(orthbase$mat, input)

    input <- MakeLOG(input,bin,norm)

    cat('Starting projection. Please wait.\n')
    cat('Projecting ...\n')
    projected <- input %*% orthbase$mat

    t_end <- proc.time()
    meta$timeElapsed <- (t_end - t_begin)[[3]]
    meta$version <- utils::packageVersion('rSWeeP')

    rownames(projected) <- rownames(input)

    output <- NULL
    output$info <- meta
    output$proj <- projected
    return(output)

} # end .sweepGeneric




# S4 method registrations for the SWeeP generic, keyed on the class of `input`:
#   - sequence classes (AAStringSet, DNAStringSet, RNAStringSet, BStringSet,
#     BString)                          -> .sweepFASTAinRAM
#   - character                         -> .sweepFASTAfromFolder
#   - array, integer, matrix, dgCMatrix -> .sweepGeneric

#' @rdname SWeeP
setMethod("SWeeP", "AAStringSet",  .sweepFASTAinRAM) 
#' @rdname SWeeP
setMethod("SWeeP", "DNAStringSet", .sweepFASTAinRAM) 
#' @rdname SWeeP
setMethod("SWeeP", "RNAStringSet", .sweepFASTAinRAM) 
#' @rdname SWeeP
setMethod("SWeeP", "BStringSet",   .sweepFASTAinRAM) 
#' @rdname SWeeP
setMethod("SWeeP", "BString",      .sweepFASTAinRAM) 
#' @rdname SWeeP
setMethod("SWeeP", "character",    .sweepFASTAfromFolder) 
#' @rdname SWeeP
setMethod("SWeeP", "array",        .sweepGeneric) 
#' @rdname SWeeP
setMethod("SWeeP", "integer",      .sweepGeneric) 
#' @rdname SWeeP
setMethod("SWeeP", "matrix",       .sweepGeneric) 
#' @rdname SWeeP
setMethod("SWeeP", "dgCMatrix",    .sweepGeneric) 

















# ############################################################################################### #
#                              (__)                                                               #
#                              (oo)                                                               #
#                        /------\/                                                                #
#                       / |    ||                                                                 #
#                      *  /\---/\                                                                 #
#                         ~~   ~~                                                                 #
#             ..."Have you mooed today?"...                                                       #
# ############################################################################################### #










#' @title  Spaced Words Projection lite
#' @name SWeePlite
#' @description 
#'
#' Spaced Words Projection version lite (SWeePlite) is an alignment-free method for the vector representation 
#' of the biological sequences (amino acid and nucleotide). Analogous to the ´SWeeP´ function (De Pierri, 2020), 
#' ´SWeePlite´ has optimizations in its implementation that allow the use of larger read masks with low RAM 
#' consumption. It also eliminates the need to supply the orthonormal matrix (it is generated internally).
#' Each sequence provided is represented by a compact numerical vector which is easy to analyze. 
#' The method is based on k-mers counting and random projection. Details of the methodology can be found 
#' in the reference (De Pierri, 2020). The function allows general dimensionality reduction of RNAseq 
#' data and generic matrices.
#' 
#' @param input There are four input formats available:
#'              (a) `BStringSet' (variants: `AAStringSet', `RNAStringSet', `DNAStringSet'). Biological sequence format loaded in memory;
#'              (b) `character' String containing a path to a folder with FASTA files;
#'              (c) `dgCMatrix' Expression matrix loaded with Seurat package (mtx pattern).
#'              (d) `matrix' (variants: `array',`integer'). Generic matrix.
#' @param extension         extension of files desired to concatenate (Optional).  Available only for input type path to folder with FASTA files.
#' @param psz       projection size. Default 1369
#' @param mask      reading mask. Available only for inputs in biological sequence format or path for FASTA files. Default c(2,1,2)
#' @param seqtype   type of data: ´AA´ for amino acid, ´NT´ for nucleotide. Available only for inputs in biological sequence format or path for FASTA files. The default is AA
#' @param bin     binary mode (TRUE), or counting mode (FALSE) for HDV construction. Default is FALSE.
#' @param lowRAMmode   lowRAMmode is suitable for reading large files individually, such as complete genomes, when the machine's memory is limited. 
#'                     It reads one FASTA at a time and is recommended for large files such as complete eukaryotic genomes or proteomes. The default is TRUE
#' @param transpose   If the rows correspond to the samples and the columns correspond to the genes (mtx pattern), 
#'                    use transpose=FALSE. If the columns correspond to the samples, use transpose=TRUE. 
#'                    Available only for inputs of the expression matrix or generic matrix type. 
#'                    The default setting is FALSE
#' @param concatenate   defines whether to treat each sequence individually or to concatenate them into a single sequence
#'                      Available only for inputs in biological sequence format. The default is FALSE.
#' @param norm   normalization of HDV. This must be one of 'none', 'log' or 'logNeg'. 'none' applies no normalization, 
#'               'log' applies a simple logarithm, and ´logNeg´ applies the logarithm and also converts zeros into -1 
#'               (the ´logNeg´ option is indicated for analyzing genes and short sequences).
#'               Default is ´none´.
#' @param RNAseqdata  For RNAseq data use 'TRUE' or apply the parameter `transpose=TRUE'. Default is FALSE.
#' @param ncores    Number of CPU cores used for parallel processing. Default is 2.
#' @param nk        Step size of HDV for parallel loop. Default is 15000.
#' @param verbose   verbose mode. The default is TRUE
#' @param ...       other arguments of the function itself
#' 
#' @return 
#' `SWeePlite' returns a `list' containing the following components:
#' \itemize{
#'   \item proj: a `numeric` matrix with `m` columns and one line per sequence, each row corresponding to a compact vector
#'   \item info: additional information about the process. This object is subdivided into: 
#'   \itemize{
#'   \item ProjectionSize: a `integer` corresponding to `psz`
#'   \item bin: a `boolean' indicating binary (TRUE) or counting (FALSE) mode
#'   \item mask: a `vector` containing the mask used
#'   \item SequenceType: a `character' containing the type of the sequence (amino acid: AA, or nucleotide: NT)
#'   \item concatenate : a `boolean` corresponding to the concatenation of sequences
#'   \item version : a `character` corresponding to the version of the package
#'   \item norm : a `character` containing the normalization used
#'   \item extension: a `character' containing the list of extensions considered
#'   \item timeElapsed: a `double' containing the elapsed time in seconds
#'   \item headers : list of headers for each analyzed sequence
#' } 
#' } 
#'  
#' @author Camila P. Perico
#' 
#' @examples
#' 
#' # get the path to the folder containing the FASTA files
#' path = paste (system.file("examples/aaMitochondrial/",package = "rSWeeP"),'/', sep = '')
#' 
#' # define the parameters
#' mask = c(2,1,2)
#' psz = 1369
#' 
#' # get the vectors that represent the sequences
#' LDV = SWeePlite(input=path,extension=c('.faa','.fas','.fasta'),
#'                 psz = psz,mask=mask,bin=FALSE,seqtype='AA',ncores=2)
#' 
#' @details
#' The normalization option 'logNeg' applies a simple logarithm to the HDV matrix. 
#' Its difference from 'log' is the conversion of zeros to -1 in HDV.
#' 
#' @references
#' De Pierri, C. R., et al. (2020). SWeeP: representing large biological sequences datasets  
#' in compact vectors. Scientific reports, 10(1):1–10.
#' 
#' @rdname SWeePlite
#' @import foreach doParallel methods parallel utils
#' @importFrom Biostrings readBStringSet
#' @importFrom Biostrings BStringSet

#' @export
setGeneric(
  # S4 generic for SWeePlite. The implementation that runs is chosen by the
  # setMethod() registrations made later in this file.
  "SWeePlite",
  function(input, psz,bin=FALSE,ncores=NULL,...) standardGeneric("SWeePlite"), # the `...` stands for the other, method-specific parameters
    # dispatch only on the class of `input`
    signature = "input"
)

# SWeePlite for biological sequences already loaded in memory: each sequence is
# turned into its HDV by readmask() and projected to `psz` values by COREloop(),
# one sequence at a time.
#
# Arguments:
#   input        sequences in memory (this function is registered below for
#                AAStringSet, DNAStringSet, RNAStringSet, BStringSet, BString).
#   psz          projection size (number of columns of `proj`).
#   bin, norm    forwarded to readmask() and COREloop().
#   ncores       resolved by NCoresDef(); size of the PSOCK cluster.
#   concatenate  if TRUE, all sequences are joined into one, separated by a
#                run of '*' as long as the mask.
#   mask         reading mask; Defmask() resolves it together with `seqtype`.
#   seqtype      'AA' or 'NT'. Overridden by the class of `input` when that
#                class is AAStringSet, DNAStringSet or RNAStringSet.
#   nk           forwarded to COREloop().
#   verbose      print per-sequence progress.
#
# Returns a list with `proj` (one row per sequence) and `info` (run metadata).
.SWeePliteFASTAinRAM <-function(input,psz=1369,bin=FALSE,ncores=NULL,norm='none',concatenate=FALSE, 
                                mask=NULL,seqtype=NULL, nk=15000,verbose=TRUE){
    start_time = proc.time()

    # Settle the sequence type FIRST: Defmask() below receives `seqtype`, so it
    # must not be called while `seqtype` is still the NULL default.
    if (inherits(input,"AAStringSet")) {
        seqtype = 'AA'
    } else if (inherits(input,"RNAStringSet") || inherits(input,"DNAStringSet")) {
        seqtype = 'NT'
    } else if (is.null(seqtype)) {
        stop("Please provide the type of data. Use seqtype='AA' for amino acids and seqtype='NT' for nucleotides. ")
    }

    mask = Defmask(mask,seqtype)

    ## input checks ---------------------- vv
    SW.checks('norm',norm)
    SW.checks('mask',mask)
    SW.checks('seqtype',seqtype)
    SW.checks('bin',bin)
    SW.checks('psz',psz)
    SW.checks('concatenate',concatenate)
    ## input checks ---------------------- ^^

    if(concatenate){ # concatenate with spacer
        spacer = paste(rep("*",length(mask)),collapse='')
        input = stringi::stri_join_list(list(paste(input)),sep = spacer,collapse = NULL) # require stringi
    }

    # PARAMETERS --------------
    N = length(input) # number of sequences
    N_ = as.character(N)
    # size of the high dimensional vector: alphabet size ^ sum(mask)
    lenmax <- if (seqtype == 'AA') 20^sum(mask) else if (seqtype == 'NT') 4^sum(mask)
    par = liteParam(mask,input,seqtype,N,psz,lenmax)
    mask = as.integer(mask)

    # DEFINE the OUTPUT VARIABLE
    output = NULL
    output$proj    = matrix(nrow=N,ncol=psz)

    # START PARALLEL
    ncores = NCoresDef(ncores)
    sw.cluster <- parallel::makeCluster(ncores, type = "PSOCK")
    # stop the workers on every exit path, including an error inside the loop
    on.exit(parallel::stopCluster(cl = sw.cluster), add = TRUE)
    doParallel::registerDoParallel(cl = sw.cluster)
    foreach::getDoParWorkers()

    cat('Starting projection. Please wait.\n')

    # seq_len() keeps the loop empty when there are no sequences
    for (k in seq_len(N)){
        if(verbose){ cat(paste('starting sequence ',as.character(k),'of', N_)) }

        seq = seq2num(input[k],seqtype)

        hdv_vec = readmask(seq, mask,seqtype,bin,norm,lenmax)

        output$proj[k,] = COREloop(par$xnorm,psz,par$Mproj,par$pslist,par$nps,hdv_vec,nk,bin,norm)

        # current sequence done
        if(verbose){ cat(paste(' - complete\n')) }

    } # end for k in N

    # OUTPUT
    output$info$headers = names(input) # NULL if the sequences were concatenated
    output$info$ProjectionSize =  psz
    output$info$mask = mask
    output$info$SequenceType = seqtype
    output$info$concatenate =  concatenate
    output$info$bin = if (bin) 'binary (TRUE)' else 'counting (FALSE)'
    output$info$version = utils::packageVersion('rSWeeP')
    output$info$norm =  norm

    end_time = proc.time()
    output$info$timeElapsed = (end_time - start_time)[[3]]

    rownames(output$proj) = output$info$headers

    return(output)

} # end .SWeePliteFASTAinRAM










# SWeePlite for a folder of FASTA files: every file found (recursively) under
# `input` yields one row of `proj`.
#
# Arguments:
#   input       path to the folder that is searched for FASTA files.
#   psz         projection size (number of columns of `proj`).
#   bin, norm   forwarded to the HDV construction and to COREloop().
#   ncores      resolved by NCoresDef() (lowRAMmode) or forwarded to SWeePlite().
#   mask        reading mask; Defmask() resolves it together with `seqtype`.
#   extension   one or more regular expressions used to select file names;
#                a file is kept when it matches any of them.
#   seqtype     'AA' or 'NT'.
#   lowRAMmode  TRUE: read, vectorise and project one file at a time;
#               FALSE: load all files and delegate to SWeePlite().
#   nk          forwarded to COREloop() / SWeePlite().
#   verbose     print per-file progress.
#
# Returns a list with `proj` and `info`; `info$headers` and the row names of
# `proj` are the file names without their extension.
.SWeePliteFASTAfromFolder <-function(input,psz=1369,bin=FALSE,ncores=NULL,norm='none', mask=NULL,extension='',
                                        seqtype='AA',lowRAMmode=TRUE,nk=15000,verbose=TRUE){
    start_time = proc.time()

    mask = Defmask(mask,seqtype)

    ## Create the lists of files and names
    # list.files() reads only the FIRST element of `pattern`, so a vector of
    # extensions is folded into one alternation; a length-1 `extension` yields
    # exactly the pattern that was used before.
    filepattern = paste(extension,collapse='|')
    fastalist = list.files(input,pattern=filepattern,full.names=TRUE,recursive=TRUE)
    namesfasta = list.files(input,pattern=filepattern,full.names=FALSE,recursive=TRUE)
    # remove extension from filename
    namesfasta = tools::file_path_sans_ext(namesfasta)

    ## input checks ---------------------- vv
    SW.checks('ncores',ncores)
    SW.checks('norm',norm)
    SW.checks('extension',extension)
    SW.checks('psz',psz)
    SW.checks('mask',mask)
    SW.checks('seqtype',seqtype)
    SW.checks('bin',bin)
    SW.checks('lowRAMmode',lowRAMmode)
    SW.checks('fastalist',fastalist)
    ## input checks ---------------------- ^^

    # number of sequences (one per file)
    N = length(fastalist)
    N_ = as.character(N)

    # separator: one '*' per mask position
    spacer = paste(rep("*",length(mask)),collapse='')

    if (!lowRAMmode){ # load all the sequences into memory

        allFastas = concatenaAll(fastalist,spacer,N)
        # ncores and nk are forwarded so the user's settings are honoured in
        # this mode as well
        output = SWeePlite(allFastas,psz=psz,bin=bin,ncores=ncores,concatenate=FALSE, norm=norm,mask=mask,
                            seqtype=seqtype,nk=nk,verbose=verbose)
        output$info$headers = namesfasta
        output$info$extension = extension

    } else {
        cat('Starting projection. Please wait.\n')

        # size of the high dimensional vector: alphabet size ^ sum(mask)
        lenmax <- if (seqtype == 'AA') 20^sum(mask) else if (seqtype == 'NT') 4^sum(mask)
        par = liteParam(mask,input,seqtype,N,psz,lenmax)
        mask = as.integer(mask)

        output = NULL
        output$proj    = matrix(nrow=N,ncol=psz)

        # START PARALLEL
        ncores = NCoresDef(ncores)
        sw.cluster <- parallel::makeCluster(ncores, type = "PSOCK")
        # stop the workers on every exit path, including an error inside the loop
        on.exit(parallel::stopCluster(cl = sw.cluster), add = TRUE)
        doParallel::registerDoParallel(cl = sw.cluster)
        foreach::getDoParWorkers()

        # seq_len() keeps the loop empty when N is 0
        for (k in seq_len(N)){
            if(verbose){
                cat(paste('starting sequence ',as.character(k),'of', N_))
            }

            actualfasta = concatenaEach(fastalist[k],spacer)
            seq = seq2num(actualfasta,seqtype)

            hdv_vec = readmask(seq, mask,seqtype,bin,norm,lenmax)

            output$proj[k,] = COREloop(par$xnorm,psz,par$Mproj,par$pslist,par$nps,hdv_vec,nk,bin,norm)

            # current sequence done
            if(verbose){
                cat(paste(' - complete\n'))
            }
        } # end for k in N

        output$info$headers = namesfasta
        output$info$ProjectionSize =  psz
        output$info$bin = if (bin) 'binary (TRUE)' else 'counting (FALSE)'
        output$info$mask = mask
        output$info$SequenceType = seqtype
        output$info$extension = extension
        output$info$version = utils::packageVersion('rSWeeP')
        output$info$norm =  norm

    } # end else

    end_time = proc.time()
    output$info$timeElapsed = (end_time - start_time)[[3]]

    rownames(output$proj) = output$info$headers
    return(output)
} # end .SWeePliteFASTAfromFolder








# SWeePlite for generic / expression matrices: every sample (a row of `input`,
# or a column when `transpose` or `RNAseqdata` is TRUE) is reduced to the
# positions and values of its non-zero entries and projected to `psz` values
# by COREloop().
#
# Arguments:
#   input       matrix-like object (registered below for array, integer,
#               matrix and dgCMatrix).
#   psz         projection size (number of columns of `proj`).
#   bin         if TRUE only the non-zero positions are passed on (no counts).
#   ncores      resolved by NCoresDef(); size of the PSOCK cluster.
#   transpose, RNAseqdata  if either is TRUE the samples are read from the columns.
#   norm        normalization option; anything but 'none' runs MakeLOG() on the
#               sample before its non-zero positions are located.
#   nk          forwarded to COREloop().
#   verbose     print per-sample progress.
#
# Returns a list with `info` (run metadata) and `proj` (one row per sample).
.SWeePliteGeneric <-function(input,psz=1369, bin=FALSE,ncores=NULL,transpose=FALSE, RNAseqdata=FALSE, norm='none',
                                nk=15000,verbose=TRUE){

    start_time = proc.time()

    if(transpose==FALSE && inherits(input,'dgCMatrix')){
        cat("Caution! \nIf your input is of the RNAseq type, probably each column contains a sample, and each row a gene. \nIn this case, use the 'transpose=TRUE' option.\n\n")
        Sys.sleep(2)
    }

    ## input checks ---------------------- vv
    SW.checks('ncores',ncores)
    SW.checks('psz',psz)
    SW.checks('norm',norm)
    SW.checks('bin',bin)
    SW.checks('RNAseqdata',RNAseqdata)
    SW.checks('transpose',transpose)
    ## input checks ---------------------- ^^

    # decided once: are the samples stored in the columns?
    bycol = transpose || RNAseqdata

    output = NULL # create an output object
    output$info = NULL

    if (bycol){ # col=sample
        N = dim(input)[2]
        lenmax = dim(input)[1]
        output$info$features = rownames(input)
        output$info$samples = colnames(input)
    } else { # conventional matrix layout - row=sample
        N = dim(input)[1]
        lenmax = dim(input)[2]
        output$info$features = colnames(input)
        output$info$samples = rownames(input)
    }
    N_ = as.character(N)

    # empty output projection matrix
    output$proj    = matrix(nrow=N,ncol=psz)
    output$info$ProjectionSize =  psz
    output$info$bin = if (bin) 'binary (TRUE)' else 'counting (FALSE)'
    output$info$RNAseqdata =  RNAseqdata
    output$info$transpose =  transpose
    output$info$norm =  norm

    # NOTE(review): `mask` and `seqtype` are neither arguments nor locals of
    # this function. They are passed as unevaluated promises and are looked up
    # in the enclosing scopes only if liteParam() actually reads them — confirm
    # against liteParam() and consider passing explicit values.
    par = liteParam(mask,input,seqtype,N,psz,lenmax)

    # START PARALLEL
    ncores = NCoresDef(ncores)
    sw.cluster <- parallel::makeCluster(ncores, type = "PSOCK")
    # stop the workers on every exit path, including an error inside the loop
    on.exit(parallel::stopCluster(cl = sw.cluster), add = TRUE)
    doParallel::registerDoParallel(cl = sw.cluster)
    foreach::getDoParWorkers()

    # seq_len() keeps the loop empty when there are no samples
    for (k in seq_len(N)){
        if(verbose){ cat(paste('starting sequence ',as.character(k),'of', N_)) }

        # no need to binarise the input here: with bin=TRUE the counts are
        # never extracted below

        aux = if (bycol) input[,k] else input[k,]
        if(norm!='none'){ aux = MakeLOG(aux,bin,norm) }
        nonzero = aux!=0

        hdv_vec = NULL
        hdv_vec$idx = as.vector(which(nonzero))
        # NOTE(review): the counts are taken from the raw `input`, not from the
        # MakeLOG() result in `aux`; `norm` is also handed to COREloop() —
        # confirm this is the intended place for the normalization.
        if(!bin){
            hdv_vec$count = as.vector(if (bycol) input[nonzero,k] else input[k,nonzero])
        }

        output$proj[k,] = COREloop(par$xnorm,psz,par$Mproj,par$pslist,par$nps,hdv_vec,nk,bin,norm)

        # current sample done
        if(verbose){ cat(paste(' - complete\n')) }

    } # end for k in N

    output$info$version = utils::packageVersion('rSWeeP')

    end_time = proc.time()
    output$info$timeElapsed = (end_time - start_time)[[3]]

    # label the rows with the sample names
    rownames(output$proj) = if (bycol) colnames(input) else rownames(input)
    return(output)

} # end .SWeePliteGeneric






# S4 method registrations for the SWeePlite generic, keyed on the class of `input`:
#   - sequence classes (AAStringSet, DNAStringSet, RNAStringSet, BStringSet,
#     BString)                          -> .SWeePliteFASTAinRAM
#   - character                         -> .SWeePliteFASTAfromFolder
#   - array, integer, matrix, dgCMatrix -> .SWeePliteGeneric

#' @rdname SWeePlite
setMethod("SWeePlite", "AAStringSet",  .SWeePliteFASTAinRAM) 
#' @rdname SWeePlite
setMethod("SWeePlite", "DNAStringSet", .SWeePliteFASTAinRAM) 
#' @rdname SWeePlite
setMethod("SWeePlite", "RNAStringSet", .SWeePliteFASTAinRAM) 
#' @rdname SWeePlite
setMethod("SWeePlite", "BStringSet",   .SWeePliteFASTAinRAM) 
#' @rdname SWeePlite
setMethod("SWeePlite", "BString",      .SWeePliteFASTAinRAM) 
#' @rdname SWeePlite
setMethod("SWeePlite", "character",    .SWeePliteFASTAfromFolder) 
#' @rdname SWeePlite
setMethod("SWeePlite", "array",        .SWeePliteGeneric) # for RNAseq, but (original note is truncated)
#' @rdname SWeePlite
setMethod("SWeePlite", "integer",      .SWeePliteGeneric) 
#' @rdname SWeePlite
setMethod("SWeePlite", "matrix",       .SWeePliteGeneric) 
#' @rdname SWeePlite
setMethod("SWeePlite", "dgCMatrix",    .SWeePliteGeneric) # would the array type be enough? (asked because the format is not recognized)



























#' @title Generate an orthonormal matrix 
#' @name orthBase
#'
#' @description Generate an orthonormal matrix for the specified parameters, to be used by the ´SWeeP´ function
#'
#' @param lin Number of rows in the desired matrix. 
#' @param col Number of columns in the desired matrix, which means projection size (psz)
#' @param mask      reading mask. Use this option or `lin' option. Default c(2,1,2).
#' @param seqtype   type of data: AA for amino acid, NT for nucleotide. Parameter required if a mask is provided. The default is ´AA´
#' @param seed   provide, if necessary, a seed to generate the matrix. The default is 647474747
#'
#' @return An orthonormal matrix (basis) whose dimensions correspond to the given mask
#'         to be used and a desired projection size (length of the output vector). 
#'         The basis must be supplied to the function \link{SWeeP} (see examples).
#'
#'         `orthBase' returns a `list` containing:
#' \itemize{
#'          \item mat: the orthonormal matrix (basis)
#'          \item seed: the random seed (metadata to identify the matrix)
#'          \item version: the rSWeeP version
#'         }
#'
#' @author Camila P. Perico
#'
#' @seealso \code{\link{SWeeP}}
#' @examples
#' 
#' # define the mask - determines the length of input vector (20^4 = 160000)
#' mask <- c(2,1,2) 
#' 
#' # define the length of output vector
#' psz <- 600
#' 
#' # get the basis matrix to projection
#' Mybase <- orthBase(mask = mask, col = psz,seqtype='AA')
#' 
#' @import methods  utils
#' @export
orthBase <- function(lin=NULL, col,seqtype='AA',mask = c(2,1,2),seed=NULL) {

    # a mask given in compact form (some entry > 1) is expanded by convertMask()
    if (max(mask)>1){
        mask = convertMask(mask)
    }

    # always the same matrix: fall back to the fixed default seed
    if (is.null(seed)){
        seed = 647474747
    }
    set.seed(seed)

    if( length(lin) == 0 ){    # the user gave the mask, not the number of lines

        SW.checks('mask',mask)

        # number of rows = alphabet size ^ sum(mask)
        if (seqtype=='AA'){
            lin = 20^sum(mask)
        } else if(seqtype=='NT'){
            lin = 4^sum(mask)
        } else {
            # without this branch `lin` stays NULL and the failure only shows
            # up later as an obscure sequence error
            stop("Invalid 'seqtype'. Use seqtype='AA' for amino acids and seqtype='NT' for nucleotides.")
        }
    }

    # seq_len() gives an empty index for lin = 0 (1:0 would give c(1, 0))
    idx = seq_len(lin)
    pslist = c(2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229)
    nps = length(pslist)
    # random weights: one row per entry of pslist, one column per output dimension
    Mproj =  matrix(stats::runif(nps * col), ncol=col)
    xnorm = sqrt(lin/3)

    # mret[i, j] = idx[i] %% pslist[j]
    mret = outer(idx,pslist,"%%")
    dt = (1+(mret))%*%Mproj
    # keep the fractional part and rescale it from [0, 1) to [-1, 1)
    bs = ((dt - floor(dt))-0.5)/0.5

    output=NULL
    output$mat = bs/xnorm
    output$seed = seed
    output$version = 'SWeeP v2.9'

    return(output)
}

















#' @title Function for obtaining the HDV (High Dimensional Vector) matrix
#' @name extractHDV
#' @description Function for obtaining the HDV matrix without projecting it to a low dimensional vector (LDV). 
#' Each line of the HDV corresponds to the counting of k-mers of a biological sequence, 
#' organized in a structured way.
#' 
#' @param input There are two input formats available:
#'              (a) `BStringSet' (variants: `AAStringSet', `RNAStringSet', `DNAStringSet'). Biological sequence format loaded in memory;
#'              (b) `character'. String containing a path to a folder with FASTA files.
#' @param mask      reading mask. Default for amino acids is `c(2,1,2)` and for nucleotides `c(5,5,5)`
#' @param seqtype   type of data: AA for amino acid, NT for nucleotide. The default is `AA`
#' @param bin       binary mode (TRUE), or counting mode (FALSE) for HDV construction. Default is FALSE
#' @param extension         extension of files desired to concatenate (Optional).   Available only for input type path to folder with FASTA files.
#' @param concatenate   defines whether to treat each sequence individually or to concatenate them into a single sequence. 
#'                  Available only for inputs in biological sequence format. The default is FALSE.
#' @param verbose   verbose mode. The default is TRUE
#' @param ...       other arguments of the function itself
#' 
#' @return 
#' `extractHDV' returns a `list` containing:
#' \itemize{
#'   \item HDV: a `matrix' containing the High Dimensional Vectors of the given FASTAS
#'   \item info: additional information about the process. This object is subdivided into: 
#'   \itemize{
#'       \item headers: a `character' containing the list of samples 
#'       \item mask: a `integer' containing the mask used
#'       \item SequenceType: a `character' containing the type of the sequence (amino acid: AA, or nucleotide: NT)
#'       \item extension: a `character' containing the list of extensions considered
#'       \item concatenate : a boolean corresponding to the concatenation of sequences
#'       \item bin: a `character' containing if binary or counting
#'       \item version : a character corresponding to the version of the package
#'       \item saturation: a `vector' containing the filled (non-zero) percentage of the HDV for each sample
#'       \item timeElapsed: a `double' containing the elapsed time in seconds
#'    } 
#' } 
#' 
#' @examples
#' 
#' # get the path to the folder containing the FASTA files
#' path = paste (system.file("examples/aaMitochondrial/",package = "rSWeeP"),'/', sep = '')
#' 
#' # define the parameters
#' mask = c(2,1,2)
#' 
#' # get the vectors that represent the sequences in high dimension (without projection)
#' HDV = extractHDV(input=path,mask=mask,seqtype='AA',bin=FALSE,extension=c('.faa','.fas','.fasta'))
#' 
#' 
#' @rdname extractHDV
#' @import foreach doParallel methods  utils
#' @importFrom Biostrings readBStringSet
#' @importFrom Biostrings BStringSet
#' @export
setGeneric(
    name = "extractHDV",
    # The body is the bare standardGeneric() call; extra arguments of the
    # individual methods travel through `...`.
    def = function(input, mask = NULL, seqtype = 'AA', ...) standardGeneric("extractHDV"),
    # Method dispatch looks at the class of `input` only.
    signature = c("input")
)
# setMethod("extractHDV", "character", function(path, extension,mask,seqtype,bin,verbose) {

.extractHDVfromFolder <-function(input, mask=NULL,seqtype='AA',bin=FALSE,extension='',verbose=TRUE){
    # Builds the HDV matrix for a folder of FASTA files: every file found
    # (recursively) under `input` whose name matches `extension` becomes one row.
    # Returns the object created by createSWobjHDV() with the HDV rows filled in,
    # plus info$saturation (fraction of non-zero entries per row) and
    # info$timeElapsed (elapsed seconds).

    start_time <- proc.time()

    mask <- Defmask(mask, seqtype)

    ## Create the lists of files and names
    # list.files() only honours the FIRST element of `pattern`, so several
    # extensions are merged into one alternation. A single extension is passed
    # through untouched, exactly as before.
    file_pattern <- extension
    if (is.character(extension) && length(extension) > 1) {
        file_pattern <- paste(extension, collapse = '|')
    }
    fastalist <- list.files(input, pattern = file_pattern, full.names = TRUE, recursive = TRUE)
    namesfasta <- list.files(input, pattern = file_pattern, full.names = FALSE, recursive = TRUE)
    # remove extension from filename
    namesfasta <- tools::file_path_sans_ext(namesfasta)


    ## input checks ---------------------- vv
    SW.checks('extension', extension)
    SW.checks('mask', mask)
    SW.checks('seqtype', seqtype)
    SW.checks('bin', bin)
    SW.checks('fastalist', fastalist)
    ## input checks ---------------------- ^^

    ## some parameters ---------------------------------
    # spacer between fastas of same individual: one '*' per mask position
    spacer <- paste(rep("*", length(mask)), collapse = '')
    # define maximum length of HDV (alphabet size to the power of the mask sum)
    if (seqtype == 'AA') {
        lenmax <- 20^sum(mask)
    } else if (seqtype == 'NT') {
        lenmax <- 4^sum(mask)
    }

    mask <- as.integer(mask)

    # number of sequences
    N <- length(fastalist)
    N_ <- as.character(N)

    # empty output matrix
    output <- createSWobjHDV(N, lenmax, namesfasta, mask, bin, seqtype, extension, FALSE)

    # for each fasta file, sweep (seq_len() keeps the loop empty when N is 0)
    for (k in seq_len(N)) {
        if (verbose) {
            cat(paste('starting sequence ',as.character(k),'of', N_))
        }
        # presumably reads the file and joins its records with the spacer -
        # confirm against concatenaEach / seq2num
        actualfasta <- concatenaEach(fastalist[k], spacer)
        seq <- seq2num(actualfasta, seqtype)

        output$HDV[k,] <- readmaskVEC(seq, mask, seqtype, bin, 'none', lenmax)

        # current sequence done
        if (verbose) {
            cat(paste(' - complete\n'))
        }
    }

    # fraction of non-zero HDV positions per sample
    if (N > 1) {
        output$info$saturation <- rowSums(output$HDV != 0) / lenmax
    } else {
        output$info$saturation <- sum(output$HDV != 0) / lenmax
    }

    end_time <- proc.time()
    output$info$timeElapsed <- (end_time - start_time)[[3]]
    return(output)

}



 .extractHDVinRAM<-function(input, mask=NULL,seqtype=NULL,bin=FALSE,concatenate=FALSE, verbose=TRUE){
    # Builds the HDV matrix for sequences already loaded in memory: one row per
    # element of `input`, or a single row when `concatenate` is TRUE.
    # Returns the object created by createSWobjHDV() with the HDV rows filled in,
    # plus info$saturation (fraction of non-zero entries per row) and
    # info$timeElapsed (elapsed seconds).
    start_time <- proc.time()

    # The sequence type is taken from the container class when it is specific;
    # only for other classes does the caller have to supply it.
    if (inherits(input, "AAStringSet")) {
        seqtype <- 'AA'
    } else if (inherits(input, "RNAStringSet") || inherits(input, "DNAStringSet")) {
        seqtype <- 'NT'
    } else if (is.null(seqtype)) {
        stop("Please provide the type of data. Use seqtype='AA' for amino acids and seqtype='NT' for nucleotides. ")
    }

    mask <- Defmask(mask, seqtype)


    ## input checks ---------------------- vv
    SW.checks('mask', mask)
    SW.checks('seqtype', seqtype)
    SW.checks('bin', bin)
    SW.checks('concatenate', concatenate)
    ## input checks ---------------------- ^^


    if (concatenate) { # concatenate with spacer
        # all sequences are joined into one string, separated by one '*' per mask position
        spacer <- paste(rep("*", length(mask)), collapse = '')
        input <- stringi::stri_join_list(list(paste(input)), sep = spacer, collapse = NULL) # require stringi
    }

    # PARAMETERS --------------
    N <- length(input) # number of sequences
    N_ <- as.character(N)
    lenmax <- if (seqtype == 'AA') 20^sum(mask) else if (seqtype == 'NT') 4^sum(mask)
    mask <- as.integer(mask)


    # empty output matrix
    output <- createSWobjHDV(N, lenmax, names(input), mask, bin, seqtype, '', concatenate)

    # seq_len() keeps the loop empty for an empty input instead of visiting
    # the non-existent elements 1 and 0
    for (k in seq_len(N)) {
        if (verbose) { cat(paste('starting sequence ',as.character(k),'of', N_)) }

        seq <- seq2num(input[k], seqtype)

        output$HDV[k,] <- readmaskVEC(seq, mask, seqtype, bin, 'none', lenmax)

        # current sequence done
        if (verbose) { cat(paste(' - complete\n')) }

    } # end for k in N


    # fraction of non-zero HDV positions per sample
    if (N > 1) {
        output$info$saturation <- rowSums(output$HDV != 0) / lenmax
    } else {
        output$info$saturation <- sum(output$HDV != 0) / lenmax
    }

    end_time <- proc.time()
    output$info$timeElapsed <- (end_time - start_time)[[3]]

    return(output)

}



# S4 method registration for the extractHDV generic (dispatch is on `input`):
# the string-container classes are served by .extractHDVinRAM, while a plain
# `character` input (path to a folder with FASTA files) goes to
# .extractHDVfromFolder.
#' @rdname extractHDV
setMethod("extractHDV", "AAStringSet",  .extractHDVinRAM) 
#' @rdname extractHDV
setMethod("extractHDV", "DNAStringSet", .extractHDVinRAM) 
#' @rdname extractHDV
setMethod("extractHDV", "RNAStringSet", .extractHDVinRAM) 
#' @rdname extractHDV
setMethod("extractHDV", "BStringSet",   .extractHDVinRAM) 
#' @rdname extractHDV
setMethod("extractHDV", "BString",      .extractHDVinRAM) 
#' @rdname extractHDV
setMethod("extractHDV", "character",    .extractHDVfromFolder) 













# Phylogenetic tree evaluation functions ==============================


#' @title  PhyloTaxonomic Consistency Cophenetic Index
#' @name PCCI
#'
#' @description Phylogenetic tree evaluation function, estimate of how grouped 
#'              the samples of the same taxon are in the phylogenetic tree.
#'
#' @param tr        Phylogenetic tree. If the tree contains sample names in the labels, provide
#'              metadata. If it already contains the names of the taxa, just provide the tree.
#' @param mt    Metadata. The metadata should have the following format: 
#'              the first column should contain the names of the samples, 
#'              exactly as they appear on the tree label; 
#'              the second column should contain the corresponding taxa.
#'              If the tree already has the labels renamed according to the taxon, 
#'              it is not necessary to provide metadata.
#'
#' @return The PCCI index for each taxon and the mean value
#'
#'         `PCCI' returns a `list` containing:
#' \itemize{
#'          \item tab: the PCCI value for each taxon in a two-column output: taxa and cost
#'          \item mean: the mean value of PCCI metric
#'         }
#'
#' @author Camila P. Perico
#'
#' @details
#' Empty or NA labels are removed from analyses 
#'
#' @examples
#' 
#' # Load the sample tree and its metadata
#' pathtree <- system.file(package = "rSWeeP" , "examples" , "tree_Mitochondrial.tree")
#' tree = ape::read.tree(pathtree)
#' pathmetadata <- system.file(package = "rSWeeP" , "examples" , "metadata_mitochondrial.csv")
#' mt = read.csv(pathmetadata,header=TRUE)
#' 
#' data = data.frame(sp=mt$fileName,family=mt$family) 
#' PCCI(tree,data)
#' 
#' @import methods  utils stats
#' @export
PCCI <- function(tr,mt=NULL){
    # Either the tree tips are already named by taxon, or metadata is supplied
    # to rename them (column 1: sample names as in the tree, column 2: taxa).
    # The relabelled tree is then handed to quebrataxonCOPHE(), whose result is
    # returned unchanged.

    # has metadata?
    if (!is.null(mt)) {
        # position of every tip label in the sample-name column
        # (NA when a tip is not listed in the metadata)
        idx <- match(tr$tip.label, mt[, 1])
        tr$tip.label <- mt[idx, 2]

        # labels match metadata? `idx` is NA for tree tips missing from the
        # metadata, so that is what gets reported.
        n_unmatched <- sum(is.na(idx))
        if (n_unmatched > 0) {
            warning('There are ', n_unmatched,
                    ' tip labels in the tree that have no match in the metadata.',
                    call. = FALSE)
        }
    }

    # trees without branch lengths get unit-length edges
    if (is.null(tr$edge.length)) {
        tr$edge.length <- rep(1, nrow(tr$edge))
    }

    quebrataxonCOPHE(tr)
}








#' @title  Percentage of Mono or Paraphyletic Groups
#' @name PMPG
#'
#' @description Phylogenetic tree evaluation function, returns the percentage of Monophyletic
#'              and Paraphyletic taxa in the phylogenetic tree.
#'
#' @param tr        Phylogenetic tree. If the tree contains sample names in the labels, provide
#'              metadata. If it already contains the names of the taxa, just provide the tree.
#' @param mt    Metadata. The metadata should have the following format: 
#'              the first column should contain the names of the samples, 
#'              exactly as they appear on the tree label; 
#'              the second column should contain the corresponding taxa.
#'              If the tree already has the labels renamed according to the taxon, 
#'              it is not necessary to provide metadata.
#'
#' @return The `PMPG' returns a `list` containing:
#' \itemize{
#'          \item tab: a dataframe with a three-column output: taxa, mono and para. 
#'                     `mono` and `para` columns returns a boolean value. 
#'          \item percMono: percentage of Monophyletic taxa
#'          \item percPara: percentage of Paraphyletic taxa
#'          \item mean: the mean value of `percMono` and `percPara`
#'         }
#'
#' @author Camila P. Perico
#'
#' @details
#' Empty or NA labels are removed from analyses 
#'
#' @examples
#' 
#' # Load the sample tree and its metadata
#' pathtree <- system.file(package = "rSWeeP" , "examples" , "tree_Mitochondrial.tree")
#' tree = ape::read.tree(pathtree)
#' pathmetadata <- system.file(package = "rSWeeP" , "examples" , "metadata_mitochondrial.csv")
#' mt = read.csv(pathmetadata,header=TRUE)
#' 
#' data = data.frame(sp=mt$fileName,family=mt$family) 
#' PMPG(tree,data)
#' 
#' @import methods  utils stats
#' @export
PMPG <- function(tr,mt=NULL){
    # Either the tree tips are already named by taxon, or metadata is supplied
    # to rename them (column 1: sample names as in the tree, column 2: taxa).
    # The relabelled tree is then handed to MonoParaphylMetric(), whose result
    # is returned unchanged.

    if (!is.null(mt)) {
        # position of every tip label in the sample-name column
        # (NA when a tip is not listed in the metadata)
        idx <- match(tr$tip.label, mt[, 1])
        tr$tip.label <- mt[idx, 2]

        # labels match metadata? `idx` is NA for tree tips missing from the
        # metadata, so that is what gets reported.
        n_unmatched <- sum(is.na(idx))
        if (n_unmatched > 0) {
            warning('There are ', n_unmatched,
                    ' tip labels in the tree that have no match in the metadata.',
                    call. = FALSE)
        }
    }

    MonoParaphylMetric(tr)

}