### =========================================================================
### Handling of igblastn() command-line arguments
### -------------------------------------------------------------------------
###
### Nothing in this file is exported.
###


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .normarg_germline_db_X() and .normarg_c_region_db()
###

### The 'germline_db_[VDJ]' and 'c_region_db' arguments of the 'igblastn'
### standalone executable must be "database names" or, more precisely,
### paths to local "blast dbs".
### Note that the path to a local "blast db" must be supplied as something
### that **looks** like the path to a local file, except that, instead of
### pointing to an existing file, it points to a collection of files that
### makes up the "blast db". More precisely, the path to a local "blast db"
### looks like this:
###   1. Its dirname() part must point to the directory where the "blast db"
###      is located.
###   2. Its basename() part must be the name of an existing "blast db"
###      located in the dirname() directory. This "blast db" consists of
###      a collection of 10 binary files that were usually produced by
###      the 'makeblastdb' standalone executable. All these files are
###      expected to have a name of the form <bdb-name>.<suffix> where
###      <bdb-name> is the name of the "blast db" and <suffix> a 3 letter
###      suffix (see .BLASTDB_SUFFIXES in R/make_blastdbs.R for the list
###      of all known suffixes).
.normalize_bdb_path <- function(bdb_path, region_type)
{
    stopifnot(isSingleNonWhiteString(bdb_path))
    if (dir.exists(bdb_path)) {
        ## 'bdb_path' contains no "blast db" name so we default
        ## to 'region_type' (i.e. V, D, J, or C) for the "blast db" name.
        bdb_dirname <- bdb_path
        bdb_name <- region_type
    } else {
        bdb_dirname <- dirname(bdb_path)
        if (!dir.exists(bdb_dirname))
            stop(wmsg(bdb_dirname, ": no such directory"))
        bdb_name <- basename(bdb_path)
    }
    bdb_dirname <- file_path_as_absolute(bdb_dirname)
    file.path(bdb_dirname, bdb_name)
}

.normarg_germline_db_X <- function(germline_db_X, region_type)
{
    stopifnot(isSingleNonWhiteString(region_type))
    argname <- paste0("germline_db_", region_type)
    what <- paste0(region_type, "-region db")
    if (!isSingleNonWhiteString(germline_db_X))
        stop(wmsg("'", argname, "' must be \"auto\" or a single string ",
                  "containing the path to a ", what))
    if (germline_db_X != "auto")
        return(.normalize_bdb_path(germline_db_X, region_type))
    db_name <- use_germline_db()  # cannot be ""
    db_path <- germline_db_path(db_name)
    file.path(db_path, region_type)
}

### Can return a NULL.
.normarg_c_region_db <- function(c_region_db)
{
    if (is.null(c_region_db))
        return(NULL)
    what <- "C-region db"
    if (!isSingleNonWhiteString(c_region_db))
        stop(wmsg("'c_region_db' must be \"auto\", NULL, or a single string ",
                  "containing the path to a ", what))
    if (c_region_db != "auto")
        return(.normalize_bdb_path(c_region_db, "C"))
    db_name <- use_c_region_db()  # can be ""
    if (db_name == "")
        return(NULL)
    db_path <- c_region_db_path(db_name)
    file.path(db_path, "C")
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .normarg_seqidlist()
###

.normarg_seqidlist_file <- function(seqidlist)
{
    stopifnot(inherits(seqidlist, "file"))
    path <- summary(seqidlist)$description
    ## Check that the file exists and is readable.
    res <- try(suppressWarnings(open(seqidlist, open="r")), silent=TRUE)
    if (inherits(res, "try-error"))
        stop(wmsg("unable to open file '", path, "' for reading"))
    close(seqidlist)
    path
}

.normarg_seqidlist_character <- function(seqidlist)
{
    stopifnot(is.character(seqidlist))
    if (anyNA(seqidlist))
        stop(wmsg("character vector 'seqidlist' contains NAs"))
    seqidlist <- trimws2(seqidlist)
    if (!all(nzchar(seqidlist)))
        stop(wmsg("character vector 'seqidlist' contains ",
                  "empty or white strings"))
    path <- tempfile()
    attr(path, "safe_to_remove") <- TRUE
    writeLines(seqidlist, path)
    path
}

.germline_db_is_internal <- function(db_path)
{
    stopifnot(isSingleNonWhiteString(db_path), dir.exists(db_path))
    db_path <- file_path_as_absolute(db_path)
    dirname(db_path) == get_germline_dbs_home()
}

.region_db_is_internal <- function(region_db_path)
{
    stopifnot(isSingleNonWhiteString(region_db_path))
    .germline_db_is_internal(dirname(region_db_path)) &&
        basename(region_db_path) %in% VDJ_REGION_TYPES
}

### Assumes that 'region_db_path' is pointing to an **internally** managed
### region db.
.make_obtain_valid_seqids_Rcode <- function(region_db_path)
{
    stopifnot(isSingleNonWhiteString(region_db_path))
    db_name <- basename(dirname(region_db_path))
    region_type <- basename(region_db_path)
    code <- sprintf("load_germline_db(\"%s\", region_types=\"%s\")",
                    db_name, region_type)
    sprintf("names(%s)", code)
}

.check_user_seqids <- function(user_seqids, region_db_path, region_type)
{
    fasta_file <- paste0(region_db_path, ".fasta")
    sequences <- readDNAStringSet(fasta_file)
    valid_seqids <- names(sequences)
    invalid_seqids <- setdiff(user_seqids, valid_seqids)
    if (length(invalid_seqids) != 0L) {
        what <- paste0(region_type, "-region db")
        in1string <- paste0(invalid_seqids, collapse=", ")
        msg1 <- c("Sequence id(s) not found in ", what, ": ", in1string)
        msg2a <- "Note that you can use:"
        code <- .make_obtain_valid_seqids_Rcode(region_db_path)
        msg2b <- "to obtain the list of valid germline sequence ids."
        stop(wmsg(msg1), "\n  ",
             wmsg(msg2a), "\n    ", code, "\n  ", wmsg(msg2b))
    }
}

### Can return a NULL.
.normarg_seqidlist <- function(seqidlist, region_db_path,
                               region_type=VDJ_REGION_TYPES)
{
    region_type <- match.arg(region_type)
    if (is.null(seqidlist))
        return(NULL)
    if (inherits(seqidlist, "file")) {
        path <- .normarg_seqidlist_file(seqidlist)
    } else if (is.character(seqidlist)) {
        path <- .normarg_seqidlist_character(seqidlist)
    } else {
        stop(wmsg("'seqidlist' must be NULL, a 'file' object, ",
                  "or a character vector"))
    }
    user_seqids <- readLines(path)
    ## We only check the user-supplied sequence ids if the FASTA file
    ## associated with 'region_db_path' is guaranteed to exist and
    ## be in sync with the region db itself, which is only the case
    ## if 'region_db_path' points to an **internally** managed region db.
    if (.region_db_is_internal(region_db_path))
        .check_user_seqids(user_seqids, region_db_path, region_type)
    path
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .normarg_custom_internal_data()
###

.check_user_supplied_custom_internal_data <-
    function(custom_internal_data, germline_db_V)
{
    if (!file.exists(custom_internal_data))
        stop(wmsg("custom FWR/CDR annotation file '",
                  custom_internal_data, "' does not exist"))
    intdata <- try(read_V_ndm_data(custom_internal_data), silent=TRUE)
    if (inherits(intdata, "try-error"))
        stop(wmsg("custom FWR/CDR annotation file '",
                  custom_internal_data, "' cannot be loaded"))
    if (!all(check_V_ndm_data(intdata, allow.dup.entries=TRUE)))
        warning(wmsg("some V alleles in custom FWR/CDR annotation file '",
                     custom_internal_data, "' are not properly annotated"),
                immediate.=TRUE)

    ## We only perform this last check if the FASTA file
    ## associated with 'germline_db_V' is guaranteed to exist and
    ## be in sync with the V-region db itself, which is only the case
    ## if 'germline_db_V' points to an **internally** managed region db.
    if (.region_db_is_internal(germline_db_V)) {
        V_fasta_file <- paste0(germline_db_V, ".fasta")
        V_names <- names(readDNAStringSet(V_fasta_file))
        if (!all(V_names %in% intdata[ , "allele_name"])) {
            ## Note that 'basename(dirname(germline_db_V))' is not absolutely
            ## guaranteed to be the same as the name of the selected germline
            ## db so don't use use_germline_db() here.
            db_name <- basename(dirname(germline_db_V))
            warning(wmsg("not all V alleles in germline db ", db_name, " are ",
                         "annotated in custom FWR/CDR annotation file '",
                         custom_internal_data, "'"),
                    immediate.=TRUE)
        }
    }
}

.get_auto_custom_internal_data <- function(num_auto_germline_dbs, domain_system)
{
    ## If the user supplied all 3 germline_db_X arguments, then it makes no
    ## sense to try and use the internal data included in the cached germline
    ## db that is currently selected. Note that use_germline_db() is not even
    ## guaranteed to work because there's no guarantee that the user has
    ## selected a germline db yet.
    if (num_auto_germline_dbs == 0L)
        return(NULL)
    db_name <- use_germline_db()  # cannot be ""
    intdata_dir <- file.path(germline_db_path(db_name), "internal_data")
    if (!dir.exists(intdata_dir))
        return(NULL)
    intdata_filename <- paste0("V.ndm.", domain_system)
    intdata_path <- file.path(intdata_dir, intdata_filename)
    if (!file.exists(intdata_path)) {
        warning(wmsg("internal data file ", intdata_filename, " ",
                     "not found in germline db ", db_name, " --> ",
                     "using IgBLAST internal data from NCBI instead"),
                immediate.=TRUE)
        return(NULL)
    }
    intdata_path
}

### Can return a NULL.
.normarg_custom_internal_data <- function(custom_internal_data="auto",
                                          germline_db_V,
                                          num_auto_germline_dbs, domain_system)
{
    if (is.null(custom_internal_data))
        return(NULL)
    if (!isSingleNonWhiteString(custom_internal_data))
        stop(wmsg("'custom_internal_data' must be \"auto\", or a single ",
                  "string that is the path to a file containing custom ",
                  "FWR/CDR annotation, or NULL"))
    if (custom_internal_data != "auto") {
        .check_user_supplied_custom_internal_data(custom_internal_data,
                                                  germline_db_V)
        return(file_path_as_absolute(custom_internal_data))
    }
    .get_auto_custom_internal_data(num_auto_germline_dbs, domain_system)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .normarg_organism()
###

### Is it really true that "any specified organism will be ignored" if
### the -custom_internal_data parameter is supplied, as they claim here?
### https://ncbi.github.io/igblast/cook/How-to-set-up.html (see Procedure
### to use custom FWR/CDR annotation). But then how is igblastn supposed to
### find the auxiliary data?
.normarg_organism <- function(organism="auto", num_auto_germline_dbs,
                              custom_internal_data)
{
    if (!isSingleNonWhiteString(organism))
        stop(wmsg("'organism' must be a single (non-empty) string"))
    if (organism != "auto") {
        #if (!is.null(custom_internal_data))
        #    warning(wmsg("the specified 'organism' (", organism, ") is ",
        #                 "ignored when 'custom_internal_data' is specified"),
        #            immediate.=TRUE)
        return(normalize_igblast_organism(organism))
    }
    #if (!is.null(custom_internal_data))
    #    return(NULL)
    if (num_auto_germline_dbs == 0L)
        stop(wmsg("'organism' must be specified when 'germline_db_V', ",
                  "'germline_db_D', and 'germline_db_J' are supplied"))
    germline_db_name <- use_germline_db()  # cannot be ""
    organism <- infer_organism_shortname_from_db_name(germline_db_name)
    if (is.na(organism))
        stop(wmsg("Don't know how to infer 'organism' from germline ",
                  "db name \"", germline_db_name, "\". Please set ",
                  "the 'organism' argument to the name of the IgBLAST ",
                  "internal data to use. ",
                  "Use list_igblast_organisms() to list all valid names."))
    organism
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .normarg_auxiliary_data()
###

### Can return a NULL.
.normarg_auxiliary_data <- function(auxiliary_data="auto", organism)
{
    if (is.null(auxiliary_data))
        return(NULL)
    if (!isSingleNonWhiteString(auxiliary_data))
        stop(wmsg("'auxiliary_data' must be \"auto\", or a single string ",
                  "that is the path to a file containing the coding frame ",
                  "start positions for the sequences in the J-region db",
                  "or NULL"))
    if (auxiliary_data == "auto")
        return(get_auxdata_path(organism))
    file_path_as_absolute(auxiliary_data)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .normarg_ig_seqtype()
###

.infer_ig_seqtype_from_db_name <- function(germline_db_name)
{
    stopifnot(isSingleNonWhiteString(germline_db_name))
    if (grepl(".IG", germline_db_name, fixed=TRUE))
        return("Ig")
    if (grepl(".TR", germline_db_name, fixed=TRUE))
        return("TCR")
    stop(wmsg("Don't know how to infer 'ig_seqtype' from germline ",
              "db name \"", germline_db_name, "\". Please set ",
              "the 'ig_seqtype' argument to \"Ig\" or \"TCR\"."))
}

.normarg_ig_seqtype <- function(ig_seqtype="auto", num_auto_germline_dbs)
{
    err_msg <- "'ig_seqtype' must be \"auto\", \"Ig\", or \"TCR\""
    if (!isSingleNonWhiteString(ig_seqtype))
        stop(wmsg(err_msg))
    if (ig_seqtype != "auto") {
        t <- c(Ig="ig", Ig="bcr", TCR="tcr")
        m <- pmatch(tolower(ig_seqtype), t)
        if (is.na(m))
            stop(wmsg(err_msg))
        return(names(t)[[m]])
    }
    if (num_auto_germline_dbs == 0L)
        stop(wmsg("'ig_seqtype' must be specified when 'germline_db_V', ",
                  "'germline_db_D', and 'germline_db_J' are supplied"))
    germline_db_name <- use_germline_db()  # cannot be ""
    .infer_ig_seqtype_from_db_name(germline_db_name)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .extra_args_as_named_character()
###

### Turns extra args into a named character vector with no NAs.
.extra_args_as_named_character <- function(...)
{
    xargs <- list(...)
    if (length(xargs) == 0L)
        return(setNames(character(0), character(0)))
    xargs_names <- names(xargs)
    if (is.null(xargs_names) || !all(nzchar(xargs_names)))
        stop(wmsg("extra arguments must be named"))
    if (any(has_whitespace(xargs_names)))
        stop(wmsg("argument names cannot contain whitespaces"))
    dupidx <- anyDuplicated(xargs_names)
    if (dupidx != 0L)
        stop(wmsg("argument '", xargs_names[[dupidx]], "' ",
                  "is defined more than once"))
    ## as.character() seems to ba able to handle any list, even nested ones,
    ## and to always return a character vector parallel to the input list.
    ## That is, it always produces a character vector with one element per
    ## top-level element in the input list. Also it doesn't propagate NAs as
    ## such but turns them into the "NA" string instead.
    setNames(as.character(xargs), xargs_names)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .check_igblastn_extra_args()
###

### 'xargs' must be a named character vector as returned by
### .extra_args_as_named_character() above.
.check_igblastn_extra_args <- function(xargs)
{
    stopifnot(is.character(xargs), !anyNA(xargs))
    xargs_names <- names(xargs)
    stopifnot(!is.null(xargs_names), !anyDuplicated(xargs_names))

    if (FALSE) {
      todo_url <- "https://github.com/HyrienLab/igblastr/blob/devel/TODO"

      ## Check 'ig_seqtype' arg (see TODO file for the details).
      idx <- match("ig_seqtype", xargs_names)
      if (!is.na(idx)) {
        ig_seqtype <- xargs[[idx]]
        if (ig_seqtype == "TCR")
            stop(wmsg("'ig_seqtype=\"TCR\"' is not supported ",
                      "at the moment (see ", todo_url, ")"))
      }
    }
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .normarg_out()
###

### Must return an **absolute** path.
.normarg_out <- function(out)
{
    if (is.null(out)) {
        out <- tempfile("igblastn_out_", fileext=".txt")
        attr(out, "safe_to_remove") <- TRUE
        return(out)
    }
    if (!isSingleNonWhiteString(out))
        stop(wmsg("'out' must be NULL or a single (non-empty) string"))
    dirpath <- dirname(out)
    if (!dir.exists(dirpath))
        stop(wmsg(dirpath, ": no such directory"))
    dirpath <- file_path_as_absolute(dirpath)
    out <- file.path(dirpath, basename(out))
    if (file.exists(out))
        stop(wmsg(out, ": file exists"))
    out
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### make_igblastn_command_line_args()
###

### Returns the arguments in a named list of single strings where the names
### are valid igblastn command line argument names (e.g. "organism") and the
### valid argument values (e.g. "rabbit").
### Note that the reason we return a list and not a character vector is
### because we sometimes need to put attributes on some of the strings
### e.g. sometimes we need to put the "safe_to_remove" attribute on
### the 'germline_db_[VDJ]_seqidlist' arguments.
make_igblastn_command_line_args <-
    function(query, outfmt="AIRR",
             germline_db_V="auto", germline_db_V_seqidlist=NULL,
             germline_db_D="auto", germline_db_D_seqidlist=NULL,
             germline_db_J="auto", germline_db_J_seqidlist=NULL,
             organism="auto", c_region_db="auto",
             custom_internal_data="auto", auxiliary_data="auto",
             domain_system=c("imgt", "kabat"), ig_seqtype="auto",
             ...,
             out=NULL)
{
    stopifnot(isSingleNonWhiteString(query),
              isSingleNonWhiteString(outfmt))

    ## It will only make sense to infer the organism from the cached
    ## germline db returned by use_germline_db() if the user didn't
    ## supply all 3 germline_db_X arguments, that is,
    ## if 'num_auto_germline_dbs' != 0.
    num_auto_germline_dbs <- identical(germline_db_V, "auto") +
                             identical(germline_db_D, "auto") +
                             identical(germline_db_J, "auto")

    germline_db_V <- .normarg_germline_db_X(germline_db_V, "V")
    germline_db_V_seqidlist <- .normarg_seqidlist(germline_db_V_seqidlist,
                                                  germline_db_V, "V")
    germline_db_D <- .normarg_germline_db_X(germline_db_D, "D")
    germline_db_D_seqidlist <- .normarg_seqidlist(germline_db_D_seqidlist,
                                                  germline_db_D, "D")
    germline_db_J <- .normarg_germline_db_X(germline_db_J, "J")
    germline_db_J_seqidlist <- .normarg_seqidlist(germline_db_J_seqidlist,
                                                  germline_db_J, "J")
    c_region_db <- .normarg_c_region_db(c_region_db)
    domain_system <- match.arg(domain_system)
    custom_internal_data <- .normarg_custom_internal_data(custom_internal_data,
                                                          germline_db_V,
                                                          num_auto_germline_dbs,
                                                          domain_system)
    organism <- .normarg_organism(organism, num_auto_germline_dbs,
                                  custom_internal_data)
    auxiliary_data <- .normarg_auxiliary_data(auxiliary_data, organism)
    ig_seqtype <- .normarg_ig_seqtype(ig_seqtype, num_auto_germline_dbs)

    cmd_args <- list(query=query, outfmt=outfmt,
                     germline_db_V=germline_db_V,
                     germline_db_V_seqidlist=germline_db_V_seqidlist,
                     germline_db_D=germline_db_D,
                     germline_db_D_seqidlist=germline_db_D_seqidlist,
                     germline_db_J=germline_db_J,
                     germline_db_J_seqidlist=germline_db_J_seqidlist,
                     organism=organism,
                     c_region_db=c_region_db,
                     custom_internal_data=custom_internal_data,
                     auxiliary_data=auxiliary_data,
                     domain_system=domain_system,
                     ig_seqtype=ig_seqtype)

    xargs <- .extra_args_as_named_character(...)
    .check_igblastn_extra_args(xargs)

    out <- .normarg_out(out)

    S4Vectors:::delete_NULLs(c(cmd_args, as.list(xargs), list(out=out)))
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### make_exe_args()
###

### 'cmd_args' must be a named list of single strings character as
### returned by make_igblastn_command_line_args() above.
### Returns an **unnamed** character vector parallel to 'cmd_args'.
make_exe_args <- function(cmd_args)
{
    stopifnot(is.list(cmd_args), all(lengths(cmd_args) == 1L))
    args_names <- names(cmd_args)
    stopifnot(!is.null(args_names))
    cmd_args <- as.character(cmd_args)
    ## For now we only put quotes around arguments that contain a space.
    ## Should we preventively quote all arguments, just in case?
    quoteme_idx <- grep(" ", cmd_args, fixed=TRUE)
    if (length(quoteme_idx) != 0L)
        cmd_args[quoteme_idx] <- shQuote(cmd_args[quoteme_idx])
    paste0("-", args_names, " ", cmd_args)
}

