#' @title Compute HMRFHiC Probabilities of Assigning an Interaction to Each Component
#'
#' @description
#' This function computes the posterior probabilities of assigning genomic interactions to each of three mixture components (or states) in a Hidden Markov Random Field (HMRF) model.
#' It uses the posterior means of regression parameters obtained from MCMC simulations and combines these with user-specified distributions (zero-inflated or standard)
#' to produce probabilities for each observed interaction.
#'
#' @usage
#' compute_HMRFHiC_probabilities(data, chain_betas, iterations, dist = "ZINB")
#'
#' @param data A \code{data.frame} containing the following required columns:
#'   \itemize{
#'     \item \code{start}: Genomic start position for locus i.
#'     \item \code{end}: Genomic end position for locus j.
#'     \item \code{interactions}: Observed interaction counts between loci i and j.
#'     \item \code{GC}: GC content or related genomic feature for the interaction.
#'     \item \code{TES}: Transposable Elements data or similar regulatory measure.
#'     \item \code{ACC}: Accessibility measure or another continuous genomic covariate.
#'   }
#'   The \code{start} and \code{end} columns must correspond to the indices of interacting loci (i and j).
#'
#' @param chain_betas A list of MCMC chain results generated by the HMRFHiC model-fitting procedure. Each element should correspond to one chain and must include:
#'   \itemize{
#'     \item \code{chains}: A list of 3 MCMC chains (one for each component), each containing posterior samples of regression coefficients.
#'     \item \code{theta}: (If applicable) Posterior samples of the zero-inflation parameter \eqn{\theta}.
#'     \item \code{size}: (If applicable) Posterior samples for the overdispersion parameter used in Negative Binomial or Zero-Inflated Negative Binomial models.
#'   }
#'
#' @param iterations An integer specifying the number of total MCMC iterations. The function uses half of these as burn-in (i.e., \code{iterations/2}).
#'
#' @param dist A character string indicating the chosen distribution for modeling interaction counts. One of:
#'   \itemize{
#'     \item \code{"ZIP"}: Zero-Inflated Poisson
#'     \item \code{"NB"}: Negative Binomial
#'     \item \code{"Poisson"}: Poisson
#'     \item \code{"ZINB"}: Zero-Inflated Negative Binomial
#'   }
#'   The default is \code{"ZINB"}.
#'
#' @details
#' This function is part of the HMRFHiC pipeline that models genomic interactions (e.g., Hi-C interaction counts) using a mixture model approach.
#' The model typically considers three components (or latent states), each characterized by a distinct mean-structure and potentially different
#' zero-inflation or overdispersion parameters, depending on the chosen distribution.
#'
#' The function:
#' \enumerate{
#'   \item Extracts posterior means of regression parameters from MCMC chains, discarding the initial half of the samples as burn-in.
#'   \item Estimates mean interaction intensities (\eqn{\lambda}) for each component using log-linear models with covariates: distance (|end - start|), GC, TES, and ACC, each transformed by a log(\eqn{x+1}) operation.
#'   \item Given the specified distribution (\code{dist}), calculates the probability (on the natural scale) of observing the recorded interaction count for each component.
#'   \item Normalizes these probabilities so that each interaction is assigned a set of three probabilities summing to 1.
#' }
#'
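#' For zero-inflated distributions (\code{"ZIP"}, \code{"ZINB"}), a \eqn{\theta} parameter captures the probability of an excess zero.
#' Zero inflation is applied to the first component only; the second and third components use the corresponding
#' non-inflated distribution (Poisson for \code{"ZIP"}, Negative Binomial for \code{"ZINB"}).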
#' For negative binomial distributions (\code{"NB"}, \code{"ZINB"}), an overdispersion parameter is included.
#'
#' The computed probabilities can be used for downstream analysis, such as segmenting interactions into classes or modeling spatial dependence in a hidden Markov field.
#'
#' @return
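#' A \code{data.frame} containing the six required input columns (\code{start}, \code{end}, \code{interactions}, \code{GC}, \code{TES}, \code{ACC}; any other columns of \code{data} are not carried over) plus three new columns: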
#' \itemize{
#'   \item \code{prob1}: The posterior probability that the interaction belongs to component 1.
#'   \item \code{prob2}: The posterior probability that the interaction belongs to component 2.
#'   \item \code{prob3}: The posterior probability that the interaction belongs to component 3.
#' }
#'
#' The returned data frame thus provides a probabilistic classification of each observed interaction into one of the three modeled components.
#'
#' @examples
#' #
#' #
#' # Synthetic data
#' set.seed(123)
#' 
#' 
#' large_data <- data.frame(
#'   start = c(1, 10, 20),
#'   end = c(5, 15, 30),
#'   interactions = c(10, 20, 30),
#'   GC = c(0.5, 0.8, 0.3),
#'   TES = c(0.2, 0.5, 0.7),
#'   ACC = c(0.9, 0.4, 0.6)
#' )
#'
#' chain_betas <- list(
#'   list(
#'     chains = list(
#'       matrix(runif(25, 0.1, 1), ncol = 5),
#'       matrix(runif(25, 0.1, 1), ncol = 5),
#'       matrix(runif(25, 0.1, 1), ncol = 5)
#'     ),
#'     theta = runif(5, 0.1, 0.9),
#'     size = matrix(runif(15, 1, 10), nrow = 3)
#'   )
#' )
#'
#' result <- compute_HMRFHiC_probabilities(
#'   data = large_data,
#'   chain_betas = chain_betas,
#'   iterations = 5,
#'   dist = "Poisson"
#' )
#' print(result)
#' # See vignette("HMRFHiC_vignette") for detailed examples with real Hi-C data.
#' #
#' #
#'
#' @seealso
#' \code{\link{dpois}}, \code{\link{dnbinom}}, for probability calculations.
#'
#' @export
#'
compute_HMRFHiC_probabilities <- function(data, chain_betas, iterations, dist = "ZINB") {
  # Computes, for every interaction in `data`, the normalised probability of
  # belonging to each of the three mixture components, using posterior-mean
  # regression coefficients (and, where relevant, theta / size) taken from
  # the MCMC output in `chain_betas`.

  # ---- Input validation ----------------------------------------------------
  .check_required_columns(data)
  dist <- match.arg(dist, c("ZIP", "NB", "Poisson", "ZINB"))

  if (!is.numeric(iterations) || length(iterations) != 1L ||
        is.na(iterations) || iterations < 1) {
    stop("'iterations' must be a single number >= 1", call. = FALSE)
  }
  if (!is.list(chain_betas) || length(chain_betas) == 0L) {
    stop("'chain_betas' must be a non-empty list of chains", call. = FALSE)
  }

  uses_theta <- dist %in% c("ZIP", "ZINB")
  uses_size <- dist %in% c("NB", "ZINB")

  # A missing element would otherwise silently become NA (mean(NULL)).
  chain_lacks <- function(field) {
    any(vapply(chain_betas, function(chain) is.null(chain[[field]]), logical(1)))
  }
  if (uses_theta && chain_lacks("theta")) {
    stop(dist, " requires posterior samples of 'theta' in every chain",
         call. = FALSE)
  }
  if (uses_size && chain_lacks("size")) {
    stop(dist, " requires posterior samples of 'size' in every chain",
         call. = FALSE)
  }

  # ---- Covariates ----------------------------------------------------------
  # Only the six required columns are kept; all are used as numeric values.
  mydata <- data.frame(
    start = as.numeric(data$start),
    end = as.numeric(data$end),
    interactions = as.numeric(data$interactions),
    GC = as.numeric(data$GC),
    TES = as.numeric(data$TES),
    ACC = as.numeric(data$ACC)
  )

  interactions <- mydata$interactions
  distance <- log(abs(mydata$end - mydata$start) + 1)
  GC <- log(mydata$GC + 1)
  TES <- log(mydata$TES + 1)
  ACC <- log(mydata$ACC + 1)

  # ---- Retained (post burn-in) draws ---------------------------------------
  # Always an integer index range. For even `iterations` this is the same
  # range as before (iterations / 2 up to iterations); for odd `iterations`
  # the old fractional sequence dropped the final draw, which is now kept.
  first_kept <- max(1, floor(iterations / 2))
  keep <- seq.int(first_kept, floor(iterations))

  # ---- Posterior means -----------------------------------------------------
  # Mean of coefficient column `k` for each of the 3 components: averaged
  # over retained draws within a chain, then across chains.
  coef_mean <- function(k) {
    per_chain <- vapply(chain_betas, function(chain) {
      vapply(
        seq_len(3),
        function(j) mean(chain[["chains"]][[j]][keep, k]),
        numeric(1)
      )
    }, numeric(3))
    rowMeans(per_chain)
  }
  # 3 x 5 matrix: rows = components; columns = intercept, distance, GC,
  # TES, ACC coefficients.
  beta <- vapply(seq_len(5), coef_mean, numeric(3))

  theta <- NULL
  if (uses_theta) {
    theta <- mean(vapply(
      chain_betas,
      function(chain) mean(chain[["theta"]][keep]),
      numeric(1)
    ))
  }

  size <- NULL
  if (uses_size) {
    # `size` is indexed as [component, draw].
    size <- rowMeans(vapply(chain_betas, function(chain) {
      vapply(
        seq_len(3),
        function(j) mean(chain[["size"]][j, keep]),
        numeric(1)
      )
    }, numeric(3)))
  }

  # ---- Component means (log-linear model) ----------------------------------
  component_mean <- function(j) {
    exp(
      beta[j, 1] + (beta[j, 2] * distance) + (beta[j, 3] * GC) +
        (beta[j, 4] * TES) + (beta[j, 5] * ACC)
    )
  }

  # Cap interaction values at 500 (kept from the original implementation).
  interactions_cap <- pmin(interactions, 500)

  # Log-probability of the counts `y` under one component.
  log_density <- function(y, lambda, family, theta = NULL, overdisp = NULL) {
    switch(family,
      Poisson = dpois(y, lambda = lambda, log = TRUE),
      NB = dnbinom(y, size = overdisp, mu = lambda, log = TRUE),
      ZIP = ifelse(
        y == 0,
        log(theta + (1 - theta) * exp(-lambda)),
        log(1 - theta) + dpois(y, lambda = lambda, log = TRUE)
      ),
      # The zero mass of a ZINB mixes theta with the negative binomial
      # probability of zero, not the Poisson one.
      ZINB = ifelse(
        y == 0,
        log(theta + (1 - theta) * dnbinom(0, size = overdisp, mu = lambda)),
        log(1 - theta) + dnbinom(y, size = overdisp, mu = lambda, log = TRUE)
      ),
      stop("Invalid distribution specified.")
    )
  }

  # Component 1 uses `dist` itself; components 2 and 3 use its non-inflated
  # counterpart.
  dist_comp2_and_3 <- if (dist %in% c("Poisson", "ZIP")) "Poisson" else "NB"

  log_p1 <- log_density(
    interactions_cap, component_mean(1), dist,
    theta = theta, overdisp = size[1]
  )
  log_p2 <- log_density(
    interactions_cap, component_mean(2), dist_comp2_and_3,
    overdisp = size[2]
  )
  log_p3 <- log_density(
    interactions_cap, component_mean(3), dist_comp2_and_3,
    overdisp = size[3]
  )

  # ---- Normalise -----------------------------------------------------------
  # Subtract the row-wise maximum before exponentiating so that very small
  # likelihoods do not all underflow to 0 (which would give 0 / 0 = NaN).
  log_max <- pmax(log_p1, log_p2, log_p3)
  w1 <- exp(log_p1 - log_max)
  w2 <- exp(log_p2 - log_max)
  w3 <- exp(log_p3 - log_max)
  w_sum <- w1 + w2 + w3

  mydata$prob1 <- as.vector(w1 / w_sum)
  mydata$prob2 <- as.vector(w2 / w_sum)
  mydata$prob3 <- as.vector(w3 / w_sum)

  mydata
}
