% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/supervised_scores.R
\name{supervised_substitutions}
\alias{supervised_substitutions}
\alias{supervised_metrics}
\title{Load Semi-Supervised Model Predictions for Substitutions in 217 Assays}
\usage{
supervised_substitutions(
  metadata = FALSE,
  fold_scheme = c("contiguous", "modulo", "random")
)

supervised_metrics(metadata = FALSE)
}
\arguments{
\item{metadata}{Logical, whether only experiment metadata should be returned.
Default behavior is to return processed data with metadata included.}

\item{fold_scheme}{Character, which validation folding scheme to load.
Options include: "contiguous", "modulo", or "random". Default behavior loads
"contiguous". For more information about the different folding schemes,
refer to the original publication.}
}
\value{
\code{supervised_substitutions()} returns a \code{\link[=list]{list()}}
object of 217 individual assays.

\code{supervised_metrics()} returns a
\code{\link[=data.frame]{data.frame()}} with 7 columns.
}
\description{
Load Semi-Supervised Model Predictions for Substitutions in 217 Assays

Load Semi-Supervised Model Summary Metrics
}
\details{
\code{supervised_substitutions()} loads prediction scores output by
semi-supervised models run on the 217 DMS substitution assays.

For raw model predictions, each assay includes 18 columns:
\describe{
\item{\code{UniProt_id}:}{Character, UniProt accession identifier.}
\item{\code{DMS_id}:}{Character, ProteinGym assay identifier.}
\item{\code{mutant}:}{Character, set of substitutions to apply on the
reference sequence to obtain the mutated sequence (e.g., A1P:D2N implies
the amino acid 'A' at position 1 should be replaced by 'P', and 'D' at
position 2 should be replaced by 'N').}
\item{\code{mutated_sequence}:}{Character, full amino acid sequence for the
mutated protein.}
\item{\code{DMS_score}:}{Numeric, experimental measurement in the DMS assay.
Higher values indicate higher fitness of the mutated protein.}
\item{\code{DMS_score_bin}:}{Factor, indicates whether the DMS_score is
above the fitness cutoff (1 is fit, 0 is not fit).}
\item{\code{Columns 7:18}:}{Prediction scores from the semi-supervised
models, one column per model, named after the respective model.}
}

\code{supervised_metrics()} loads model performance summary metrics
("Spearman" and "MSE") for the semi-supervised models in ProteinGymR run on
the 217 DMS substitution assays.

A metric summary table with 7 columns:
\describe{
\item{\code{UniProt_id}:}{Character, UniProt accession identifier.}
\item{\code{DMS_id}:}{Character, ProteinGym assay identifier.}
\item{\code{mutant}:}{Character, set of substitutions to apply on the
reference sequence to obtain the mutated sequence (e.g., A1P:D2N implies
the amino acid 'A' at position 1 should be replaced by 'P', and 'D' at
position 2 should be replaced by 'N').}
\item{\code{model_name}:}{Character, semi-supervised model used.}
\item{\code{fold_variable_name}:}{Character, the folding scheme used.}
\item{\code{Spearman}:}{Numeric, Spearman performance metric.}
\item{\code{MSE}:}{Numeric, mean squared error (MSE) performance metric.}
}
}
\examples{
data <- supervised_substitutions()
data_random <- supervised_substitutions(fold_scheme = "random")
meta <- supervised_substitutions(metadata = TRUE)

data <- supervised_metrics()
meta <- supervised_metrics(metadata = TRUE)

}
\references{
Notin, P., Kollasch, A., Ritter, D., van Niekerk, L., Paul, S., Spinner, H.,
Rollins, N., Shaw, A., Orenbuch, R., Weitzman, R., Frazer, J., Dias, M.,
Franceschi, D., Gal, Y., & Marks, D. (2023). ProteinGym: Large-Scale
Benchmarks for Protein Fitness Prediction and Design. In A. Oh, T. Neumann,
A. Globerson, K. Saenko, M. Hardt, & S. Levine (Eds.), Advances in Neural
Information Processing Systems (Vol. 36, pp. 64331-64379).
Curran Associates, Inc.
}
