% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/processStudy.R
\encoding{UTF-8}
\name{computeAncestryFromSyntheticFile}
\alias{computeAncestryFromSyntheticFile}
\title{Select the optimal K and D parameters using the synthetic data and
infer the ancestry of a specific profile}
\usage{
computeAncestryFromSyntheticFile(
  gdsReference,
  gdsProfile,
  listFiles,
  currentProfile,
  spRef,
  studyIDSyn,
  np = 1L,
  listCatPop = c("EAS", "EUR", "AFR", "AMR", "SAS"),
  fieldPopIn1KG = "superPop",
  fieldPopInfAnc = "SuperPop",
  kList = seq(2, 15, 1),
  pcaList = seq(2, 15, 1),
  algorithm = c("exact", "randomized"),
  eigenCount = 32L,
  missingRate = NaN,
  verbose = FALSE
)
}
\arguments{
\item{gdsReference}{an object of class \code{\link[gdsfmt]{gds.class}} (a
GDS file), the opened 1KG GDS file.}

\item{gdsProfile}{an object of class \code{\link[gdsfmt]{gds.class}}
(a GDS file), the opened Profile GDS file.}

\item{listFiles}{a \code{vector} of \code{character} strings representing
the name of files that contain the results of ancestry inference done on
the synthetic profiles for multiple values of \emph{D} and \emph{K}. The files must
exist.}

\item{currentProfile}{a \code{character} string representing the profile
identifier of the current profile on which ancestry will be inferred.}

\item{spRef}{a \code{vector} of \code{character} strings representing the
known super population ancestry for the 1KG profiles. The 1KG profile
identifiers are used as names for the \code{vector}.}

\item{studyIDSyn}{a \code{character} string corresponding to the study
identifier. The study identifier must be present in the GDS Sample file.}

\item{np}{a single positive \code{integer} representing the number of
threads. Default: \code{1L}.}

\item{listCatPop}{a \code{vector} of \code{character} strings
representing the list of possible ancestry assignations. Default:
\code{c("EAS", "EUR", "AFR", "AMR", "SAS")}.}

\item{fieldPopIn1KG}{a \code{character} string representing the name of the
column that contains the known ancestry for the reference profiles in
the Reference GDS file. Default: \code{"superPop"}.}

\item{fieldPopInfAnc}{a \code{character} string representing the name of
the column that will contain the inferred ancestry for the specified
profiles. Default: \code{"SuperPop"}.}

\item{kList}{a \code{vector} of \code{integer} representing the list of
values tested for the \emph{K} parameter. The \emph{K} parameter represents the
number of neighbors used in the K-nearest neighbor analysis. If \code{NULL},
the value \code{seq(2,15,1)} is assigned.
Default: \code{seq(2,15,1)}.}

\item{pcaList}{a \code{vector} of \code{integer} representing the list of
values tested for the \emph{D} parameter. The \emph{D} parameter represents the
number of dimensions used in the PCA analysis. If \code{NULL},
the value \code{seq(2,15,1)} is assigned.
Default: \code{seq(2,15,1)}.}

\item{algorithm}{a \code{character} string representing the algorithm used
to calculate the PCA. The 2 choices are \code{"exact"} (traditional exact
calculation) and \code{"randomized"} (fast PCA with randomized algorithm
introduced in Galinsky et al. 2016). Default: \code{"exact"}.}

\item{eigenCount}{a single \code{integer} indicating the number of
eigenvectors that will be in the output of the
\code{\link[SNPRelate]{snpgdsPCA}} function; if \code{eigenCount} <= 0, then
all eigenvectors are returned. Default: \code{32L}.}

\item{missingRate}{a \code{numeric} value representing the threshold
missing rate at which the SNVs are discarded; only the SNVs with a missing
rate "<= missingRate" are retained in the
\link[SNPRelate]{snpgdsPCA} function; if \code{NaN}, no missing threshold
is applied. Default: \code{NaN}.}

\item{verbose}{a \code{logical} indicating if messages should be printed
to show the progress of the different steps in the function.
Default: \code{FALSE}.}
}
\value{
a \code{list} containing 4 entries:
\describe{
\item{\code{pcaSample}}{ a \code{list} containing the information related
to the eigenvectors. The \code{list} contains those 3 entries:
\describe{
\item{\code{sample.id}}{ a \code{character} string representing the unique
identifier of the current profile.}
\item{\code{eigenvector.ref}}{ a \code{matrix} of \code{numeric} containing
the eigenvectors for the reference profiles.}
\item{\code{eigenvector}}{ a \code{matrix} of \code{numeric} containing the
eigenvectors for the current profile projected on the PCA from the
reference profiles.}
}
}
\item{\code{paraSample}}{ a \code{list} containing the results with
different \code{D} and \code{K} values that lead to optimal parameter
selection. The \code{list} contains those entries:
\describe{
\item{\code{dfPCA}}{ a \code{data.frame} containing statistical results
on all combined synthetic results done with a fixed value of \code{D} (the
number of dimensions). The \code{data.frame} contains those columns:
\describe{
\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
number of dimensions).}
\item{\code{median}}{ a \code{numeric} representing the median of the
minimum AUROC obtained (within super populations) for all combinations of
the fixed \code{D} value and all tested \code{K} values. }
\item{\code{mad}}{ a \code{numeric} representing the MAD of the minimum
AUROC obtained (within super populations) for all combinations of the fixed
\code{D} value and all tested \code{K} values. }
\item{\code{upQuartile}}{ a \code{numeric} representing the upper quartile
of the minimum AUROC obtained (within super populations) for all
combinations of the fixed \code{D} value and all tested \code{K} values. }
\item{\code{k}}{ a \code{numeric} representing the optimal \code{K} value
(the number of neighbors) for a fixed \code{D} value. }
}
}
\item{\code{dfPop}}{ a \code{data.frame} containing statistical results on
all combined synthetic results done with different values of \code{D} (the
number of dimensions) and \code{K} (the number of neighbors).
The \code{data.frame} contains those columns:
\describe{
\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
number of dimensions).}
\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
number of neighbors).}
\item{\code{AUROC.min}}{ a \code{numeric} representing the minimum AUROC
obtained by grouping all the synthetic results by super-populations, for
the specified values of \code{D} and \code{K}.}
\item{\code{AUROC}}{ a \code{numeric} representing the AUROC obtained
by grouping all the synthetic results for the specified values of \code{D}
and \code{K}.}
\item{\code{Accu.CM}}{ a \code{numeric} representing the value of accuracy
of the confusion matrix obtained by grouping all the synthetic results for
the specified values of \code{D} and \code{K}.}
}
}
\item{\code{dfAUROC}}{ a \code{data.frame} containing the summary of the
results by super-population. The \code{data.frame} contains
those columns:
\describe{
\item{\code{pcaD}}{ a \code{numeric} representing the value of \code{D} (the
number of dimensions).}
\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
number of neighbors).}
\item{\code{Call}}{ a \code{character} string representing the
super-population.}
\item{\code{L}}{ a \code{numeric} representing the lower bound of the 95\%
confidence interval for the AUROC obtained for the fixed values of
super-population, \code{D} and \code{K}.}
\item{\code{AUR}}{ a \code{numeric} representing the AUROC obtained for the
fixed values of super-population, \code{D} and \code{K}.}
\item{\code{H}}{ a \code{numeric} representing the upper bound of the 95\%
confidence interval for the AUROC obtained for the fixed values of
super-population, \code{D} and \code{K}.}
}
}
\item{\code{D}}{ a \code{numeric} representing the optimal \code{D} value
(the number of dimensions) for the specific profile.}
\item{\code{K}}{ a \code{numeric} representing the optimal \code{K} value
(the number of neighbors) for the specific profile.}
\item{\code{listD}}{ a \code{numeric} representing the optimal \code{D}
values (the number of dimensions) for the specific profile. More than one
\code{D} is possible.}
}
}
\item{\code{KNNSample}}{ a \code{list} containing the inferred ancestry
using different \code{D} and \code{K} values. The \code{list} contains
those entries:
\describe{
\item{\code{sample.id}}{ a \code{character} string representing the unique
identifier of the current profile.}
\item{\code{matKNN}}{ a \code{data.frame} containing the inferred ancestry
for different values of \code{K} and \code{D}. The \code{data.frame}
contains those columns:
\describe{
\item{\code{sample.id}}{ a \code{character} string representing the unique
identifier of the current profile.}
\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
number of dimensions) used to infer the ancestry. }
\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
number of neighbors) used to infer the ancestry. }
\item{\code{SuperPop}}{ a \code{character} string representing the inferred
ancestry for the specified \code{D} and \code{K} values.}
}
}
}
}
\item{\code{Ancestry}}{ a \code{data.frame} containing the inferred
ancestry for the current profile. The \code{data.frame} contains those
columns:
\describe{
\item{\code{sample.id}}{ a \code{character} string representing the unique
identifier of the current profile.}
\item{\code{D}}{ a \code{numeric} representing the value of \code{D} (the
number of dimensions) used to infer the ancestry.}
\item{\code{K}}{ a \code{numeric} representing the value of \code{K} (the
number of neighbors) used to infer the ancestry.}
\item{\code{SuperPop}}{ a \code{character} string representing the inferred
ancestry.}
}
}
}
}
\description{
The function selects the optimal K and D parameters for a
specific profile. The results on the synthetic data are used for the
parameter selection. Once the optimal parameters are selected, the
ancestry is inferred for the specific profile.
}
\examples{


## Required library
library(gdsfmt)

## Load the known ancestry for the demo 1KG reference profiles
data(demoKnownSuperPop1KG)

## Directory containing the demo Reference GDS file
path1KG <- system.file("extdata/tests", package="RAIDS")

## Open the Reference GDS file
gdsRef <- snpgdsOpen(file.path(path1KG, "ex1_good_small_1KG.gds"))

## Directory containing the demo synthetic result files
## List the KNN result files (.rds) from the PCA run on the synthetic data
## and build their full paths
dataDirRes <- system.file("extdata/demoAncestryCall/ex1", package="RAIDS")
listFilesName <- dir(file.path(dataDirRes), ".rds")
listFiles <- file.path(file.path(dataDirRes) , listFilesName)

## The name of the synthetic study
studyID <- "MYDATA.Synthetic"

## Directory, inside this package, that contains the demo Profile GDS file
dataDir <- system.file("extdata/demoAncestryCall", package="RAIDS")

## Open the Profile GDS file
gdsProfile <- snpgdsOpen(file.path(dataDir, "ex1.gds"))

## Run the ancestry inference on one profile called 'ex1'
## The values of K and D used for the inference are selected using the
## synthetic results
resCall <- computeAncestryFromSyntheticFile(gdsReference=gdsRef,
                            gdsProfile=gdsProfile,
                            listFiles=listFiles,
                            currentProfile=c("ex1"),
                            spRef=demoKnownSuperPop1KG,
                            studyIDSyn=studyID, np=1L)

## The ancestry called with the optimal D and K values
resCall$Ancestry

## Close both GDS files (important)
closefn.gds(gdsProfile)
closefn.gds(gdsRef)


}
\references{
Galinsky KJ, Bhatia G, Loh PR, Georgiev S, Mukherjee S, Patterson NJ,
Price AL. Fast Principal-Component Analysis Reveals Convergent Evolution
of ADH1B in Europe and East Asia. Am J Hum Genet. 2016 Mar 3;98(3):456-72.
doi: 10.1016/j.ajhg.2015.12.022. Epub 2016 Feb 25.
}
\author{
Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
}
