% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ancestry.R
\name{ancestry_prediction}
\alias{ancestry_prediction}
\title{Predicting sample superpopulation ancestry}
\usage{
ancestry_prediction(
  indir,
  qcdir,
  name,
  verbose = FALSE,
  interactive = FALSE,
  path2plink2 = NULL,
  path2load_mat = NULL,
  legend_text_size = 5,
  legend_title_size = 7,
  axis_text_size = 5,
  axis_title_size = 7,
  title_size = 9,
  showPlinkOutput = TRUE,
  legend_position = "right",
  keep_individuals = NULL,
  remove_individuals = NULL,
  exclude_markers = NULL,
  extract_markers = NULL,
  plink2format = FALSE,
  var_format = FALSE,
  excludeAncestry = NULL,
  do.run_ancestry_prediction = TRUE,
  do.evaluate_ancestry_prediction = TRUE
)
}
\arguments{
\item{indir}{[character] /path/to/directory containing the basic PLINK 1.9 data
files name.bim, name.fam, name.bed.}

\item{qcdir}{[character] /path/to/directory where the plink2 format data files
as returned by plink2 --make-pgen will be saved to. User needs writing
permission to qcdir. By default, qcdir=indir.}

\item{name}{[character] Prefix of PLINK 1.9 files, i.e. name.bim, name.fam, 
name.bed}

\item{verbose}{[logical] If TRUE, progress info is printed to standard out.}

\item{interactive}{[logical] Should plots be shown interactively? When
choosing this option, make sure you have X-forwarding/graphical interface
available for interactive plotting. Alternatively, set interactive=FALSE and
save the returned plot object (p_ancestry) via ggplot2::ggsave(plot=p_ancestry,
other_arguments) or pdf(outfile) print(p_ancestry) dev.off().}

\item{path2plink2}{[character] Absolute path to PLINK 2 executable
(\url{https://www.cog-genomics.org/plink/2.0/}) i.e.
plink 2 should be accessible as path2plink2 -h. The full name of the executable
should be specified: for windows OS, this means path/plink.exe, for unix
platforms this is path/plink. If not provided, assumed that PATH set-up works
and PLINK will be found by \code{\link[sys]{exec}}('plink').}

\item{path2load_mat}{[character] /path/to/directory where loading matrices are
kept. These can be downloaded from the github repo. Note that the file name
prefix (the part before .eigenvec.allele or .acount) must be included in the
file path.}

\item{legend_text_size}{[integer] Size for legend text.}

\item{legend_title_size}{[integer] Size for legend title.}

\item{axis_text_size}{[integer] Size for axis text.}

\item{axis_title_size}{[integer] Size for axis title.}

\item{title_size}{[integer] Size for plot title.}

\item{showPlinkOutput}{[logical] If TRUE, plink log and error messages are
printed to standard out.}

\item{legend_position}{[character] Legend position for the plot.}

\item{keep_individuals}{[character] Path to file with individuals to be
retained in the analysis. The file has to be a space/tab-delimited text file
with family IDs in the first column and within-family IDs in the second
column. All samples not listed in this file will be removed from the current
analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#indiv}.
Default: NULL, i.e. no filtering on individuals.}

\item{remove_individuals}{[character] Path to file with individuals to be
removed from the analysis. The file has to be a space/tab-delimited text file
with family IDs in the first column and within-family IDs in the second
column. All samples listed in this file will be removed from the current
analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#indiv}.
Default: NULL, i.e. no filtering on individuals.}

\item{exclude_markers}{[character] Path to file with markers to be
removed from the analysis. The file has to be a text file with a list of
variant IDs (usually one per line, but it's okay for them to just be
separated by spaces). All listed variants will be removed from the current
analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#snp}.
Default: NULL, i.e. no filtering on markers.}

\item{extract_markers}{[character] Path to file with markers to be
included in the analysis. The file has to be a text file with a list of
variant IDs (usually one per line, but it's okay for them to just be
separated by spaces). All unlisted variants will be removed from the current
analysis. See \url{https://www.cog-genomics.org/plink/1.9/filter#snp}.
Default: NULL, i.e. no filtering on markers.}

\item{plink2format}{[logical] If TRUE, data is in plink2 format already and 
convert_to_plink2 will not be run}

\item{var_format}{[logical] If TRUE, variant identifiers are in correct 
format already and rename_variant_identifiers will not be run}

\item{excludeAncestry}{[character] Ancestries to be excluded (if any). Options are:
Africa, America, Central_South_Asia, East_Asia, Europe, and Middle_East. Strings 
must be spelled exactly as shown.}

\item{do.run_ancestry_prediction}{[logical] If TRUE, run
\code{\link{run_ancestry_prediction}}.}

\item{do.evaluate_ancestry_prediction}{[logical] If TRUE, run
\code{\link{evaluate_ancestry_prediction}}.}
}
\value{
Three dataframes and a visualization of the ancestral probabilities.
prediction_prob contains the sample IDs and ancestral probabilities from the model.
prediction_majority contains the sample IDs and greatest ancestry probabilities
from the model. exclude_ancestry contains the list of sample IDs with ancestries
to be excluded. p_ancestry contains a plot visualizing the ancestry probabilities
in a bar graph.
}
\description{
Predicts the ancestry of input samples using plink2. Projects the samples
onto the principal components of the reference dataset and passes the
projections to a random forest classifier to identify the ancestry.
}
\examples{
# Input directory: the extdata directory of the plinkQC package;
# output goes to a temporary directory.
indir <- system.file("extdata", package="plinkQC")
qcdir <- tempdir()
name <- "data.hg38"
# Placeholder paths: replace with the location of the plink2 executable and
# the loading matrix file prefix (the part before .eigenvec.allele/.acount).
path2plink2 <- '/path/to/plink2'
path2load_mat <- '/path/to/load_mat/merged_chrs.postQC.train.pca'
\dontrun{
# the following code is not run on package build, as the path2plink2 on the
# user system is not known.
ancestry_prediction(indir=indir, qcdir=qcdir, name=name,
                    path2plink2=path2plink2, path2load_mat=path2load_mat)
}
}
