% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sarks.R
\name{clusterKmers}
\alias{clusterKmers}
\title{Cluster k-mers}
\usage{
clusterKmers(kmers, k = 4, nClusters = NULL, maxClusters = NULL,
    directional = TRUE)
}
\arguments{
\item{kmers}{character vector or XStringSet of k-mers to partition
into clusters}

\item{k}{length of sub-k-mers (default k=4 to use tetramers) with
which to calculate Jaccard distances for clustering}

\item{nClusters}{number of clusters to partition kmers into; if set
to NULL (default value), selects number of clusters to maximize
the average silhouette score
(\url{https://en.wikipedia.org/wiki/Silhouette_(clustering)}).}

\item{maxClusters}{if nClusters is not specified, optionally sets the
maximum number of clusters allowed in silhouette score
optimization.}

\item{directional}{logical value: if FALSE, considers each kmer as
equivalent to its reverse-complement. Makes sense only if
applying to DNA sequences!}
}
\value{
list of character vectors (or XStringSet objects as per the
    class of kmers argument) partitioning kmers into clusters: the
    i-th element of the output list contains the elements from
    kmers assigned to cluster i.
}
\description{
Takes a set of k-mer sequences and returns a list partitioning
the input k-mers into clusters of more similar k-mers. Hierarchical
clustering (average linkage) is performed based on the Jaccard
coefficient distance metric, applied by treating each k-mer as the
set of all sub-k-mers of length k (tetramers by default) which can
be found as substrings within it.
}
\examples{
kmers <- c(
    'CAGCCTGG', 'CCTGGAA', 'CAGCCTG', 'CCTGGAAC', 'CTGGAACT',
    'ACCTGC', 'CACCTGC', 'TGGCCTG', 'CACCTG', 'TCCAGC',
    'CTGGAAC', 'CACCTGG', 'CTGGTCTA', 'GTCCTG', 'CTGGAAG', 'TTCCAGC'
)
clusterKmers(kmers, directional=FALSE)

}
