% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/stat-dens1d-labels.r
\name{stat_dens1d_labels}
\alias{stat_dens1d_labels}
\title{Replace labels in data based on 1D density}
\usage{
stat_dens1d_labels(
  mapping = NULL,
  data = NULL,
  geom = "text",
  position = "identity",
  ...,
  keep.fraction = 0.1,
  keep.number = Inf,
  keep.sparse = TRUE,
  keep.these = FALSE,
  exclude.these = FALSE,
  these.target = "label",
  pool.along = c("x", "none"),
  xintercept = 0,
  invert.selection = FALSE,
  bw = "SJ",
  kernel = "gaussian",
  adjust = 1,
  n = 512,
  orientation = c("x", "y"),
  label.fill = "",
  return.density = FALSE,
  na.rm = TRUE,
  show.legend = FALSE,
  inherit.aes = TRUE
)
}
\arguments{
\item{mapping}{The aesthetic mapping, usually constructed with
\code{\link[ggplot2]{aes}} or \code{\link[ggplot2]{aes_}}. Only needs to be
set at the layer level if you are overriding the plot defaults.}

\item{data}{A layer specific dataset - only needed if you want to override
the plot defaults.}

\item{geom}{The geometric object to use to display the data.}

\item{position}{The position adjustment to use for overlapping points on this
layer.}

\item{...}{other arguments passed on to \code{\link[ggplot2]{layer}}. This
can include aesthetics whose values you want to set, not map. See
\code{\link[ggplot2]{layer}} for more details.}

\item{keep.fraction}{numeric vector of length 1 or 2 [0..1]. The fraction of
the observations (or rows) in \code{data} to be retained.}

\item{keep.number}{integer vector of length 1 or 2. Set the maximum number of
observations to retain, effective only if obeying \code{keep.fraction}
would result in a larger number.}

\item{keep.sparse}{logical If \code{TRUE}, the default, observations from the
more sparse regions are retained, if \code{FALSE} those from the densest
regions.}

\item{keep.these, exclude.these}{character vector, integer vector, logical
vector or function that takes one or more variables in data selected by
\code{these.target}. Negative integers behave as in R's extraction methods.
The rows from \code{data} indicated by \code{keep.these} and
\code{exclude.these} are kept or excluded irrespective of the local
density.}

\item{these.target}{character, numeric or logical selecting one or more
column(s) of \code{data}. If \code{TRUE} the whole \code{data} object is
passed.}

\item{pool.along}{character, one of \code{"none"} or \code{"x"},
indicating if selection should be done pooling the observations along the
\emph{x} aesthetic, or separately on either side of \code{xintercept}.}

\item{xintercept}{numeric The split point for the data filtering.}

\item{invert.selection}{logical If \code{TRUE}, the complement of the
selected rows are returned.}

\item{bw}{numeric or character The smoothing bandwidth to be used. If
numeric, the standard deviation of the smoothing kernel. If character, a
rule to choose the bandwidth, as listed in \code{\link[stats]{bw.nrd}}.}

\item{kernel}{character See \code{\link{density}} for details.}

\item{adjust}{numeric A multiplicative bandwidth adjustment. This makes it
possible to adjust the bandwidth while still using a bandwidth
estimator through an argument passed to \code{bw}. The larger the value
passed to \code{adjust} the stronger the smoothing, hence decreasing
sensitivity to local changes in density.}

\item{n}{numeric Number of equally spaced points at which the density is to
be estimated for applying the cut point. See \code{\link{density}} for
details.}

\item{orientation}{character The aesthetic along which density is computed.
Given explicitly by setting orientation to either "x" or "y".}

\item{label.fill}{character vector of length 1 or a function.}

\item{return.density}{logical vector of length 1. If \code{TRUE} add columns
\code{"density"} and \code{"keep.obs"} to the returned data frame.}

\item{na.rm}{a logical value indicating whether NA values should be stripped
before the computation proceeds.}

\item{show.legend}{logical. Should this layer be included in the legends?
\code{NA} includes if any aesthetics are mapped. \code{FALSE}, the default,
never includes, and \code{TRUE} always includes.}

\item{inherit.aes}{If \code{FALSE}, overrides the default aesthetics, rather
than combining with them. This is most useful for helper functions that
define both data and aesthetics and shouldn't inherit behaviour from the
default plot specification, e.g. \code{\link[ggplot2]{borders}}.}
}
\value{
A plot layer instance. Using as output \code{data} the input
  \code{data} after value substitution based on a 1D filtering criterion.
}
\description{
\code{stat_dens1d_labels()} sets values mapped to the
  \code{label} aesthetic to \code{""} or a user provided character string
  based on the local density in regions of a plot panel. Its main use is
  together with repulsive geoms from package \code{\link[ggrepel]{ggrepel}}
  to restrict labeling to the low density tails of a distribution. By default
  the data are handled all together, but it is also possible to control
  labeling separately in each tail.

  If there is no mapping to \code{label} in \code{data}, the mapping is set
  to \code{rownames(data)}, with a message.
}
\details{
\code{stat_dens1d_labels()} is designed to work together with
  geometries from package 'ggrepel'. To avoid text labels being plotted over
  unlabelled points the corresponding rows in data need to be retained but
  labels replaced with the empty character string, \code{""}. Function
  \code{\link{stat_dens1d_filter}} cannot be used with the repulsive geoms
  from 'ggrepel' because it drops the observations.

  \code{stat_dens1d_labels()} can be useful also in other situations, as the
  substitution character string can be set by the user by passing an argument
  to \code{label.fill}. If this argument is \code{NULL} the unselected rows
  are filtered out.

  The local density of observations along \emph{x} or \emph{y} is computed
  with function \code{\link[stats]{density}} and used to select observations,
  passing to the geom all the rows in its \code{data} input but with the
  text of labels replaced in those "not kept". The default is to select
  observations in sparse regions of the plot, but the selection can be
  inverted so that only observations in the densest regions are returned.
  Specific observations can be protected from having the label replaced by
  passing a suitable argument to \code{keep.these}. Logical and integer
  vectors function as indexes to rows in \code{data}, while a character
  vector is compared to values in the variable mapped to the \code{label}
  aesthetic. A function passed as argument to \code{keep.these} will receive as
  argument the values in the variable mapped to \code{label} and should
  return a character, logical or numeric vector as described above.

  How many labels are retained intact in addition to those in
  \code{keep.these} is controlled with arguments passed to \code{keep.number}
  and \code{keep.fraction}. \code{keep.number} sets the maximum number of
  observations selected; whenever \code{keep.fraction} results in fewer
  observations selected, \code{keep.fraction} is obeyed. If \code{xintercept}
  is a finite value within the \emph{x} range of the data and
  \code{pool.along} is passed \code{"none"}, the data are split into two
  groups and \code{keep.number} and \code{keep.fraction} are applied
  separately to each tail, with density still computed jointly from all
  observations. If the length of \code{keep.number} and \code{keep.fraction}
  is one, half this value is used for each tail; if their length is two, the
  first value is used for the left tail and the second value for the right
  tail (or if using \code{orientation = "y"} the lower and upper tails,
  respectively).

  Computation of density and of the default bandwidth require at least
  two observations with different values. If data do not fulfill this
  condition, they are kept only if \code{keep.fraction = 1}. This is correct
  behavior for a single observation, but can be surprising in the case of
  multiple observations.

  Parameters \code{keep.these} and \code{exclude.these} make it possible to
  force inclusion or exclusion of labels after the density is computed.
  In case of conflict, \code{exclude.these} overrides \code{keep.these}.
}
\note{
Which points are kept and which not depends on how dense and flexible
  the density curve estimate is. This depends on the values passed as
  arguments to parameters \code{n}, \code{bw} and \code{kernel}. It is
  also important to be aware that both \code{geom_text()} and
  \code{geom_text_repel()} can avoid overplotting by discarding labels at
  the plot rendering stage, i.e., what is plotted may differ from what is
  returned by this statistic.
}
\examples{

# Helper returning a random string of len lowercase letters.
random_string <-
  function(len = 6) {
    paste(sample(letters, len, replace = TRUE), collapse = "")
  }

# Make random data.
set.seed(1005)
d <- tibble::tibble(
  x = rnorm(100),
  y = rnorm(100),
  group = rep(c("A", "B"), c(50, 50)),
  lab = replicate(100, { random_string() })
)

# using defaults
ggplot(data = d, aes(x, y, label = lab)) +
  geom_point() +
  stat_dens1d_labels()

# the remaining examples use repulsive geoms, so they are run only if
# package ggrepel is installed
ggrepel.installed <- requireNamespace("ggrepel", quietly = TRUE)
if (ggrepel.installed) {
  library(ggrepel)

# using defaults
  ggplot(data = d, aes(x, y, label = lab)) +
    geom_point() +
    stat_dens1d_labels(geom = "text_repel")

# if no mapping to label is found, it is set to the row names
  ggplot(data = d, aes(x, y)) +
    geom_point() +
    stat_dens1d_labels(geom = "text_repel")

# selecting separately on each side of xintercept instead of pooling
  ggplot(data = d, aes(x, y)) +
    geom_point() +
    stat_dens1d_labels(geom = "text_repel", pool.along = "none")

# keep.number of length two: first value for the left tail, second value
# for the right tail
  ggplot(data = d, aes(x, y)) +
    geom_point() +
    stat_dens1d_labels(geom = "text_repel",
                       keep.number = c(0, 10), pool.along = "none")

# keep.fraction of length two: first value for the left tail, second value
# for the right tail
  ggplot(data = d, aes(x, y)) +
    geom_point() +
    stat_dens1d_labels(geom = "text_repel",
                       keep.fraction = c(0, 0.2), pool.along = "none")

# using defaults, along y-axis
  ggplot(data = d, aes(x, y, label = lab)) +
    geom_point() +
    stat_dens1d_labels(orientation = "y", geom = "text_repel")

# example labelling with coordinates
  ggplot(data = d, aes(x, y, label = sprintf("x = \%.2f\ny = \%.2f", x, y))) +
    geom_point() +
    stat_dens1d_filter(colour = "red") +
    stat_dens1d_labels(geom = "text_repel", colour = "red", size = 3)

# with variable group mapped to the colour aesthetic
  ggplot(data = d, aes(x, y, label = lab, colour = group)) +
    geom_point() +
    stat_dens1d_labels(geom = "text_repel")

# passing NA to label.fill instead of the default empty string
  ggplot(data = d, aes(x, y, label = lab, colour = group)) +
    geom_point() +
    stat_dens1d_labels(geom = "text_repel", label.fill = NA)

# we keep labels starting with "a" across the whole plot, but all labels in
# sparse regions. To achieve this we pass as argument to label.fill a function
# instead of a character string.
  label.fun <- function(x) {ifelse(grepl("^a", x), x, "")}
  ggplot(data = d, aes(x, y, label = lab, colour = group)) +
    geom_point() +
    stat_dens1d_labels(geom = "text_repel", label.fill = label.fun)
}

# Using geom_debug_group() we can see that all 100 rows in \code{d} are
# returned.
gginnards.installed <- requireNamespace("gginnards", quietly = TRUE)
if (gginnards.installed) {
  library(gginnards)

  ggplot(data = d, aes(x, y, label = lab)) +
    geom_point() +
    stat_dens1d_labels(geom = "debug_group")
}
# with return.density = TRUE the columns density and keep.obs are added to
# the returned data
if (gginnards.installed) {
  ggplot(data = d, aes(x, y, label = lab)) +
    geom_point() +
    stat_dens1d_labels(geom = "debug_group", return.density = TRUE)
}

}
\seealso{
\code{\link[stats]{density}} used internally.

Other statistics returning a subset of data: 
\code{\link{stat_dens1d_filter}()},
\code{\link{stat_dens2d_filter}()},
\code{\link{stat_dens2d_labels}()}
}
\concept{statistics returning a subset of data}
