% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/table2matrix.R
\name{table2matrix}
\alias{table2matrix}
\title{Extract tables from documents as character matrices}
% Usage synopsis of table2matrix(): apart from x, every argument is a logical switch (see the arguments section).
\usage{
table2matrix(
  x,
  unifyMatrix = FALSE,
  letter.convert = TRUE,
  greek2text = FALSE,
  correctComma = FALSE,
  replicate = FALSE,
  repNums = FALSE,
  rm.html = FALSE,
  rm.empty.row.col = FALSE,
  collapseHeader = TRUE,
  header2colnames = FALSE
)
}
\arguments{
\item{x}{A file path to a DOCX, PDF, or HTML encoded file, or text with HTML code.}

\item{unifyMatrix}{Logical. If TRUE, matrix cells are unified for better post-processing (see '?unifyMatrixContent').}

\item{letter.convert}{Logical. If TRUE, HTML- and hexadecimal-encoded letters are unified and converted to Unicode with html2unicode() and JATSdecoder::letter.convert().}

\item{greek2text}{Logical. If TRUE and 'letter.convert=TRUE', converts and unifies various Greek letters to a text-based form (e.g., 'alpha', 'beta').}

\item{correctComma}{Logical. If TRUE, commas used as decimal marks are converted to dots, and commas used as big marks are removed.}

\item{replicate}{Logical. If TRUE, the content of cells with row/col span > 1 is replicated in all connected cells; if FALSE, the value will only be placed in the first of the connected cells.}

\item{repNums}{Logical. If TRUE, cells with numbers that have row/col span > 1 are replicated in every connected cell.}

\item{rm.html}{Logical. If TRUE, all HTML tags are removed, except <sub> and <sup>, and </break> is converted to space.}

\item{rm.empty.row.col}{Logical. If TRUE, empty rows/columns are removed from output.}

\item{collapseHeader}{Logical. If TRUE, header cells are collapsed for each column if the header has 2 or more lines.}

\item{header2colnames}{Logical. If TRUE and 'collapseHeader=TRUE', the first table row is used for column names and removed from the table.}
}
\value{
A list of the detected tables, each stored as a character matrix.
}
\description{
Extracts tables from HTML, HML, XML, DOCX, or PDF files, or from plain HTML code, and returns them as a list of character matrices.
}
\examples{
## - Download example DOCX file
d<-'https://github.com/ingmarboeschen/tableParser/raw/refs/heads/main/tableExamples.docx'
docxFile<-file.path(tempdir(),"tableExamples.docx")
# DOCX is a binary (zipped) format: write it in binary mode so that the
# downloaded file is not corrupted on Windows
download.file(d,docxFile,mode="wb")

# Extract tables from example file as matrices
table2matrix(docxFile)

## - Download example HTML file
h<-'https://github.com/ingmarboeschen/tableParser/raw/refs/heads/main/tableExamples.html'
htmlFile<-file.path(tempdir(),"tableExamples.html")
download.file(h,htmlFile)

# Extract tables from example file as matrices and remove HTML tags
table2matrix(htmlFile,rm.html=TRUE)

## - Download example PDF file
p<-'https://github.com/ingmarboeschen/tableParser/raw/refs/heads/main/tableExamples.pdf'
pdfFile<-file.path(tempdir(),"tableExamples.pdf")
# PDF is a binary format as well: request binary mode explicitly
download.file(p,pdfFile,mode="wb")

# Extract tables from example file as matrices
\donttest{
table2matrix(pdfFile)

# Note: The extraction of tables within PDF documents with tabulapdf::extract_tables()
# does not work properly here.
# Also, the table captions and footnotes cannot be used for decoding (e.g., p-values).

tabulapdf::extract_tables(pdfFile)
}

## Another example with a website that contains simple and nested HTML-tables

# read the HTML source of the page as lines of text
x<-readLines("https://en.wikipedia.org/wiki/R_(programming_language)",warn=FALSE)

# apply function to the HTML code
table2matrix(x,rm.html=TRUE,unifyMatrix=TRUE)
}
