mod commands;
mod environments;
mod metadata;
mod parser;
mod utilities;
use crate::Result;
use crate::core::config::ExtractionConfig;
use crate::plugins::{DocumentExtractor, Plugin};
use crate::types::builder::DocumentStructureBuilder;
use crate::types::document_structure::{AnnotationKind, DocumentStructure, TextAnnotation};
use crate::types::{ExtractionResult, Metadata, Table};
use async_trait::async_trait;
use std::sync::LazyLock;
use parser::LatexParser;
use utilities::{collect_environment, extract_env_name, extract_heading_title};
static HEADING_LEVELS_WITH_CHAPTERS: LazyLock<ahash::AHashMap<&'static str, u8>> = LazyLock::new(|| {
let mut m = ahash::AHashMap::with_capacity(10);
m.insert("chapter", 1);
m.insert("chapter*", 1);
m.insert("section", 2);
m.insert("section*", 2);
m.insert("subsection", 3);
m.insert("subsection*", 3);
m.insert("subsubsection", 4);
m.insert("subsubsection*", 4);
m.insert("paragraph", 5);
m.insert("paragraph*", 5);
m
});
static HEADING_LEVELS_NO_CHAPTERS: LazyLock<ahash::AHashMap<&'static str, u8>> = LazyLock::new(|| {
let mut m = ahash::AHashMap::with_capacity(8);
m.insert("section", 1);
m.insert("section*", 1);
m.insert("subsection", 2);
m.insert("subsection*", 2);
m.insert("subsubsection", 3);
m.insert("subsubsection*", 3);
m.insert("paragraph", 4);
m.insert("paragraph*", 4);
m
});
/// Extractor for LaTeX / TeX sources.
///
/// This is a stateless unit struct; all parsing helpers are associated
/// functions, and the `Plugin` / `DocumentExtractor` implementations in this
/// file expose it to the rest of the crate.
pub struct LatexExtractor;
impl LatexExtractor {
/// Creates a new extractor. The type is a unit struct, so the value carries no state.
pub fn new() -> Self {
Self
}
/// Runs [`LatexParser`] over `content` and returns its result unchanged.
///
/// The returned tuple is `(text, metadata, tables)` as produced by
/// `LatexParser::parse`.
fn extract_from_latex(content: &str) -> (String, Metadata, Vec<Table>) {
    LatexParser::new(content).parse()
}
/// Removes recognised inline formatting commands from `input`, returning the
/// plain text together with annotations describing where the formatting applied.
///
/// Commands are recognised by [`Self::try_parse_inline_command`]; their bodies
/// are processed recursively so nested commands yield nested annotations.
/// Annotation `start`/`end` values are byte offsets into the returned string.
/// Annotations of an inner command are emitted before the annotation of the
/// command enclosing it, and a command with an empty body yields no annotation.
/// A backslash that does not start a recognised command is copied through as-is.
fn strip_inline_commands(input: &str) -> (String, Vec<TextAnnotation>) {
    let mut plain = String::with_capacity(input.len());
    let mut spans = Vec::new();
    // `rest` is the not-yet-consumed tail of `input`; it always starts on a char boundary.
    let mut rest = input;

    while let Some(ch) = rest.chars().next() {
        if ch != '\\' {
            plain.push(ch);
            rest = &rest[ch.len_utf8()..];
            continue;
        }

        match Self::try_parse_inline_command(rest) {
            Some((kind, body, consumed)) => {
                let start = plain.len() as u32;
                let (body_text, body_spans) = Self::strip_inline_commands(&body);
                plain.push_str(&body_text);
                let end = plain.len() as u32;

                // Inner offsets are relative to the body; rebase them onto the full output.
                spans.extend(body_spans.into_iter().map(|mut span| {
                    span.start += start;
                    span.end += start;
                    span
                }));
                if start < end {
                    spans.push(TextAnnotation { start, end, kind });
                }
                rest = &rest[consumed..];
            }
            None => {
                // Not a command we understand: keep the backslash literally.
                plain.push('\\');
                rest = &rest[1..];
            }
        }
    }

    (plain, spans)
}
/// Tries to parse a supported inline command at the very start of `text`.
///
/// Supported commands are `\textbf`, `\emph`, `\textit`, `\underline`,
/// `\texttt` (one braced argument) and `\href` (URL argument immediately
/// followed by a braced link text).
///
/// On success returns the annotation kind, the raw (still unprocessed) body
/// text, and the number of bytes of `text` the whole command occupies.
/// Returns `None` when `text` does not start with a supported command or its
/// braces are unbalanced.
fn try_parse_inline_command(text: &str) -> Option<(AnnotationKind, String, usize)> {
    const HREF: &str = "\\href{";

    let single_argument = [
        ("\\textbf{", AnnotationKind::Bold),
        ("\\emph{", AnnotationKind::Italic),
        ("\\textit{", AnnotationKind::Italic),
        ("\\underline{", AnnotationKind::Underline),
        ("\\texttt{", AnnotationKind::Code),
    ];
    for (opener, kind) in single_argument {
        let Some(body) = text.strip_prefix(opener) else {
            continue;
        };
        if let Some((content, consumed)) = Self::read_braced_content(body) {
            return Some((kind, content, opener.len() + consumed));
        }
    }

    let after_href = text.strip_prefix(HREF)?;
    let (url, url_len) = Self::read_braced_content(after_href)?;
    // The link text must follow the URL group directly: `\href{url}{text}`.
    let label_source = after_href[url_len..].strip_prefix('{')?;
    let (label, label_len) = Self::read_braced_content(label_source)?;
    // `+ 1` accounts for the `{` that opens the link-text group.
    let total = HREF.len() + url_len + 1 + label_len;
    Some((AnnotationKind::Link { url, title: None }, label, total))
}
/// Reads the body of a brace group whose opening `{` has already been consumed.
///
/// `input` must start just after the opening brace. Nested `{…}` groups are
/// tracked so the body ends at the matching closing brace.
///
/// A backslash and the character following it are treated as one opaque pair
/// and copied verbatim, so the escaped braces `\{` and `\}` neither open nor
/// close a group (and `\\` cannot escape a following brace).
///
/// Returns the body text (without the closing brace) and the number of bytes
/// consumed from `input`, including the closing brace. Returns `None` if the
/// group is never closed.
fn read_braced_content(input: &str) -> Option<(String, usize)> {
    let mut depth: u32 = 1;
    let mut content = String::new();
    // True when the previous character was an unescaped backslash.
    let mut escaped = false;

    for (idx, ch) in input.char_indices() {
        if escaped {
            escaped = false;
            content.push(ch);
            continue;
        }
        match ch {
            '\\' => escaped = true,
            '{' => depth += 1,
            '}' => {
                depth -= 1;
                if depth == 0 {
                    return Some((content, idx + ch.len_utf8()));
                }
            }
            _ => {}
        }
        content.push(ch);
    }
    None
}
/// Extracts the file path from the first `\includegraphics` command in `line`.
///
/// An optional `[...]` option list directly after the command name is
/// skipped; the path is the trimmed text of the `{...}` group that must
/// follow immediately. Returns `None` when the command is absent, malformed,
/// or has an empty path.
fn extract_includegraphics_path(line: &str) -> Option<String> {
    const COMMAND: &str = "\\includegraphics";

    let (_, tail) = line.split_once(COMMAND)?;
    let tail = match tail.strip_prefix('[') {
        Some(options) => {
            // Skip everything up to and including the first `]`.
            let (_, after_options) = options.split_once(']')?;
            after_options
        }
        None => tail,
    };
    let (path, _) = tail.strip_prefix('{')?.split_once('}')?;
    let path = path.trim();
    (!path.is_empty()).then(|| path.to_string())
}
/// Returns the body of the first `\caption{...}` found in `content`.
///
/// Only the exact form `\caption{` is recognised (a caption with an optional
/// `[short]` argument is not matched). The body is returned raw, with nested
/// braces intact. Returns `None` if no caption is present or its braces are
/// unbalanced.
fn extract_caption(content: &str) -> Option<String> {
    const OPENER: &str = "\\caption{";

    let (_, body) = content.split_once(OPENER)?;
    let (caption, _consumed) = Self::read_braced_content(body)?;
    Some(caption)
}
/// Builds a structured document tree from LaTeX (or plain TeX) source.
///
/// The source is scanned line by line:
///
/// * `\title`, `\author` and `\date` values found anywhere in the source are
///   emitted first as one metadata block.
/// * For LaTeX, only lines between `\begin{document}` and `\end{document}` are
///   processed. Source that contains `\bye` but no `\begin{document}` is
///   treated as plain TeX: the body starts at the first line and ends at `\bye`.
/// * List, `tabular`/`table`, `figure`, math and code environments become
///   list, table, image, formula and code nodes; any other environment is
///   skipped together with its content.
/// * Sectioning commands become headings. `\chapter` shifts all other levels
///   down by one when the source uses it.
/// * `\[ ... \]` becomes a display formula, stand-alone `\includegraphics`
///   an image, and remaining text lines paragraphs with inline annotations.
///   `\footnote{...}` bodies are lifted out of paragraph text into footnote nodes.
///
/// Whole-line comments (first non-blank character is `%`) and blank lines are
/// skipped before any other check, so a commented-out `\begin{...}`,
/// `\end{document}` or `\bye` can neither open an environment nor end the scan.
fn build_document_structure(source: &str) -> DocumentStructure {
    const FOOTNOTE: &str = "\\footnote{";
    const TABULAR_BEGIN: &str = "\\begin{tabular}";
    const TABULAR_END: &str = "\\end{tabular}";

    let mut builder = DocumentStructureBuilder::new().source_format("latex");
    let lines: Vec<&str> = source.lines().collect();

    // Plain TeX has no `document` environment, so the body starts immediately.
    let is_plain_tex = source.contains("\\bye") && !source.contains("\\begin{document}");
    let mut in_document = is_plain_tex;

    let has_chapters = source.contains("\\chapter{") || source.contains("\\chapter*{");
    let heading_map = if has_chapters {
        &*HEADING_LEVELS_WITH_CHAPTERS
    } else {
        &*HEADING_LEVELS_NO_CHAPTERS
    };

    let mut metadata_entries: Vec<(String, String)> = Vec::new();
    for &cmd in &["title", "author", "date"] {
        if let Some(value) = utilities::extract_braced(source, cmd)
            && !value.is_empty()
        {
            metadata_entries.push((cmd.to_string(), value));
        }
    }
    if !metadata_entries.is_empty() {
        builder.push_metadata_block(metadata_entries, None);
    }

    let mut i = 0;
    while i < lines.len() {
        let trimmed = lines[i].trim();

        // Blank lines produce nothing, and a line starting with `%` is a comment in
        // its entirety. Skipping comments here (rather than only in the paragraph
        // branch) keeps commented-out markup from being interpreted.
        if trimmed.is_empty() || trimmed.starts_with('%') {
            i += 1;
            continue;
        }

        if is_plain_tex && trimmed.contains("\\bye") {
            break;
        }
        if !is_plain_tex && trimmed.contains("\\begin{document}") {
            in_document = true;
            i += 1;
            continue;
        }
        if !is_plain_tex && trimmed.contains("\\end{document}") {
            break;
        }
        if !in_document {
            i += 1;
            continue;
        }

        // --- Environments -------------------------------------------------------
        // Each arm consumes the whole environment and resumes at the line index
        // reported by `collect_environment`.
        if (trimmed.contains("\\begin{") || trimmed.contains("\\begin {"))
            && let Some(env_name) = extract_env_name(trimmed)
        {
            match env_name.as_str() {
                "itemize" | "enumerate" | "description" => {
                    let ordered = env_name == "enumerate";
                    let (env_content, new_i) = collect_environment(&lines, i, &env_name);
                    let list_idx = builder.push_list(ordered, None);
                    Self::build_list_items(&mut builder, &env_content, list_idx);
                    i = new_i;
                    continue;
                }
                "tabular" => {
                    let (env_content, new_i) = collect_environment(&lines, i, "tabular");
                    let cells = Self::parse_tabular_cells(&env_content);
                    if !cells.is_empty() {
                        builder.push_table_from_cells(&cells, None);
                    }
                    i = new_i;
                    continue;
                }
                "table" => {
                    let (env_content, new_i) = collect_environment(&lines, i, "table");
                    let caption = Self::extract_caption(&env_content);
                    // Look for the closing tag only *after* the opening tag so the
                    // slice below is always well-formed.
                    if let Some(start) = env_content.find(TABULAR_BEGIN)
                        && let Some(end_offset) = env_content[start..].find(TABULAR_END)
                    {
                        let end = start + end_offset + TABULAR_END.len();
                        let tabular_content = &env_content[start..end];
                        let inner_lines: Vec<&str> = tabular_content.lines().collect();
                        let (inner_content, _) = collect_environment(&inner_lines, 0, "tabular");
                        let cells = Self::parse_tabular_cells(&inner_content);
                        if !cells.is_empty() {
                            let idx = builder.push_table_from_cells(&cells, None);
                            if let Some(cap) = caption {
                                let mut attrs = ahash::AHashMap::new();
                                attrs.insert("caption".to_string(), cap);
                                builder.set_attributes(idx, attrs);
                            }
                        }
                    }
                    i = new_i;
                    continue;
                }
                "figure" => {
                    let (env_content, new_i) = collect_environment(&lines, i, "figure");
                    let caption = Self::extract_caption(&env_content);
                    if let Some(path) = Self::extract_includegraphics_path(&env_content) {
                        let idx = builder.push_image(Some(&path), None, None, None);
                        if let Some(cap) = caption {
                            let mut attrs = ahash::AHashMap::new();
                            attrs.insert("caption".to_string(), cap);
                            builder.set_attributes(idx, attrs);
                        }
                    }
                    i = new_i;
                    continue;
                }
                "equation" | "equation*" | "align" | "align*" | "gather" | "gather*" | "multline"
                | "multline*" | "eqnarray" | "eqnarray*" | "math" | "displaymath" | "flalign"
                | "flalign*" | "cases" => {
                    let (env_content, new_i) = collect_environment(&lines, i, &env_name);
                    // Re-wrap the body so the formula text keeps its environment markers.
                    let formula_text = format!("\\begin{{{}}}\n{}\\end{{{}}}", env_name, env_content, env_name);
                    builder.push_formula(&formula_text, None);
                    i = new_i;
                    continue;
                }
                "lstlisting" | "verbatim" | "minted" => {
                    let (env_content, new_i) = collect_environment(&lines, i, &env_name);
                    let language = if env_name == "lstlisting" || env_name == "minted" {
                        Self::extract_code_language(trimmed)
                    } else {
                        None
                    };
                    builder.push_code(env_content.trim(), language, None);
                    i = new_i;
                    continue;
                }
                _ => {
                    // Unknown environment: skip it, content included.
                    let (_, new_i) = collect_environment(&lines, i, &env_name);
                    i = new_i;
                    continue;
                }
            }
        }

        // --- Headings -----------------------------------------------------------
        let mut handled = false;
        if let Some(after_backslash) = trimmed.strip_prefix('\\') {
            let cmd_end = after_backslash
                .find(|c: char| c == '{' || c == '[' || c.is_whitespace())
                .unwrap_or(after_backslash.len());
            let cmd_name = &after_backslash[..cmd_end];
            if let Some(&level) = heading_map.get(cmd_name) {
                let rest = after_backslash[cmd_end..].trim_start();
                if rest.starts_with('{') || rest.starts_with('[') {
                    if let Some(title) = extract_heading_title(trimmed, cmd_name) {
                        builder.push_heading(level, &title, None, None);
                    }
                    handled = true;
                }
            }
        }

        // --- Images, display math and paragraphs --------------------------------
        if !handled {
            if trimmed.contains("\\includegraphics")
                && let Some(path) = Self::extract_includegraphics_path(trimmed)
            {
                builder.push_image(Some(&path), None, None, None);
                i += 1;
                continue;
            }

            if trimmed.starts_with("\\[") {
                let mut math_content = trimmed.to_string();
                if !trimmed.contains("\\]") {
                    // Multi-line display math: gather lines up to the one holding `\]`.
                    i += 1;
                    while i < lines.len() {
                        math_content.push('\n');
                        math_content.push_str(lines[i]);
                        if lines[i].contains("\\]") {
                            break;
                        }
                        i += 1;
                    }
                }
                // Trim first: continuation lines are appended untrimmed, and trailing
                // whitespace after `\]` would otherwise keep the delimiter in the formula.
                let formula = math_content
                    .trim()
                    .trim_start_matches("\\[")
                    .trim_end_matches("\\]")
                    .trim();
                if !formula.is_empty() {
                    builder.push_formula(formula, None);
                }
            } else if trimmed.contains('$') && !trimmed.starts_with('\\') {
                let (text, annotations) = Self::strip_inline_commands(trimmed);
                builder.push_paragraph(&text, annotations, None, None);
            } else if !trimmed.starts_with('\\')
                || trimmed.starts_with("\\textbf")
                || trimmed.starts_with("\\emph")
                || trimmed.starts_with("\\textit")
                || trimmed.starts_with("\\underline")
                || trimmed.starts_with("\\texttt")
                || trimmed.starts_with("\\href")
            {
                // Lift every `\footnote{...}` out of the line into its own node and
                // remove it from the paragraph text.
                let mut line_text = trimmed.to_string();
                while let Some(fn_start) = line_text.find(FOOTNOTE) {
                    let body_start = fn_start + FOOTNOTE.len();
                    let Some((fn_text, consumed)) = Self::read_braced_content(&line_text[body_start..]) else {
                        // Unbalanced braces: leave the remainder of the line untouched.
                        break;
                    };
                    let fn_stripped = utilities::clean_text(&fn_text);
                    if !fn_stripped.is_empty() {
                        builder.push_footnote(&fn_stripped, None);
                    }
                    line_text.replace_range(fn_start..body_start + consumed, "");
                }
                let line_text = line_text.trim();
                if !line_text.is_empty() {
                    let (text, annotations) = Self::strip_inline_commands(line_text);
                    builder.push_paragraph(&text, annotations, None, None);
                }
            }
        }
        i += 1;
    }
    builder.build()
}
/// Splits the body of a `tabular` environment into rows of trimmed cell text.
///
/// Each source line is treated as one row. Lines containing the
/// `\begin{tabular}` / `\end{tabular}` markers and blank lines are ignored,
/// row terminators (`\\`) are removed, and `\hline` rules at the start or end
/// of a line are stripped so they never leak into cell text.
///
/// Cells are separated by `&`. Empty cells are preserved so that every cell
/// stays in its original column; only rows whose cells are all empty (for
/// example a lone `\hline` or `\\`) are dropped.
fn parse_tabular_cells(content: &str) -> Vec<Vec<String>> {
    const RULE: &str = "\\hline";

    let mut rows = Vec::new();
    for line in content.lines() {
        let line = line.trim();
        if line.is_empty() || line.contains("\\begin{tabular}") || line.contains("\\end{tabular}") {
            continue;
        }

        let without_breaks = line.replace("\\\\", "");
        let mut row = without_breaks.trim();
        // Rules may be stacked (`\hline\hline`) and may sit on the same line as a row.
        while let Some(rest) = row.strip_prefix(RULE) {
            row = rest.trim_start();
        }
        while let Some(rest) = row.strip_suffix(RULE) {
            row = rest.trim_end();
        }

        let cells: Vec<String> = row.split('&').map(|cell| cell.trim().to_string()).collect();
        if cells.iter().any(|cell| !cell.is_empty()) {
            rows.push(cells);
        }
    }
    rows
}
/// Adds one list item to `list_idx` for every `\item` line in `content`.
///
/// Only lines that start with the `\item` command (after trimming) are
/// considered; other lines are ignored. An optional `[label]` directly after
/// `\item` is rendered as `label: text`. Items whose resulting text is empty
/// are not added.
///
/// Commands that merely share the prefix, such as `\itemsep` or
/// `\itemindent`, are not items and are skipped.
fn build_list_items(
    builder: &mut DocumentStructureBuilder,
    content: &str,
    list_idx: crate::types::document_structure::NodeIndex,
) {
    for line in content.lines() {
        let Some(after) = line.trim().strip_prefix("\\item") else {
            continue;
        };
        // A letter right after `\item` means a longer control word (`\itemsep`, ...).
        if after.starts_with(|c: char| c.is_ascii_alphabetic()) {
            continue;
        }

        let after = after.trim();
        let text = match after.strip_prefix('[').and_then(|labelled| labelled.split_once(']')) {
            Some((label, rest)) => format!("{}: {}", label, rest.trim()),
            // No label, or an unterminated `[`: keep the text as written.
            None => after.to_string(),
        };
        if !text.is_empty() {
            builder.push_list_item(list_idx, &text, None);
        }
    }
}
/// Extracts the programming language named on a code environment's `\begin` line.
///
/// Two forms are recognised, in this order:
///
/// 1. a `language=<name>` option, where the name ends at the first `,`, `]`
///    or `}` (or at the end of the line);
/// 2. on lines mentioning `minted`, the content of the last `{...}` group,
///    unless that group is the environment name `minted` itself.
///
/// Returns `None` when neither form yields a non-empty name.
fn extract_code_language(begin_line: &str) -> Option<&str> {
    const KEY: &str = "language=";

    let from_option = begin_line.split_once(KEY).and_then(|(_, value)| {
        let name = value.split([',', ']', '}']).next().unwrap_or(value).trim();
        (!name.is_empty()).then_some(name)
    });
    if from_option.is_some() {
        return from_option;
    }

    if !begin_line.contains("minted") {
        return None;
    }
    let open = begin_line.rfind('{')?;
    let (candidate, _) = begin_line[open + 1..].split_once('}')?;
    let candidate = candidate.trim();
    (!candidate.is_empty() && candidate != "minted").then_some(candidate)
}
}
impl Default for LatexExtractor {
fn default() -> Self {
Self::new()
}
}
impl Plugin for LatexExtractor {
    /// Returns the plugin name, `latex-extractor`.
    fn name(&self) -> &str {
        "latex-extractor"
    }

    /// Returns the crate version this extractor was compiled with.
    fn version(&self) -> String {
        String::from(env!("CARGO_PKG_VERSION"))
    }

    /// No setup is performed; always succeeds.
    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    /// No teardown is performed; always succeeds.
    fn shutdown(&self) -> Result<()> {
        Ok(())
    }

    /// Returns a one-line human-readable description of the extractor.
    fn description(&self) -> &str {
        "Native Rust LaTeX document extractor with metadata and table support"
    }

    /// Returns the author string for this plugin.
    fn author(&self) -> &str {
        "Kreuzberg Team"
    }
}
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl DocumentExtractor for LatexExtractor {
    /// Extracts text, metadata and tables from LaTeX bytes.
    ///
    /// `content` is decoded as UTF-8 with invalid sequences replaced. The
    /// structured document tree is only built when
    /// `config.include_document_structure` is set; every other optional field
    /// of the result is left empty. `mime_type` is echoed back in the result.
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        let latex = String::from_utf8_lossy(content);
        let (text, metadata, tables) = Self::extract_from_latex(&latex);
        let document = config
            .include_document_structure
            .then(|| Self::build_document_structure(&latex));

        Ok(ExtractionResult {
            content: text,
            mime_type: mime_type.to_string().into(),
            metadata,
            tables,
            detected_languages: None,
            chunks: None,
            images: None,
            djot_content: None,
            pages: None,
            elements: None,
            ocr_elements: None,
            document,
            #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
            extracted_keywords: None,
            quality_score: None,
            processing_warnings: Vec::new(),
            annotations: None,
            children: None,
        })
    }

    /// MIME types handled by this extractor.
    fn supported_mime_types(&self) -> &[&str] {
        &["application/x-latex", "text/x-tex"]
    }

    /// Returns this extractor's priority value (50).
    fn priority(&self) -> i32 {
        50
    }
}
// Unit tests for the LaTeX extractor: metadata/text extraction through the
// parser, the inline-command stripper, the small parsing helpers, and the
// document-structure builder.
#[cfg(test)]
mod tests {
use super::*;
use crate::types::NodeContent;
// `\title{...}` ends up under the `title` key of the additional metadata.
#[test]
fn test_basic_title_extraction() {
let latex = r#"\title{Hello World}"#;
let (_, metadata, _) = LatexExtractor::extract_from_latex(latex);
assert_eq!(
metadata.additional.get("title").and_then(|v| v.as_str()),
Some("Hello World")
);
}
// `\author{...}` produces an `author` metadata entry.
#[test]
fn test_author_extraction() {
let latex = r#"\author{John Doe}"#;
let (_, metadata, _) = LatexExtractor::extract_from_latex(latex);
assert!(metadata.additional.contains_key("author"));
}
// Section titles appear in the extracted plain text.
#[test]
fn test_section_extraction() {
let latex = r#"\begin{document}\section{Introduction}\end{document}"#;
let (content, _, _) = LatexExtractor::extract_from_latex(latex);
assert!(content.contains("Introduction"));
}
// `\textbf{...}` is removed from the text and reported as a Bold annotation
// whose byte range covers exactly the command body.
#[test]
fn test_strip_inline_bold() {
let (text, anns) = LatexExtractor::strip_inline_commands("hello \\textbf{world} end");
assert_eq!(text, "hello world end");
assert_eq!(anns.len(), 1);
assert!(matches!(anns[0].kind, AnnotationKind::Bold));
assert_eq!(&text[anns[0].start as usize..anns[0].end as usize], "world");
}
// Both `\emph` and `\textit` map to Italic.
#[test]
fn test_strip_inline_italic_variants() {
let (text, anns) = LatexExtractor::strip_inline_commands("\\emph{a} and \\textit{b}");
assert_eq!(text, "a and b");
assert_eq!(anns.len(), 2);
assert!(anns.iter().all(|a| matches!(a.kind, AnnotationKind::Italic)));
}
// `\underline` maps to Underline and `\texttt` to Code.
#[test]
fn test_strip_inline_underline_code() {
let (text, anns) = LatexExtractor::strip_inline_commands("\\underline{u} \\texttt{c}");
assert_eq!(text, "u c");
assert!(anns.iter().any(|a| matches!(a.kind, AnnotationKind::Underline)));
assert!(anns.iter().any(|a| matches!(a.kind, AnnotationKind::Code)));
}
// Nested commands yield one annotation per nesting level.
#[test]
fn test_strip_inline_nested() {
let (text, anns) = LatexExtractor::strip_inline_commands("\\textbf{\\emph{nested}}");
assert_eq!(text, "nested");
assert_eq!(anns.len(), 2);
assert!(anns.iter().any(|a| matches!(a.kind, AnnotationKind::Bold)));
assert!(anns.iter().any(|a| matches!(a.kind, AnnotationKind::Italic)));
}
// `\href{url}{text}` keeps only the link text and records the URL on the annotation.
#[test]
fn test_strip_inline_href() {
let (text, anns) = LatexExtractor::strip_inline_commands("see \\href{https://example.com}{link text} here");
assert_eq!(text, "see link text here");
assert_eq!(anns.len(), 1);
match &anns[0].kind {
AnnotationKind::Link { url, .. } => assert_eq!(url, "https://example.com"),
_ => panic!("expected Link annotation"),
}
assert_eq!(&text[anns[0].start as usize..anns[0].end as usize], "link text");
}
// Text without commands passes through unchanged and unannotated.
#[test]
fn test_strip_no_commands() {
let (text, anns) = LatexExtractor::strip_inline_commands("plain text only");
assert_eq!(text, "plain text only");
assert!(anns.is_empty());
}
// The path is extracted with and without an options list; absent command yields None.
#[test]
fn test_extract_includegraphics_path() {
assert_eq!(
LatexExtractor::extract_includegraphics_path("\\includegraphics[width=5cm]{img/photo.png}"),
Some("img/photo.png".to_string())
);
assert_eq!(
LatexExtractor::extract_includegraphics_path("\\includegraphics{simple.jpg}"),
Some("simple.jpg".to_string())
);
assert_eq!(LatexExtractor::extract_includegraphics_path("no graphics here"), None);
}
// The caption body is returned; input without `\caption{` yields None.
#[test]
fn test_extract_caption() {
assert_eq!(
LatexExtractor::extract_caption("\\caption{A nice figure}"),
Some("A nice figure".to_string())
);
assert_eq!(LatexExtractor::extract_caption("no caption"), None);
}
// Nested braces stay in the content, and `consumed` points just past the closing brace.
#[test]
fn test_read_braced_content_nested() {
let (content, consumed) = LatexExtractor::read_braced_content("outer {inner} end}rest").unwrap();
assert_eq!(content, "outer {inner} end");
assert_eq!(&"outer {inner} end}rest"[consumed..], "rest");
}
// Preamble title/author/date are emitted as a metadata block node.
#[test]
fn test_build_document_structure_with_metadata() {
let latex = r"\title{Test}
\author{Author}
\date{2024}
\begin{document}
Hello.
\end{document}";
let doc = LatexExtractor::build_document_structure(latex);
assert!(doc.validate().is_ok());
let meta = doc.nodes.iter().find(|n| {
matches!(&n.content, NodeContent::MetadataBlock { entries } if entries.iter().any(|(k, _)| k == "title"))
});
assert!(meta.is_some(), "should have metadata block");
}
// A `\footnote{...}` inside a paragraph line becomes its own footnote node.
#[test]
fn test_build_document_structure_with_footnote() {
let latex = r"\begin{document}
Text with\footnote{A note} more.
\end{document}";
let doc = LatexExtractor::build_document_structure(latex);
assert!(doc.validate().is_ok());
let has_footnote = doc
.nodes
.iter()
.any(|n| matches!(&n.content, NodeContent::Footnote { text } if text.contains("A note")));
assert!(has_footnote);
}
// A figure environment yields an image node carrying the graphics path as its
// description and the caption as a `caption` attribute.
#[test]
fn test_build_document_structure_with_figure() {
let latex = r"\begin{document}
\begin{figure}
\includegraphics{img.png}
\caption{My caption}
\end{figure}
\end{document}";
let doc = LatexExtractor::build_document_structure(latex);
assert!(doc.validate().is_ok());
let img = doc
.nodes
.iter()
.find(|n| matches!(&n.content, NodeContent::Image { .. }));
assert!(img.is_some(), "should have image node");
let img = img.unwrap();
match &img.content {
NodeContent::Image { description, .. } => {
assert_eq!(description.as_deref(), Some("img.png"));
}
_ => unreachable!(),
}
assert_eq!(
img.attributes
.as_ref()
.and_then(|a| a.get("caption"))
.map(|s| s.as_str()),
Some("My caption")
);
}
// Inline formatting on a paragraph line is attached to the paragraph node as annotations.
#[test]
fn test_build_document_structure_inline_annotations() {
let latex = r"\begin{document}
This is \textbf{bold} and \emph{italic}.
\end{document}";
let doc = LatexExtractor::build_document_structure(latex);
assert!(doc.validate().is_ok());
let para = doc
.nodes
.iter()
.find(|n| matches!(&n.content, NodeContent::Paragraph { text } if text.contains("bold")))
.expect("should have paragraph");
assert!(!para.annotations.is_empty(), "should have annotations");
assert!(para.annotations.iter().any(|a| matches!(a.kind, AnnotationKind::Bold)));
assert!(
para.annotations
.iter()
.any(|a| matches!(a.kind, AnnotationKind::Italic))
);
}
}