use tree_sitter::Node;
use super::{ProseRange, shared};
/// Node kinds of dedicated environment nodes whose whole subtree is ignored
/// when collecting words: `collect_words` returns immediately for any node
/// whose `kind()` appears here.
const SKIP_ENV_KINDS: &[&str] = &[
    // Verbatim text and code listings.
    "verbatim_environment",
    "minted_environment",
    "listing_environment",
    // Commented-out blocks.
    "comment_environment",
    // Mathematics.
    "math_environment",
    // Embedded-language blocks.
    "asy_environment",
    "luacode_environment",
    "pycode_environment",
    "sageblock_environment",
    "sagesilent_environment",
];
/// Environment names (the text inside `\begin{...}`) for which a
/// `generic_environment` node is skipped entirely. Matching is exact, so
/// starred variants are listed separately.
const SKIP_GENERIC_ENVS: &[&str] = &[
    // Pseudocode and listings.
    "algorithm",
    "algorithmic",
    "lstlisting",
    // Display-math environments.
    "equation",
    "equation*",
    "align",
    "align*",
    "gather",
    "gather*",
    "multline",
    "multline*",
    "flalign",
    "flalign*",
    "split",
    "mathpar",
    "mathpar*",
    "IEEEeqnarray",
    "IEEEeqnarray*",
    // Drawings.
    "tikzpicture",
    "pgfpicture",
    "forest",
    // Tables, arrays and matrices.
    "tabular",
    "tabular*",
    "array",
    "matrix",
    "bmatrix",
    "pmatrix",
    "vmatrix",
    "Bmatrix",
    "Vmatrix",
    "cases",
    // Grammar definitions.
    "bnf",
];
/// Non-environment node kinds that are skipped together with their subtree
/// (inline and displayed math).
const SKIP_NODES: &[&str] = &[
    "inline_formula",
    "displayed_equation",
];
/// Node kinds that `is_structural_node` treats as structural. Words found
/// anywhere beneath a structural node are not collected as prose.
const STRUCTURAL_NODES: &[&str] = &[
    "command_name",
    // File and package references.
    "graphics_include",
    "package_include",
    "bibstyle_include",
    // Labels, cross-references and citations.
    "label_definition",
    "label_reference",
    "citation",
];
/// Command names, without the leading backslash, for which a
/// `generic_command` node is skipped together with all of its arguments.
const SKIP_GENERIC_COMMANDS: &[&str] = &[
    // Page style and bibliography setup.
    "thispagestyle",
    "pagestyle",
    "bibliographystyle",
    "bibliography",
    // Counters and lengths.
    "setcounter",
    "addtocounter",
    "setlength",
    "addtolength",
    // Definitions and package configuration.
    "newcommand",
    "renewcommand",
    "newenvironment",
    "renewenvironment",
    "DeclareMathOperator",
    "definecolor",
    "hypersetup",
    "geometry",
    // File inclusion.
    "input",
    "include",
    // Spacing, rules and page breaks.
    "hfill",
    "vfill",
    "hspace",
    "vspace",
    "smallskip",
    "medskip",
    "bigskip",
    "hrule",
    "vrule",
    "newpage",
    "clearpage",
    // Generated front matter.
    "maketitle",
    "tableofcontents",
    "listoffigures",
    "listoftables",
    // Inline code, URLs and paths.
    "texttt",
    "verb",
    "lstinline",
    "mintinline",
    "url",
    "href",
    "path",
];
/// Caller-supplied additions to the built-in LaTeX skip lists.
///
/// Both fields borrow their data, so the value is cheap to copy. The default
/// value has two empty slices and therefore adds nothing to the built-ins.
#[derive(Debug, Clone, Copy, Default)]
pub struct LatexExtras<'a> {
    /// Extra environment names; a generic environment whose `\begin{...}`
    /// name equals one of these is skipped like the built-in ones.
    pub skip_envs: &'a [String],
    /// Extra command names, without the leading backslash; a generic command
    /// with one of these names is skipped together with its arguments.
    pub skip_commands: &'a [String],
}
/// Extracts prose ranges from a parsed LaTeX document.
///
/// The byte ranges of prose words are gathered from the syntax tree under
/// `root` (ignoring everything before the document body when one is found),
/// then handed to `shared::merge_ranges` together with the LaTeX-specific
/// gap callbacks `strip_latex_noise` and `collect_gap_exclusions`.
pub(crate) fn extract(text: &str, root: Node, extras: &LatexExtras) -> Vec<ProseRange> {
    let body_start = find_document_body_start(root, text);

    let mut words = Vec::<(usize, usize)>::new();
    collect_words(root, text, body_start, false, extras, &mut words);

    shared::merge_ranges(&words, text, strip_latex_noise, collect_gap_exclusions)
}
/// Returns `true` when `kind` names a node whose words are markup rather
/// than prose.
///
/// That covers every kind starting with `brack_group`, every kind starting
/// with `curly_group_` (the trailing underscore means a plain `curly_group`
/// does not match), and the kinds listed in `STRUCTURAL_NODES`.
fn is_structural_node(kind: &str) -> bool {
    kind.starts_with("brack_group")
        || kind.starts_with("curly_group_")
        || STRUCTURAL_NODES.contains(&kind)
}
fn find_document_body_start(root: Node, text: &str) -> usize {
let mut cursor = root.walk();
for child in root.children(&mut cursor) {
if child.kind() == "generic_environment"
&& let Some(begin_node) = child.child_by_field_name("begin")
{
let begin_text = &text[begin_node.start_byte()..begin_node.end_byte()];
if begin_text.contains("document") {
return begin_node.end_byte();
}
}
}
0
}
/// Recursively walks `node` and appends the byte range of every prose word
/// to `out`.
///
/// * `doc_start` - nodes ending at or before this offset are ignored, and a
///   word is only kept when it starts at or after it.
/// * `in_structural` - `true` when an ancestor was structural; the flag is
///   inherited by all descendants.
/// * `extras` - caller-supplied environment and command names to skip in
///   addition to the built-in lists.
fn collect_words(
    node: Node,
    text: &str,
    doc_start: usize,
    in_structural: bool,
    extras: &LatexExtras,
    out: &mut Vec<(usize, usize)>,
) {
    // The node lies completely before the document body.
    if node.end_byte() <= doc_start {
        return;
    }

    let kind = node.kind();
    let skipped = SKIP_ENV_KINDS.contains(&kind)
        || SKIP_NODES.contains(&kind)
        || (kind == "generic_environment"
            && should_skip_generic_env(node, text, extras.skip_envs))
        || (kind == "generic_command"
            && should_skip_generic_command(node, text, extras.skip_commands));
    if skipped {
        return;
    }

    let structural = in_structural || is_structural_node(kind);

    if kind == "word" {
        let (start, end) = (node.start_byte(), node.end_byte());
        // Keep only non-empty words of the body that are not part of markup.
        if !structural && start >= doc_start && start < end {
            out.push((start, end));
        }
        return;
    }

    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        collect_words(child, text, doc_start, structural, extras, out);
    }
}
/// Decides whether a `generic_environment` node must be skipped.
///
/// The name is read from the first `text` child found in a
/// `curly_group_text` of the node's first `begin` child, trimmed, and then
/// looked up in `SKIP_GENERIC_ENVS` and in `extra_skip_envs`. Returns
/// `false` when no such name can be found.
fn should_skip_generic_env(node: Node, text: &str, extra_skip_envs: &[String]) -> bool {
    let mut cursor = node.walk();
    let Some(begin) = node.children(&mut cursor).find(|c| c.kind() == "begin") else {
        return false;
    };

    let mut begin_cursor = begin.walk();
    let groups = begin
        .children(&mut begin_cursor)
        .filter(|c| c.kind() == "curly_group_text");
    for group in groups {
        let mut group_cursor = group.walk();
        let name_node = group
            .children(&mut group_cursor)
            .find(|c| c.kind() == "text");
        if let Some(name_node) = name_node {
            let env_name = text[name_node.start_byte()..name_node.end_byte()].trim();
            // The first name found decides; later groups are not consulted.
            return SKIP_GENERIC_ENVS.contains(&env_name)
                || extra_skip_envs.iter().any(|e| e == env_name);
        }
    }
    false
}
/// Decides whether a `generic_command` node must be skipped.
///
/// The name of the first `command_name` child, with its leading backslash
/// removed, is looked up in `SKIP_GENERIC_COMMANDS` and in
/// `extra_skip_commands`. Returns `false` when the node has no
/// `command_name` child.
fn should_skip_generic_command(node: Node, text: &str, extra_skip_commands: &[String]) -> bool {
    let mut cursor = node.walk();
    let Some(name_node) = node
        .children(&mut cursor)
        .find(|c| c.kind() == "command_name")
    else {
        return false;
    };

    let raw = &text[name_node.start_byte()..name_node.end_byte()];
    let name = raw.strip_prefix('\\').unwrap_or(raw);
    SKIP_GENERIC_COMMANDS.contains(&name) || extra_skip_commands.iter().any(|c| c == name)
}
/// Records an exclusion for dollar-delimited math starting at `bytes[i]`
/// (which must be `$`) and returns the index at which scanning resumes.
///
/// Both `$...$` and `$$...$$` are recognised: when the opening `$` is
/// directly followed by another `$`, the formula only ends at the next `$$`.
/// A backslash inside the formula escapes the following byte, so `\$` does
/// not end it. If no closing delimiter exists, the exclusion runs to the end
/// of `bytes`.
///
/// The pushed range is expressed in document coordinates, i.e. shifted by
/// `gap_offset`.
fn skip_inline_math_exclusion(
    bytes: &[u8],
    i: usize,
    gap_offset: usize,
    out: &mut Vec<(usize, usize)>,
) -> usize {
    let len = bytes.len();
    let display = bytes.get(i + 1) == Some(&b'$');
    let delim_len = if display { 2 } else { 1 };

    let mut j = i + delim_len;
    // Unterminated math swallows the remainder of the gap.
    let mut end = len;
    while j < len {
        match bytes[j] {
            // Skip the escaped byte as well (`\$`, `\\`, ...).
            b'\\' => j += 2,
            b'$' if !display => {
                end = j + 1;
                break;
            }
            b'$' if bytes.get(j + 1) == Some(&b'$') => {
                end = j + 2;
                break;
            }
            _ => j += 1,
        }
    }

    out.push((gap_offset + i, gap_offset + end));
    end
}
/// Records an exclusion for `\[ ... \]` display math whose opening `\[`
/// starts at `bytes[i]` and returns the index at which scanning resumes.
///
/// The exclusion is widened over the ASCII whitespace directly before the
/// opener and directly after the closer, so the line breaks that surround a
/// displayed formula are excluded with it. A backslash that is not followed
/// by `]` escapes the next byte, so `\\]` (a line break followed by a
/// bracket) does not close the formula. If no `\]` exists, the exclusion
/// runs to the end of `bytes`.
///
/// The pushed range is expressed in document coordinates, i.e. shifted by
/// `gap_offset`.
fn skip_display_math_exclusion(
    bytes: &[u8],
    i: usize,
    gap_offset: usize,
    out: &mut Vec<(usize, usize)>,
) -> usize {
    let len = bytes.len();

    // Widen backwards over the whitespace in front of `\[`.
    let mut exc_start = i;
    while exc_start > 0 && bytes[exc_start - 1].is_ascii_whitespace() {
        exc_start -= 1;
    }

    // Find the end of the closing `\]`; unterminated math ends at `len`.
    let mut j = i + 2;
    let mut close = len;
    while j < len {
        if bytes[j] != b'\\' {
            j += 1;
        } else if bytes.get(j + 1) == Some(&b']') {
            close = j + 2;
            break;
        } else {
            // Some other control sequence: skip the escaped byte too.
            j += 2;
        }
    }

    // Widen forwards over the whitespace after `\]`.
    let mut exc_end = close;
    while exc_end < len && bytes[exc_end].is_ascii_whitespace() {
        exc_end += 1;
    }

    out.push((gap_offset + exc_start, gap_offset + exc_end));
    exc_end
}
/// Records an exclusion for `\( ... \)` inline math whose opening `\(`
/// starts at `bytes[i]` and returns the index at which scanning resumes.
///
/// A backslash that is not followed by `)` escapes the next byte, so `\\)`
/// does not close the formula. If no `\)` exists, the exclusion runs to the
/// end of `bytes`.
///
/// The pushed range is expressed in document coordinates, i.e. shifted by
/// `gap_offset`.
fn skip_inline_paren_math_exclusion(
    bytes: &[u8],
    i: usize,
    gap_offset: usize,
    out: &mut Vec<(usize, usize)>,
) -> usize {
    let len = bytes.len();
    let mut j = i + 2;
    // Unterminated math swallows the remainder of the gap.
    let mut end = len;
    while j < len {
        if bytes[j] != b'\\' {
            j += 1;
        } else if bytes.get(j + 1) == Some(&b')') {
            end = j + 2;
            break;
        } else {
            // Some other control sequence: skip the escaped byte too.
            j += 2;
        }
    }
    out.push((gap_offset + i, gap_offset + end));
    end
}
/// Scans the text between two prose words and records every LaTeX construct
/// in it as an exclusion range in document coordinates (`gap_offset` is
/// added to each position).
///
/// Handled constructs, in order of precedence:
/// * `$` - delegated to `skip_inline_math_exclusion`;
/// * `\[` - delegated to `skip_display_math_exclusion`;
/// * `\(` - delegated to `skip_inline_paren_math_exclusion`;
/// * `\name`, optionally followed by `*`, plus whatever
///   `shared::skip_command_args_bytes` consumes for `{}` / `[]` arguments;
/// * a backslash followed by any other byte - two bytes;
/// * a lone `{` or `}` - one byte.
///
/// All other bytes, including a backslash that is the last byte of the gap,
/// are left alone.
fn collect_gap_exclusions(gap: &str, gap_offset: usize, out: &mut Vec<(usize, usize)>) {
    let bytes = gap.as_bytes();
    let len = bytes.len();
    let mut i = 0;
    while i < len {
        match (bytes[i], bytes.get(i + 1).copied()) {
            (b'$', _) => {
                i = skip_inline_math_exclusion(bytes, i, gap_offset, out);
            }
            (b'\\', Some(b'[')) => {
                i = skip_display_math_exclusion(bytes, i, gap_offset, out);
            }
            (b'\\', Some(b'(')) => {
                i = skip_inline_paren_math_exclusion(bytes, i, gap_offset, out);
            }
            (b'\\', Some(next)) if next.is_ascii_alphabetic() => {
                let start = i;
                // Step over the backslash and the alphabetic command name.
                let name_len = bytes[i + 1..]
                    .iter()
                    .take_while(|b| b.is_ascii_alphabetic())
                    .count();
                i += 1 + name_len;
                // Starred variant, e.g. `\section*`.
                if i < len && bytes[i] == b'*' {
                    i += 1;
                }
                i = shared::skip_command_args_bytes(bytes, i, &[(b'{', b'}'), (b'[', b']')]);
                out.push((gap_offset + start, gap_offset + i));
            }
            (b'\\', Some(_)) => {
                // Control symbol such as `\,` or `\%`.
                out.push((gap_offset + i, gap_offset + i + 2));
                i += 2;
            }
            (b'{' | b'}', _) => {
                out.push((gap_offset + i, gap_offset + i + 1));
                i += 1;
            }
            _ => i += 1,
        }
    }
}
/// Returns `gap` with LaTeX markup removed, leaving only the characters
/// that matter for deciding how neighbouring prose relates.
///
/// * Math (`$...$`, `$$...$$`, `\(...\)`, `\[...\]`) is replaced by a single
///   space. Inside math a backslash escapes the following character, so
///   `\$` or `\\]` do not end the formula; unterminated math extends to the
///   end of the gap.
/// * The commands listed in the `matches!` below (`\begin`, `\end`, `\item`,
///   sectioning commands, `\hfill`, ...) are copied through unchanged,
///   backslash included.
/// * Any other `\name` command is dropped together with an optional `*` and
///   whatever `shared::skip_command_args_chars` consumes for `{}` / `[]`
///   arguments.
/// * A backslash followed by a non-letter is dropped with that character.
/// * Everything else is copied verbatim.
fn strip_latex_noise(gap: &str) -> String {
    let chars: Vec<char> = gap.chars().collect();
    let len = chars.len();
    let mut result = String::with_capacity(gap.len());
    let mut i = 0;
    while i < len {
        let current = chars[i];
        let next = chars.get(i + 1).copied();

        if current == '$' {
            // `$$` opens display math that only ends at the next `$$`.
            let display = next == Some('$');
            let delim_len = if display { 2 } else { 1 };
            let mut j = i + delim_len;
            let mut end = len;
            while j < len {
                if chars[j] == '\\' {
                    // Escaped character, e.g. `\$`.
                    j += 2;
                } else if chars[j] == '$' && (!display || chars.get(j + 1) == Some(&'$')) {
                    end = j + delim_len;
                    break;
                } else {
                    j += 1;
                }
            }
            i = end;
            result.push(' ');
        } else if current == '\\' && matches!(next, Some('[' | '(')) {
            let close = if next == Some('[') { ']' } else { ')' };
            let mut j = i + 2;
            let mut end = len;
            while j < len {
                if chars[j] != '\\' {
                    j += 1;
                } else if chars.get(j + 1) == Some(&close) {
                    end = j + 2;
                    break;
                } else {
                    // Some other control sequence inside the formula.
                    j += 2;
                }
            }
            i = end;
            result.push(' ');
        } else if current == '\\' && matches!(next, Some(c) if c.is_ascii_alphabetic()) {
            let cmd_start = i + 1;
            let mut j = cmd_start;
            while j < len && chars[j].is_ascii_alphabetic() {
                j += 1;
            }
            let cmd: String = chars[cmd_start..j].iter().collect();
            if matches!(
                cmd.as_str(),
                "begin"
                    | "end"
                    | "item"
                    | "par"
                    | "section"
                    | "subsection"
                    | "subsubsection"
                    | "paragraph"
                    | "chapter"
                    | "part"
                    | "hfill"
                    | "vfill"
                    | "newline"
                    | "linebreak"
                    | "noindent"
            ) {
                // Keep the backslash; the name and anything after it are
                // then copied by the ordinary-character branch.
                result.push(current);
                i += 1;
                continue;
            }
            i = j;
            if i < len && chars[i] == '*' {
                i += 1;
            }
            i = shared::skip_command_args_chars(&chars, i, &[('{', '}'), ('[', ']')]);
        } else if current == '\\' {
            // Control symbol (or a trailing backslash): drop it and the
            // character after it, if any.
            i = (i + 2).min(len);
        } else {
            result.push(current);
            i += 1;
        }
    }
    result
}
#[cfg(test)]
mod tests {
use super::LatexExtras;
use crate::prose::ProseExtractor;
use anyhow::Result;
// Body prose and text inside \textbf are extracted; verbatim content,
// command names and the preamble are not.
#[test]
fn test_latex_basic_extraction() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\usepackage{amsmath}
\begin{document}
\section{Introduction}
This is a simple paragraph with some text.
\textbf{Bold text} and \textit{italic text} here.
\begin{verbatim}
This should be ignored completely.
\end{verbatim}
Another paragraph after verbatim.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("simple paragraph")),
"Should extract prose text, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("Bold text")),
"Should extract text inside \\textbf, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("ignored completely")),
"Should NOT extract verbatim content, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("\\textbf")),
"Should NOT contain latex commands, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("\\documentclass")),
"Should NOT contain preamble, got: {extracted:?}"
);
Ok(())
}
// Prose around `$...$` and `\[...\]` is extracted while no raw range
// contains the math itself.
#[test]
fn test_latex_math_excluded() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
Some text before math.
$x^2 + y^2 = z^2$
Text after inline math.
\[
\int_0^1 f(x) \, dx
\]
Text after display math.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("before math")),
"Should extract text before math, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("after inline math")),
"Should extract text after math, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("x^2")),
"Should NOT extract inline math, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("\\int")),
"Should NOT extract display math, got: {extracted:?}"
);
Ok(())
}
// Title and author given in the preamble are not extracted; body text is.
#[test]
fn test_latex_preamble_excluded() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\usepackage{amsmath}
\title{My Document}
\author{John Doe}
\begin{document}
Hello world.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("Hello world")),
"Should extract body text, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("My Document")),
"Should NOT extract title from preamble, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("John Doe")),
"Should NOT extract author from preamble, got: {extracted:?}"
);
Ok(())
}
// A snippet without a `document` environment is still treated as body text.
#[test]
fn test_latex_no_document_env() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\section{Test}
Some text here.
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("text here")),
"Should extract text from snippet without document env, got: {extracted:?}"
);
Ok(())
}
// Larger sample mixing headings, prose, algorithm/minted blocks and display
// math: prose and the section heading are extracted, while command names,
// the preamble, minted code and algorithm content are not.
#[test]
fn test_latex_real_content() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass[10pt]{article}
\usepackage{styles/pagestyle}
\usepackage{styles/codestyle}
\begin{document}
{\scshape Notes } \hfill {\scshape \large } \hfill {\scshape \today}
\smallskip
\hrule
\bigskip
\section{Insertion sort}
There are two popular variants of insertsion sort you typically see
\begin{algorithm}[H]
\caption{InsertionSort A}
\begin{algorithmic}[1]
\State $i \gets 1$
\While{$i < \text{length}(A)$}
\State $j \gets i$
\EndWhile
\end{algorithmic}
\end{algorithm}
\subsection{InsertionSort A}
The invariants for this version are relatively straightforward. The first invariant we specify is that the outer loop variable $i$ is always between $1$ and the length of the array (inclusive). So
\[
1 \leq i \leq \text{length}(A) \tag{Index Constraint}
\]
Secondly, for the outer loop, we weaken the postcondition with the index variable $i$ to get the invariant that the subarray $A[0..i)$ is sorted.
\begin{grayblock}
One sidenote we can actually weaken the `elements greater than key' invariant as follows
\[
\forall k.\ j < k \leq i \to A[k] \geq \text{key}
\]
\end{grayblock}
\begin{minted}{dafny}
method InsertionSortA(a : array<int>)
modifies a
requires a.Length >= 1
\end{minted}
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("popular variants")),
"Should extract prose about variants, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("Insertion sort")),
"Should extract section heading, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("\\section")),
"Should NOT contain \\section command, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("pagestyle")),
"Should NOT contain preamble, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("InsertionSortA")),
"Should NOT contain minted code, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("\\caption")),
"Should NOT contain algorithm content, got: {extracted:?}"
);
Ok(())
}
// Prose on both sides of a nested algorithm/algorithmic environment is
// still extracted.
#[test]
fn test_latex_algorithm_env() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
Text before algorithm.
\begin{algorithm}[H]
\caption{InsertionSort}
\begin{algorithmic}[1]
\State $i \gets 1$
\end{algorithmic}
\end{algorithm}
Text after algorithm.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("before algorithm")),
"Should extract text before algorithm, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("after algorithm")),
"Should extract text after algorithm, got: {extracted:?}"
);
Ok(())
}
// A sentence interrupted by inline math stays one range, and a comma does
// not split a sentence either.
#[test]
fn test_latex_inline_math_bridges() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
The variable $i$ is always between $1$ and the length of the array.
Some text, with a comma and more text after it.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("variable")
&& t.contains("always between")
&& t.contains("length")),
"Sentence with inline math should be a single chunk bridging across $i$ and $1$, got: {extracted:?}"
);
assert!(
extracted
.iter()
.any(|t| t.contains("text,") || (t.contains("text") && t.contains("comma"))),
"Sentence with comma should stay together, got: {extracted:?}"
);
Ok(())
}
// Optional arguments of \includegraphics never show up as standalone
// ranges, and the prose around the command is kept.
#[test]
fn test_latex_includegraphics_not_extracted() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
Some text before.
\includegraphics[width=0.5\textwidth]{array.pdf}
Some text after.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
!extracted.iter().any(|t| *t == "width" || *t == "0.5"),
"Should NOT extract includegraphics optional args, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("text before")),
"Should extract prose before includegraphics, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("text after")),
"Should extract prose after includegraphics, got: {extracted:?}"
);
Ok(())
}
// A sentence continues across `\[...\]`: one range spans both halves, it
// carries exclusions, and `extract_text` yields prose without the math and
// without the newlines surrounding it.
#[test]
fn test_latex_display_math_excluded_from_text() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
We know that
\[
x^2 + y^2 = z^2
\]
which proves our claim.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let bridged = ranges.iter().any(|r| {
let raw = &text[r.start_byte..r.end_byte];
raw.contains("know that") && raw.contains("proves our claim")
});
assert!(
bridged,
"Sentence should bridge across display math, got: {:?}",
ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect::<Vec<_>>()
);
let bridged_range = ranges
.iter()
.find(|r| {
let raw = &text[r.start_byte..r.end_byte];
raw.contains("know that") && raw.contains("proves our claim")
})
.expect("Should have a bridged range");
assert!(
!bridged_range.exclusions.is_empty(),
"Should have exclusions for display math"
);
let clean_text = bridged_range.extract_text(text);
assert!(
!clean_text.contains("x^2"),
"extract_text should not contain math content, got: {:?}",
clean_text
);
assert!(
clean_text.contains("know that"),
"extract_text should still contain prose, got: {:?}",
clean_text
);
assert!(
!clean_text.contains('\n'),
"extract_text should blank newlines around display math, got: {:?}",
clean_text
);
Ok(())
}
// Prose before and after display math ends up in one range whose cleaned
// text flows from "is:" into "the intuition" with the math commands removed.
#[test]
fn test_latex_display_math_no_false_capitalization() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
Thus, our invariant for the inner loop is:
\[
\forall p, q.\ 0 \leq p < q
\]
the intuition here being that all elements are in sorted order.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let bridged = ranges.iter().find(|r| {
let raw = &text[r.start_byte..r.end_byte];
raw.contains("invariant") && raw.contains("intuition")
});
assert!(
bridged.is_some(),
"Should bridge across display math, got: {:?}",
ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect::<Vec<_>>()
);
let range = bridged.unwrap();
let clean = range.extract_text(text);
assert!(
clean.contains("is:") && clean.contains("the intuition"),
"Prose should flow continuously, got: {:?}",
clean
);
assert!(
!clean.contains("\\forall"),
"Math commands should be blanked, got: {:?}",
clean
);
Ok(())
}
// Nothing inside a `mathpar` environment (rule labels, single-letter
// variables) is extracted; the prose around it is.
#[test]
fn test_latex_mathpar_skipped() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
We define the rules as follows
\begin{mathpar}
\inferrule
{ }
{\Gamma \vdash n : \text{num}} \quad \text{T-Num}
\inferrule
{\Gamma (x) = \tau}
{\Gamma \vdash x : \tau} \quad \text{T-Var}
\end{mathpar}
The proof is complete.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("define the rules")),
"Should extract prose before mathpar, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("proof is complete")),
"Should extract prose after mathpar, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("T-Num")),
"Should NOT extract inference rule labels, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("T-Var")),
"Should NOT extract inference rule labels, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| *t == "x" || *t == "n"),
"Should NOT extract single variable names from mathpar, got: {extracted:?}"
);
Ok(())
}
// The argument of \thispagestyle is not extracted as prose.
#[test]
fn test_latex_thispagestyle_skipped() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
\thispagestyle{empty}
Hello world.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
!extracted.iter().any(|t| t.contains("empty")),
"Should NOT extract thispagestyle argument, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("Hello world")),
"Should extract body prose, got: {extracted:?}"
);
Ok(())
}
// Text fragments separated by \hfill must not be merged into one range.
#[test]
fn test_latex_hfill_breaks_ranges() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
{\scshape LV } \hfill {\scshape \large Assignment 1} \hfill {\scshape \today}
Some real prose here.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
!extracted
.iter()
.any(|t| t.contains("LV") && t.contains("Assignment")),
"\\hfill should break ranges, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("real prose")),
"Should extract body prose, got: {extracted:?}"
);
Ok(())
}
// The parameters of a `bnf` environment are not extracted; the prose
// before and after the environment is.
#[test]
fn test_latex_bnf_env_skipped() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
The syntax is defined as follows.
\begin{bnf}(
prod-delim={--},
comment={//},
)[
colspec = {llcll},
]
e // Expr ::=
| n // number
\end{bnf}
That concludes the grammar.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("syntax is defined")),
"Should extract prose before bnf, got: {extracted:?}"
);
assert!(
extracted
.iter()
.any(|t| t.contains("concludes the grammar")),
"Should extract prose after bnf, got: {extracted:?}"
);
assert!(
!extracted
.iter()
.any(|t| t.contains("prod-delim") || t.contains("colspec")),
"Should NOT extract bnf parameters, got: {extracted:?}"
);
Ok(())
}
// `extract_text` on a range spanning inline `$...$` formulas returns the
// prose without either formula.
#[test]
fn test_latex_inline_math_excluded_from_text() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
The value $x + 1$ is positive and $y - 2$ is negative.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let range = ranges
.iter()
.find(|r| {
let raw = &text[r.start_byte..r.end_byte];
raw.contains("value") && raw.contains("positive")
})
.expect("Should have a range containing the sentence");
let clean = range.extract_text(text);
assert!(
!clean.contains("x + 1"),
"extract_text should not contain inline math, got: {clean:?}"
);
assert!(
!clean.contains("y - 2"),
"extract_text should not contain second inline math, got: {clean:?}"
);
assert!(
clean.contains("value"),
"extract_text should preserve prose, got: {clean:?}"
);
assert!(
clean.contains("positive"),
"extract_text should preserve prose after math, got: {clean:?}"
);
Ok(())
}
// `extract_text` removes `\(...\)` inline math from the sentence around it.
#[test]
fn test_latex_paren_math_excluded_from_text() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
We define \(f(x) = x^2\) for all reals.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let range = ranges
.iter()
.find(|r| {
let raw = &text[r.start_byte..r.end_byte];
raw.contains("define") && raw.contains("reals")
})
.expect("Should have a range containing the sentence");
let clean = range.extract_text(text);
assert!(
!clean.contains("f(x)"),
"extract_text should not contain \\(...\\) math, got: {clean:?}"
);
assert!(
clean.contains("define"),
"extract_text should preserve prose, got: {clean:?}"
);
Ok(())
}
// Formatting command names (\textsc, \textbf) are absent from
// `extract_text`, while the surrounding prose is kept.
#[test]
fn test_latex_command_excluded_from_text() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
The \textsc{Foo} method solves \textbf{bar} problems.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let range = ranges
.iter()
.find(|r| {
let raw = &text[r.start_byte..r.end_byte];
raw.contains("method") && raw.contains("solves")
})
.expect("Should have a range containing the sentence");
let clean = range.extract_text(text);
assert!(
!clean.contains("\\textsc"),
"extract_text should not contain \\textsc command, got: {clean:?}"
);
assert!(
!clean.contains("\\textbf"),
"extract_text should not contain \\textbf command, got: {clean:?}"
);
assert!(
clean.contains("method"),
"extract_text should preserve surrounding prose, got: {clean:?}"
);
assert!(
clean.contains("solves"),
"extract_text should preserve surrounding prose, got: {clean:?}"
);
Ok(())
}
// An environment that is not built in (`prooftree`) is extracted by
// default and skipped once it is listed in `LatexExtras::skip_envs`.
#[test]
fn test_latex_custom_skip_env() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
Text before custom env.
\begin{prooftree}
Some proof tree content here.
\end{prooftree}
Text after custom env.
\end{document}
";
// First pass: default extras, so the environment content is prose.
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("proof tree content")),
"Without config, prooftree content should be extracted, got: {extracted:?}"
);
// Second pass: same document with `prooftree` configured as skipped.
let extra = vec!["prooftree".to_string()];
let extras = LatexExtras {
skip_envs: &extra,
..LatexExtras::default()
};
let ranges = extractor.extract(text, "latex", &extras)?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
!extracted.iter().any(|t| t.contains("proof tree content")),
"With config, prooftree content should be skipped, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("Text before")),
"Prose before should still be extracted, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("Text after")),
"Prose after should still be extracted, got: {extracted:?}"
);
Ok(())
}
// The argument of \texttt does not appear in `extract_text`; the
// surrounding sentence does.
#[test]
fn test_texttt_content_not_extracted() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
Use the \texttt{myvar} variable in your code.
\end{document}
";
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let range = ranges
.iter()
.find(|r| {
let raw = &text[r.start_byte..r.end_byte];
raw.contains("variable")
})
.expect("Should have a range containing surrounding prose");
let clean = range.extract_text(text);
assert!(
!clean.contains("myvar"),
"extract_text should not contain \\texttt argument, got: {clean:?}"
);
assert!(
clean.contains("variable"),
"extract_text should preserve surrounding prose, got: {clean:?}"
);
Ok(())
}
// The argument of an unknown command (\codefont) is prose by default and
// is removed once the command is listed in `LatexExtras::skip_commands`.
#[test]
fn test_custom_skip_command() -> Result<()> {
let language: tree_sitter::Language = codebook_tree_sitter_latex::LANGUAGE.into();
let mut extractor = ProseExtractor::new(language)?;
let text = r"\documentclass{article}
\begin{document}
The \codefont{badspeling} function works.
\end{document}
";
// First pass: default extras, so the argument is part of the prose.
let ranges = extractor.extract(text, "latex", &LatexExtras::default())?;
let clean_texts: Vec<_> = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
clean_texts.iter().any(|t| t.contains("badspeling")),
"Without config, codefont content should appear in prose, got: {clean_texts:?}"
);
// Second pass: same document with `codefont` configured as skipped.
let skip_cmds = vec!["codefont".to_string()];
let extras = LatexExtras {
skip_commands: &skip_cmds,
..LatexExtras::default()
};
let ranges = extractor.extract(text, "latex", &extras)?;
let range = ranges
.iter()
.find(|r| {
let raw = &text[r.start_byte..r.end_byte];
raw.contains("function works")
})
.expect("Should have a range with surrounding prose");
let clean = range.extract_text(text);
assert!(
!clean.contains("badspeling"),
"With config, codefont argument should not appear in extract_text, got: {clean:?}"
);
assert!(
clean.contains("function works"),
"Surrounding prose should be preserved, got: {clean:?}"
);
Ok(())
}
}