use tree_sitter::Node;
use super::{ProseRange, shared};
/// Commands whose whole subtree is ignored: `handle_command` returns for
/// these names without descending into any argument, so nothing inside them
/// is collected as prose.
const STRUCTURAL_COMMANDS: &[&str] = &[
"\\import",
"\\export",
"\\transclude",
"\\ref",
"\\author",
"\\contributor",
"\\date",
"\\parent",
"\\tag",
"\\taxon",
"\\meta",
"\\number",
"\\def",
"\\let",
"\\alloc",
"\\open",
"\\namespace",
"\\put",
"\\get",
"\\put?",
"\\object",
"\\patch",
"\\call",
"\\tex",
"\\codeblock",
"\\pre",
"\\startverb",
"\\xmlns",
"\\query",
"\\datalog",
"\\code",
];
/// Commands that start a fresh prose scope: `handle_command` pushes a new
/// scope before visiting their children (skipping `bracket_group` children)
/// and another one afterwards, so their text is merged separately from the
/// text around them.
const BLOCK_COMMANDS: &[&str] = &[
"\\p",
"\\li",
"\\ol",
"\\ul",
"\\title",
"\\blockquote",
"\\figure",
"\\figcaption",
"\\scope",
"\\subtree",
];
/// Commands whose argument stays part of the surrounding prose: only the
/// braces of their `brace_group` children are added to the skip list.
const INLINE_COMMANDS: &[&str] = &["\\em", "\\strong"];
/// Node kinds that `collect_prose_nodes` records as skipped in full instead of
/// descending into them (single-letter inline math is the one exception
/// handled there).
const SKIP_KINDS: &[&str] = &[
"inline_math",
"display_math",
"verbatim",
"comment",
"wiki_link",
"command_name",
];
/// Extracts the prose ranges of a Forester document.
///
/// The tree rooted at `root` is walked once to gather candidate byte ranges
/// (grouped into scopes) and skipped byte ranges. Math regions found by a
/// plain scan of `text` are added to the skips as well. Every non-empty scope
/// is then handed to `shared::merge_ranges`, the skips are installed as
/// exclusions, duplicate exclusions are removed, and ranges that end up fully
/// excluded are dropped.
pub fn extract(text: &str, root: Node) -> Vec<ProseRange> {
    let mut scopes: Vec<Vec<(usize, usize)>> = vec![Vec::new()];
    let mut skips: Vec<(usize, usize)> = Vec::new();

    collect_prose_nodes(root, text, &mut scopes, &mut skips, false);
    skips.extend(find_math_regions(text));

    let mut ranges: Vec<ProseRange> = Vec::new();
    for scope in &scopes {
        if scope.is_empty() {
            continue;
        }
        ranges.extend(shared::merge_ranges(
            scope,
            text,
            strip_forester_noise,
            collect_forester_exclusions,
        ));
    }

    shared::install_skip_exclusions(&mut ranges, &skips, text.as_bytes());
    shared::dedup_exclusions(&mut ranges);
    ranges.retain(|range| !shared::is_fully_excluded(range));
    ranges
}
/// Returns the source text of the first direct `command_name` child of
/// `node`, or `None` when the node has no such child.
fn get_command_name<'a>(node: Node, text: &'a str) -> Option<&'a str> {
    let mut cursor = node.walk();
    let name_node = node
        .children(&mut cursor)
        .find(|child| child.kind() == "command_name");
    name_node.map(|found| &text[found.start_byte()..found.end_byte()])
}
/// Scans `slice` for `\name` commands and records the command syntax as
/// skipped byte ranges, expressed relative to the whole document by adding
/// `base_offset`.
///
/// A command name is a backslash, an ASCII letter, and then any run of ASCII
/// letters or `?`. When the name is directly followed by `{`, the name plus
/// the opening brace is skipped, and so is the single byte just before the
/// position returned by `shared::skip_balanced_bytes`; the text between the
/// braces is left in place and scanned for nested commands. Otherwise only
/// the command name itself is skipped.
fn strip_inline_commands_in_range(
    slice: &str,
    base_offset: usize,
    skips: &mut Vec<(usize, usize)>,
) {
    let raw = slice.as_bytes();
    let mut pos = 0;
    while pos < raw.len() {
        let starts_command =
            raw[pos] == b'\\' && raw.get(pos + 1).is_some_and(|c| c.is_ascii_alphabetic());
        if !starts_command {
            pos += 1;
            continue;
        }

        let backslash = pos;
        pos += 1;
        while raw
            .get(pos)
            .is_some_and(|c| c.is_ascii_alphabetic() || *c == b'?')
        {
            pos += 1;
        }

        if raw.get(pos) != Some(&b'{') {
            // Bare command without a brace argument: hide just the name.
            skips.push((base_offset + backslash, base_offset + pos));
            continue;
        }

        let after_open = pos + 1;
        skips.push((base_offset + backslash, base_offset + after_open));
        let close = shared::skip_balanced_bytes(raw, after_open, b'{', b'}', None);
        if close > after_open {
            skips.push((base_offset + close - 1, base_offset + close));
        }
        // Continue right after the opening brace so nested commands are seen.
        pos = after_open;
    }
}
/// Reports whether `trimmed` is exactly one ASCII uppercase letter other
/// than `I`.
///
/// Anything else (lowercase letters, digits, symbols, the empty string,
/// multi-byte or multi-character input) yields `false`.
fn single_letter_kept_as_prose(trimmed: &str) -> bool {
    let mut remaining = trimmed.bytes();
    match (remaining.next(), remaining.next()) {
        (Some(only), None) => only != b'I' && only.is_ascii_uppercase(),
        _ => false,
    }
}
/// For an inline-math node written as `#{...}` whose trimmed content passes
/// `single_letter_kept_as_prose`, returns the byte offset of that letter in
/// `text`.
///
/// Returns `None` when the node text is not wrapped in `#{` and `}` or when
/// the content is not such a letter.
fn inline_math_single_letter(node: Node, text: &str) -> Option<usize> {
    let node_start = node.start_byte();
    let inner = text[node_start..node.end_byte()]
        .strip_prefix("#{")
        .and_then(|rest| rest.strip_suffix('}'))?;
    let letter = inner.trim();
    if !single_letter_kept_as_prose(letter) {
        return None;
    }
    inner
        .find(letter)
        .map(|offset_in_inner| node_start + "#{".len() + offset_in_inner)
}
fn push_to_scope(node: Node, scopes: &mut [Vec<(usize, usize)>]) {
let start = node.start_byte();
let end = node.end_byte();
if start < end
&& let Some(scope) = scopes.last_mut()
{
scope.push((start, end));
}
}
/// Handles a `markdown_link` node so that only its alias text survives.
///
/// The alias is taken to run from the byte after the node start up to the
/// first `]` in the node text. If there is no `]` or the alias is empty, the
/// whole node is skipped. Otherwise everything before the alias and everything
/// from the `]` onwards is skipped, command syntax inside the alias is
/// stripped, and, when `in_prose` is set, the full node span is added to the
/// innermost scope.
fn handle_markdown_link(
    node: Node,
    text: &str,
    scopes: &mut [Vec<(usize, usize)>],
    skips: &mut Vec<(usize, usize)>,
    in_prose: bool,
) {
    let (link_start, link_end) = (node.start_byte(), node.end_byte());

    let alias_span = text[link_start..link_end]
        .find(']')
        .map(|close_bracket| (link_start + 1, link_start + close_bracket))
        .filter(|(alias_start, alias_end)| alias_start < alias_end);
    let Some((alias_start, alias_end)) = alias_span else {
        skips.push((link_start, link_end));
        return;
    };

    // Leading delimiter, then command syntax inside the alias, then the rest.
    skips.push((link_start, alias_start));
    strip_inline_commands_in_range(&text[alias_start..alias_end], alias_start, skips);
    skips.push((alias_end, link_end));

    if !in_prose {
        return;
    }
    if let Some(current) = scopes.last_mut() {
        current.push((link_start, link_end));
    }
}
/// Dispatches a `command` node according to the table its name appears in.
///
/// * Structural commands are ignored entirely.
/// * Block commands open a fresh scope, visit every child except
///   `bracket_group` children as prose, and then open another fresh scope so
///   following text does not merge with the block.
/// * Inline commands skip the first and last byte of each `brace_group`
///   child and visit all children as prose.
/// * Any other command (including one without a name) has its children
///   visited as non-prose.
fn handle_command(
    node: Node,
    text: &str,
    scopes: &mut Vec<Vec<(usize, usize)>>,
    skips: &mut Vec<(usize, usize)>,
) {
    let name = get_command_name(node, text);
    let listed_in = |table: &[&str]| name.is_some_and(|n| table.contains(&n));

    if listed_in(STRUCTURAL_COMMANDS) {
        return;
    }

    let mut cursor = node.walk();
    if listed_in(BLOCK_COMMANDS) {
        scopes.push(Vec::new());
        for child in node
            .children(&mut cursor)
            .filter(|child| child.kind() != "bracket_group")
        {
            collect_prose_nodes(child, text, scopes, skips, true);
        }
        scopes.push(Vec::new());
    } else if listed_in(INLINE_COMMANDS) {
        for child in node.children(&mut cursor) {
            if child.kind() == "brace_group" {
                let (open, close) = (child.start_byte(), child.end_byte());
                skips.push((open, open + 1));
                skips.push((close - 1, close));
            }
            collect_prose_nodes(child, text, scopes, skips, true);
        }
    } else {
        for child in node.children(&mut cursor) {
            collect_prose_nodes(child, text, scopes, skips, false);
        }
    }
}
/// Recursively walks `node`, adding prose spans to `scopes` and non-prose
/// spans to `skips`.
///
/// * Kinds listed in `SKIP_KINDS` are skipped in full. The exception is inline
///   math holding a single kept letter: everything but the letter is skipped
///   and, when `in_prose` is set, the node span is added to the current scope.
/// * `markdown_link` and `command` nodes are delegated to their handlers.
/// * `paren_group` and `text` nodes are added to the current scope when
///   `in_prose` is set.
/// * Every other node is descended into; children count as prose when the
///   parent did or when the parent is the `source_file` root.
fn collect_prose_nodes(
    node: Node,
    text: &str,
    scopes: &mut Vec<Vec<(usize, usize)>>,
    skips: &mut Vec<(usize, usize)>,
    in_prose: bool,
) {
    let kind = node.kind();

    if SKIP_KINDS.contains(&kind) {
        let (start, end) = (node.start_byte(), node.end_byte());
        let kept_letter = if kind == "inline_math" {
            inline_math_single_letter(node, text)
        } else {
            None
        };
        match kept_letter {
            Some(letter_byte) => {
                skips.push((start, letter_byte));
                skips.push((letter_byte + 1, end));
                if in_prose {
                    push_to_scope(node, scopes);
                }
            }
            None => skips.push((start, end)),
        }
        return;
    }

    match kind {
        "markdown_link" => handle_markdown_link(node, text, scopes, skips, in_prose),
        "command" => handle_command(node, text, scopes, skips),
        "paren_group" | "text" if in_prose => push_to_scope(node, scopes),
        _ => {
            let children_are_prose = in_prose || kind == "source_file";
            let mut cursor = node.walk();
            for child in node.children(&mut cursor) {
                collect_prose_nodes(child, text, scopes, skips, children_are_prose);
            }
        }
    }
}
/// Produces a copy of `gap` with Forester markup removed.
///
/// * `##{...}` and `#{...}` regions are each replaced by a single space.
/// * A backslash followed by an ASCII alphanumeric starts a command: its name
///   (alphanumerics plus `-`, `/`, `?`, `*`) and whatever
///   `shared::skip_command_args_chars` consumes after it are dropped.
/// * Any other backslash is dropped together with the character after it.
/// * `%` drops everything up to, but not including, the next newline.
/// * All other characters are copied through unchanged.
fn strip_forester_noise(gap: &str) -> String {
    let chars: Vec<char> = gap.chars().collect();
    let total = chars.len();
    let mut cleaned = String::new();
    let mut pos = 0;
    while pos < total {
        match &chars[pos..] {
            ['#', '#', '{', ..] => {
                pos = shared::skip_balanced_chars(&chars, pos + 3, '{', '}');
                cleaned.push(' ');
            }
            ['#', '{', ..] => {
                pos = shared::skip_balanced_chars(&chars, pos + 2, '{', '}');
                cleaned.push(' ');
            }
            ['\\', next, ..] if next.is_ascii_alphanumeric() => {
                pos += 1;
                while chars
                    .get(pos)
                    .is_some_and(|c| c.is_ascii_alphanumeric() || matches!(*c, '-' | '/' | '?' | '*'))
                {
                    pos += 1;
                }
                pos = shared::skip_command_args_chars(&chars, pos, &[('{', '}'), ('[', ']'), ('(', ')')]);
            }
            ['\\', _, ..] => pos += 2,
            ['%', ..] => {
                pos += chars[pos..].iter().take_while(|c| **c != '\n').count();
            }
            [plain, ..] => {
                cleaned.push(*plain);
                pos += 1;
            }
            // Unreachable while `pos < total`; advance defensively.
            [] => pos += 1,
        }
    }
    cleaned
}
/// Records the byte ranges of Forester markup inside `gap` as exclusions.
///
/// Each pushed pair is `(start, end)` in document coordinates, i.e. the
/// position inside `gap` plus `offset`. The recognised constructs mirror the
/// ones removed by `strip_forester_noise`:
///
/// * `##{...}` and `#{...}` regions (extent decided by
///   `shared::skip_balanced_bytes`, with `\` as the escape byte),
/// * `\name` commands together with their arguments,
/// * a backslash followed by any other character,
/// * `%` comments up to, but not including, the next newline.
fn collect_forester_exclusions(gap: &str, offset: usize, exclusions: &mut Vec<(usize, usize)>) {
    let b = gap.as_bytes();
    let len = b.len();
    let mut i = 0;
    while i < len {
        let start = i;
        if b[i..].starts_with(b"##{") {
            i = shared::skip_balanced_bytes(b, i + 3, b'{', b'}', Some(b'\\'));
        } else if b[i..].starts_with(b"#{") {
            i = shared::skip_balanced_bytes(b, i + 2, b'{', b'}', Some(b'\\'));
        } else if b[i] == b'\\' && i + 1 < len {
            if b[i + 1].is_ascii_alphanumeric() {
                i = skip_command_with_args(b, i);
            } else {
                // Skip the backslash plus the WHOLE escaped character. A fixed
                // two-byte step would end the exclusion in the middle of a
                // multi-byte character, and would disagree with
                // `strip_forester_noise`, which drops two chars here.
                // The backslash is ASCII, so `i + 1` is a char boundary.
                let escaped_len = gap[i + 1..].chars().next().map_or(1, char::len_utf8);
                i += 1 + escaped_len;
            }
        } else if b[i] == b'%' {
            while i < len && b[i] != b'\n' {
                i += 1;
            }
        } else {
            // Plain byte: nothing to exclude.
            i += 1;
            continue;
        }
        exclusions.push((offset + start, offset + i));
    }
}
/// Given the index `i` of a command's backslash in `b`, returns the index
/// just past the command.
///
/// The backslash is stepped over, then the name bytes (ASCII alphanumerics
/// plus `-`, `/`, `?`, `*`), and finally whatever
/// `shared::skip_command_args_bytes` consumes for `{}`, `[]` and `()` groups.
fn skip_command_with_args(b: &[u8], mut i: usize) -> usize {
    let is_name_byte =
        |byte: u8| byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'/' | b'?' | b'*');

    i += 1;
    i += b.get(i..).map_or(0, |rest| {
        rest.iter().take_while(|&&byte| is_name_byte(byte)).count()
    });
    shared::skip_command_args_bytes(b, i, &[(b'{', b'}'), (b'[', b']'), (b'(', b')')])
}
/// Scans the raw `text` for math regions and returns the byte ranges to skip.
///
/// * `%` comments (up to the next newline) and backslash escapes (the
///   backslash plus the following byte) are stepped over without producing a
///   region.
/// * `##{...}` display math is skipped in full.
/// * `#{...}` inline math is skipped in full, unless its trimmed interior is a
///   single letter accepted by `single_letter_kept_as_prose`; then two ranges
///   are produced, covering everything except that letter.
///
/// The extent of each region is decided by `shared::skip_balanced_bytes`.
fn find_math_regions(text: &str) -> Vec<(usize, usize)> {
    let bytes = text.as_bytes();
    let len = bytes.len();
    let mut regions = Vec::new();
    let mut i = 0;
    while i < len {
        if bytes[i] == b'%' {
            while i < len && bytes[i] != b'\n' {
                i += 1;
            }
            continue;
        }
        if bytes[i] == b'\\' && i + 1 < len {
            i += 2;
            continue;
        }
        if bytes[i..].starts_with(b"##{") {
            let start = i;
            i = shared::skip_balanced_bytes(bytes, i + 3, b'{', b'}', Some(b'\\'));
            regions.push((start, i));
            continue;
        }
        if bytes[i..].starts_with(b"#{") {
            let start = i;
            let inner_start = start + 2;
            let end = shared::skip_balanced_bytes(bytes, inner_start, b'{', b'}', Some(b'\\'));

            // Only look for a kept letter when the region really ends in `}`.
            // NOTE(review): this assumes the helper can return a position that
            // is not preceded by `}` (e.g. for unterminated math) - confirm.
            // In that case cutting one byte off the end would drop real
            // content (so `#{FG` would look like the single letter `F`) and
            // could split a multi-byte character, which makes slicing panic.
            let closed = end > inner_start && bytes.get(end - 1) == Some(&b'}');
            let inner = if closed {
                text.get(inner_start..end - 1)
            } else {
                None
            };
            let kept_letter = inner.and_then(|inner| {
                let trimmed = inner.trim();
                if !single_letter_kept_as_prose(trimmed) {
                    return None;
                }
                inner.find(trimmed).map(|rel| inner_start + rel)
            });

            match kept_letter {
                Some(letter_offset) => {
                    regions.push((start, letter_offset));
                    regions.push((letter_offset + 1, end));
                }
                None => regions.push((start, end)),
            }
            i = end;
            continue;
        }
        i += 1;
    }
    regions
}
#[cfg(test)]
mod tests {
use crate::prose::ProseExtractor;
use crate::prose::latex::LatexExtras;
use anyhow::Result;
/// Builds a `ProseExtractor` configured with the Forester tree-sitter
/// language; shared by the extraction tests below.
fn forester_extractor() -> Result<ProseExtractor> {
let language: tree_sitter::Language = crate::forester_ts::LANGUAGE.into();
ProseExtractor::new(language)
}
/// The text inside `\title{...}` and `\p{...}` must both appear in the
/// extracted ranges.
#[test]
fn test_forester_basic_extraction() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\title{Hello World}
\p{This is a paragraph.}
";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
// Compare against the raw source slice of every returned range.
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("Hello World")),
"Should extract title text, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("This is a paragraph")),
"Should extract paragraph text, got: {extracted:?}"
);
Ok(())
}
/// Top-level `##{...}` display math between two paragraphs must not show up
/// in any raw range, while both paragraphs are still extracted.
#[test]
fn test_forester_math_excluded() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = "\\p{Text before math.}\n\n##{\\int_0^1 f(x) \\, dx}\n\n\\p{Text after math.}\n";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("Text before math")),
"Should extract text before math, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("Text after math")),
"Should extract text after math, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("\\int")),
"Should NOT extract display math content, got: {extracted:?}"
);
Ok(())
}
/// Arguments of structural commands (`\import`, `\ref`) must not be
/// extracted, while a following `\p` paragraph still is.
#[test]
fn test_forester_structural_commands_excluded() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\import{trees/basics}
\ref{tree-0001}
\p{Some actual prose.}
";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
!extracted.iter().any(|t| t.contains("trees/basics")),
"Should NOT extract import path, got: {extracted:?}"
);
assert!(
!extracted.iter().any(|t| t.contains("tree-0001")),
"Should NOT extract ref target, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("actual prose")),
"Should extract prose text, got: {extracted:?}"
);
Ok(())
}
/// A triple-backtick verbatim block between two paragraphs must not leak its
/// content into the raw ranges; both paragraphs are still extracted.
#[test]
fn test_forester_verbatim_excluded() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = "\\p{Before code.}\n```\nfn main() {}\n```\n\\p{After code.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
!extracted.iter().any(|t| t.contains("fn main")),
"Should NOT extract verbatim content, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("Before code")),
"Should extract text before verbatim, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("After code")),
"Should extract text after verbatim, got: {extracted:?}"
);
Ok(())
}
/// An inline `\em{...}` must not split its sentence: one single range has to
/// contain the text before, inside and after the command.
#[test]
fn test_forester_inline_commands_bridge() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{This has \em{emphasized} words in it.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("This has")
&& t.contains("emphasized")
&& t.contains("words in it")),
"Sentence with inline command should bridge into single chunk, got: {extracted:?}"
);
Ok(())
}
/// Display math inside a paragraph must be bridged by one range that carries
/// exclusions, and `extract_text` on that range must drop the math but keep
/// the prose.
#[test]
fn test_forester_display_math_exclusion() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{We know that
##{
x^2 + y^2 = z^2
}
which proves our claim.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
// Look for the single range spanning both sides of the math block.
let bridged = ranges.iter().find(|r| {
let raw = &text[r.start_byte..r.end_byte];
raw.contains("know that") && raw.contains("proves our claim")
});
assert!(
bridged.is_some(),
"Should bridge across display math, got: {:?}",
ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect::<Vec<_>>()
);
// Cannot panic: the assertion above already established `is_some()`.
let range = bridged.unwrap();
assert!(
!range.exclusions.is_empty(),
"Should have exclusions for display math"
);
let clean_text = range.extract_text(text);
assert!(
!clean_text.contains("x^2"),
"extract_text should not contain math content, got: {:?}",
clean_text
);
assert!(
clean_text.contains("know that"),
"extract_text should still contain prose, got: {:?}",
clean_text
);
Ok(())
}
/// Each `\li` of an `\ol` must yield its own range: at least three ranges are
/// expected and no range may contain text from two different items.
#[test]
fn test_forester_list_items_separate_scopes() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\ol{\li{Item one}\li{Item two}\li{Item three}}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
assert!(
ranges.len() >= 3,
"Each list item should be a separate prose range, got {} ranges: {:?}",
ranges.len(),
ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect::<Vec<_>>()
);
assert!(
!ranges.iter().any(|r| {
let t = &text[r.start_byte..r.end_byte];
t.contains("one") && t.contains("two")
}),
"List items should not merge into a single range"
);
Ok(())
}
/// Prose around multi-character inline math must be extracted, and the math
/// must be absent from the cleaned text of the range spanning it.
#[test]
fn test_forester_inline_math_excluded() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{The value #{x + y} is positive.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("The value")),
"Should extract prose around inline math, got: {extracted:?}"
);
let range_with_math = ranges.iter().find(|r| {
let raw = &text[r.start_byte..r.end_byte];
raw.contains("value") && raw.contains("positive")
});
// NOTE(review): when no range spans both sides of the math, the check
// below is silently skipped - confirm that this leniency is intended.
if let Some(range) = range_with_math {
let clean = range.extract_text(text);
assert!(
!clean.contains("x + y"),
"Inline math should be excluded from clean text, got: {:?}",
clean
);
}
Ok(())
}
/// A single uppercase letter in inline math (`#{F}`) must stay in the cleaned
/// text, with the `#{`/`}` delimiters removed and the surrounding prose kept.
#[test]
fn test_inline_math_single_letter_included() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{Let #{F} be a functor.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let clean: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
clean.contains('F'),
"Single-letter inline math should be included as prose, got: {clean:?}"
);
assert!(
clean.contains("Let") && clean.contains("be a functor"),
"Surrounding prose should be preserved, got: {clean:?}"
);
assert!(
!clean.contains('#'),
"Math delimiters should be stripped, got: {clean:?}"
);
Ok(())
}
/// Inline math that is NOT a kept single letter must be removed from the
/// cleaned text: lowercase letters, uppercase `I`, multi-character content
/// and single digits.
#[test]
fn test_inline_math_single_letter_various() -> Result<()> {
let mut extractor = forester_extractor()?;
// Lowercase letter. The sentence avoids any other `v`, so finding one
// means the math leaked.
let text = r"\p{The quantity #{v} is fixed here.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let clean: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
!clean.contains('v') && clean.contains("quantity") && clean.contains("fixed"),
"Lowercase single letter should be excluded, got: {clean:?}"
);
// Lowercase `i`; the surrounding sentence contains no other `i`.
let text = r"\p{When #{i} equals zero we stop.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let clean: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
!clean.contains('i') && clean.contains("equals zero"),
"Lowercase i should be excluded, got: {clean:?}"
);
// Uppercase `I` is the one uppercase letter that is not kept.
let text = r"\p{Here #{I} denotes the unit.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let clean: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
!clean.contains('I') && clean.contains("denotes the unit"),
"Uppercase pronoun I should be excluded, got: {clean:?}"
);
// More than one character.
let text = r"\p{The value #{xy} is large.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let clean: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
!clean.contains("xy"),
"Multi-character math should still be excluded, got: {clean:?}"
);
// A single digit is not a letter.
let text = r"\p{The number #{3} is odd.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let clean: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
!clean.contains('3'),
"Single digit should still be excluded, got: {clean:?}"
);
Ok(())
}
/// Whitespace padding inside the math delimiters (`#{ G }`) must not prevent
/// the single letter from being kept.
#[test]
fn test_inline_math_single_letter_with_spaces() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{Let #{ G } be a group.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let clean: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
clean.contains('G'),
"Single letter with spaces should be included, got: {clean:?}"
);
Ok(())
}
/// Direct check of the predicate: single uppercase letters other than `I`
/// are kept, everything else is rejected.
#[test]
fn test_single_letter_kept_as_prose_predicate() {
use super::single_letter_kept_as_prose;
for s in ["F", "X", "G", "A", "Z"] {
assert!(single_letter_kept_as_prose(s), "{s:?} should be kept");
}
for s in ["i", "j", "x", "v", "I", "3", "xy", "", "+"] {
assert!(!single_letter_kept_as_prose(s), "{s:?} should be excluded");
}
}
/// Display math on its own line inside a paragraph must not appear in any
/// range's cleaned text, while the prose before and after it is kept.
#[test]
fn test_forester_block_math_multiline_excluded() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = "\\p{Consider the equation\n##{ x^2 + y^2 = z^2 }\nwhich is well known.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
for range in &ranges {
let clean = range.extract_text(text);
assert!(
!clean.contains("x^2"),
"Block math content should not appear in clean prose, got: {:?}",
clean
);
}
let all_text: String = ranges
.iter()
.map(|r| r.extract_text(text))
.collect::<Vec<_>>()
.join(" ");
assert!(
all_text.contains("Consider the equation"),
"Prose before block math should be extracted, got: {:?}",
all_text
);
assert!(
all_text.contains("well known"),
"Prose after block math should be extracted, got: {:?}",
all_text
);
Ok(())
}
/// A `\p` nested inside an unknown macro (`\solution`) must still be found
/// and extracted.
#[test]
fn test_forester_unknown_macros_recurse() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\solution{
\p{Prose inside unknown wrapper.}
}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted
.iter()
.any(|t| t.contains("Prose inside unknown wrapper")),
"Nested \\p inside unknown macro should be extracted, got: {extracted:?}"
);
Ok(())
}
/// Plain text that sits directly inside an unknown macro must not be
/// extracted; neighbouring `\p` paragraphs are unaffected.
#[test]
fn test_forester_unknown_macros_plain_text_skipped() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{Real prose here.}
\mymacro{macro content}
\p{More real prose.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
!extracted.iter().any(|t| t.contains("macro content")),
"Text inside unknown macro should NOT be extracted, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("Real prose")),
"Known commands should still extract prose, got: {extracted:?}"
);
Ok(())
}
/// Block commands nested in a `\subtree` must each be extracted, and the
/// title must not share a range with the first paragraph.
#[test]
fn test_forester_nested_blocks_separate_scopes() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\subtree{
\title{My Section}
\p{First paragraph.}
\p{Second paragraph.}
}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted.iter().any(|t| t.contains("My Section")),
"Title inside subtree should be extracted, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("First paragraph")),
"Paragraph inside subtree should be extracted, got: {extracted:?}"
);
assert!(
!ranges.iter().any(|r| {
let t = &text[r.start_byte..r.end_byte];
t.contains("My Section") && t.contains("First paragraph")
}),
"Title and paragraph should be separate scopes"
);
Ok(())
}
/// A display-math `align*` environment inside a list item must not leak any
/// LaTeX command into the cleaned text; the prose around it is kept.
#[test]
fn test_forester_display_math_align_inside_li() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\ol{\li{We have the equation
##{
\begin{align*}
\mathcal{C} &\vDash \forall x.\, \varphi(x) \\
&\Rightarrow \psi
\end{align*}
}
which completes the proof.}}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
for range in &ranges {
let clean = range.extract_text(text);
assert!(
!clean.contains("\\mathcal"),
"LaTeX \\mathcal should not leak into prose, got: {clean:?}"
);
assert!(
!clean.contains("\\vDash"),
"LaTeX \\vDash should not leak into prose, got: {clean:?}"
);
assert!(
!clean.contains("\\forall"),
"LaTeX \\forall should not leak into prose, got: {clean:?}"
);
assert!(
!clean.contains("\\begin"),
"LaTeX \\begin should not leak into prose, got: {clean:?}"
);
}
let all_text: String = ranges
.iter()
.map(|r| r.extract_text(text))
.collect::<Vec<_>>()
.join(" ");
assert!(
all_text.contains("We have the equation"),
"Prose before display math should be extracted, got: {all_text:?}"
);
assert!(
all_text.contains("completes the proof"),
"Prose after display math should be extracted, got: {all_text:?}"
);
Ok(())
}
/// For `\em{...}` inside a paragraph, the cleaned text must contain the
/// emphasized word and surrounding prose but neither the command name nor
/// either brace.
#[test]
fn test_forester_em_command_name_not_leaked() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{This has \em{emphasized} words.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let range = ranges
.iter()
.find(|r| {
let t = &text[r.start_byte..r.end_byte];
t.contains("emphasized")
})
.expect("Should find range containing 'emphasized'");
let clean = range.extract_text(text);
assert!(
!clean.contains("\\em"),
"Command name \\em should not appear in clean text, got: {clean:?}"
);
assert!(
clean.contains("emphasized"),
"Word 'emphasized' should be in clean text, got: {clean:?}"
);
assert!(
clean.contains("This has"),
"Surrounding prose should be in clean text, got: {clean:?}"
);
assert!(
!clean.contains('{'),
"Opening brace should not leak into clean text, got: {clean:?}"
);
assert!(
!clean.contains('}'),
"Closing brace should not leak into clean text, got: {clean:?}"
);
Ok(())
}
/// The argument of an unknown inline macro (`\cf{...}`) inside a list item
/// must be absent from the cleaned text; the prose on both sides is kept.
#[test]
fn test_unknown_inline_macro_excluded() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\li{The carrier \cf{Fin A.n} is important.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
for range in &ranges {
let clean = range.extract_text(text);
assert!(
!clean.contains("Fin"),
"\\cf content should be excluded, got: {clean:?}"
);
}
let all_text: String = ranges
.iter()
.map(|r| r.extract_text(text))
.collect::<Vec<_>>()
.join(" ");
assert!(
all_text.contains("The carrier"),
"Prose before \\cf, got: {all_text:?}"
);
assert!(
all_text.contains("is important"),
"Prose after \\cf, got: {all_text:?}"
);
Ok(())
}
/// Top-level display math containing escaped braces (`\{`, `\}`) must be
/// excluded as a whole: no LaTeX command from inside it may leak, and the
/// paragraphs before and after are still extracted.
#[test]
fn test_display_math_escaped_braces_top_level() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{Consider the structure:}
##{
U = \{A, B\} \quad I = \{\texttt{taller} \mapsto \{\langle A, B\rangle\}\}
}
\p{Is it a model?}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
for range in &ranges {
let clean = range.extract_text(text);
assert!(
!clean.contains("\\texttt"),
"\\texttt should not leak into prose, got: {clean:?}"
);
assert!(
!clean.contains("\\mapsto"),
"\\mapsto should not leak into prose, got: {clean:?}"
);
assert!(
!clean.contains("\\langle"),
"\\langle should not leak into prose, got: {clean:?}"
);
}
let all_text: String = ranges
.iter()
.map(|r| r.extract_text(text))
.collect::<Vec<_>>()
.join(" ");
assert!(
all_text.contains("Consider the structure"),
"Prose before math should be extracted, got: {all_text:?}"
);
assert!(
all_text.contains("Is it a model"),
"Prose after math should be extracted, got: {all_text:?}"
);
Ok(())
}
/// A top-level display-math `align*` block with escaped braces must not leak
/// `\begin`, `\end` or `\mapsto`; both surrounding paragraphs are kept.
#[test]
fn test_display_math_align_top_level() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{Define the interpretation:}
##{
\begin{align*}
I &= \{a \mapsto \alpha\} \\
I &= \{f(\alpha) \mapsto \beta\}
\end{align*}
}
\p{Evaluate the terms.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
for range in &ranges {
let clean = range.extract_text(text);
assert!(
!clean.contains("\\begin"),
"\\begin should not leak, got: {clean:?}"
);
assert!(
!clean.contains("\\end"),
"\\end should not leak, got: {clean:?}"
);
assert!(
!clean.contains("\\mapsto"),
"\\mapsto should not leak, got: {clean:?}"
);
}
let all_text: String = ranges
.iter()
.map(|r| r.extract_text(text))
.collect::<Vec<_>>()
.join(" ");
assert!(
all_text.contains("Define the interpretation"),
"Got: {all_text:?}"
);
assert!(all_text.contains("Evaluate the terms"), "Got: {all_text:?}");
Ok(())
}
/// Display math with escaped braces inside a list item must not leak LaTeX
/// commands, and the prose on both sides of it must survive.
#[test]
fn test_display_math_escaped_braces_inside_li() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\li{
If we change the interpretation to
##{
I = \{\texttt{taller} \mapsto \{\langle A, B\rangle\}\}
}
is the structure now a model?
}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
for range in &ranges {
let clean = range.extract_text(text);
assert!(
!clean.contains("\\texttt"),
"\\texttt should not leak, got: {clean:?}"
);
assert!(
!clean.contains("\\mapsto"),
"\\mapsto should not leak, got: {clean:?}"
);
}
let all_text: String = ranges
.iter()
.map(|r| r.extract_text(text))
.collect::<Vec<_>>()
.join(" ");
assert!(
all_text.contains("change the interpretation"),
"Prose before math in \\li, got: {all_text:?}"
);
assert!(
all_text.contains("is the structure now a model"),
"Prose after math in \\li, got: {all_text:?}"
);
Ok(())
}
/// Block commands (`\p`, and `\li` inside `\ol`) wrapped in an unknown macro
/// must all be extracted.
#[test]
fn test_unknown_macro_wrapping_blocks() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\solution{
\p{As a reminder we are working with the axiom.}
\ol{
\li{First item.}
\li{Second item.}
}
}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted
.iter()
.any(|t| t.contains("working with the axiom")),
"\\p inside \\solution should be extracted, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("First item")),
"\\li inside \\solution should be extracted, got: {extracted:?}"
);
assert!(
extracted.iter().any(|t| t.contains("Second item")),
"\\li inside \\solution should be extracted, got: {extracted:?}"
);
Ok(())
}
/// Several inline-math spans in one list item must all be removed from the
/// cleaned text, while the words between them are kept.
#[test]
fn test_multiple_inline_math_in_li() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\li{#{p(a)} evaluates to #{\top} because #{a = \alpha}.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
for range in &ranges {
let clean = range.extract_text(text);
assert!(
!clean.contains("p(a)"),
"Inline math p(a) should be excluded, got: {clean:?}"
);
assert!(
!clean.contains("\\top"),
"Inline math \\top should be excluded, got: {clean:?}"
);
assert!(
!clean.contains("\\alpha"),
"Inline math \\alpha should be excluded, got: {clean:?}"
);
}
let all_text: String = ranges
.iter()
.map(|r| r.extract_text(text))
.collect::<Vec<_>>()
.join(" ");
assert!(
all_text.contains("evaluates to"),
"Prose between math should be extracted, got: {all_text:?}"
);
assert!(
all_text.contains("because"),
"Prose between math should be extracted, got: {all_text:?}"
);
Ok(())
}
/// No returned range may clean down to only whitespace: ranges that are
/// entirely covered by exclusions have to be filtered out.
#[test]
fn test_fully_excluded_ranges_filtered() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{Consider:}
##{
U = \{A, B\}
}
\p{Done.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
for range in &ranges {
let clean = range.extract_text(text);
let trimmed = clean.trim();
assert!(
!trimmed.is_empty(),
"No all-blank ranges should remain, got range [{}, {}) with {} exclusions",
range.start_byte,
range.end_byte,
range.exclusions.len()
);
}
Ok(())
}
/// Parses `source` with the Forester grammar and fails the test when the
/// root node reports an error, printing the s-expression of the tree and the
/// source. Panics via `unwrap` if the language cannot be set or parsing
/// yields no tree.
fn assert_no_errors(source: &str) {
let language: tree_sitter::Language = crate::forester_ts::LANGUAGE.into();
let mut parser = tree_sitter::Parser::new();
parser.set_language(&language).unwrap();
let tree = parser.parse(source, None).unwrap();
let root = tree.root_node();
assert!(
!root.has_error(),
"Parse tree has ERROR/MISSING nodes:\n{}\nSource:\n{source}",
root.to_sexp()
);
}
/// Inline math containing `\ ` and escaped braces must parse without errors.
#[test]
fn test_math_escape_inline_braces() {
assert_no_errors(r"#{\ \mathcal F = \{W, R\} }");
}
/// Display math containing the `\\` row separator and escaped braces must
/// parse without errors.
#[test]
fn test_math_escape_display_row_separator() {
assert_no_errors(
r"##{
\begin{align*}
x &= \{a, b\} \\
y &= \{c, d\}
\end{align*}
}",
);
}
/// A `#` that does not start math (here inside a URL fragment) must parse
/// without errors.
#[test]
fn test_bare_hash_in_text() {
assert_no_errors(r"\p{See https://q.uiver.app/#q=WzAsMl0= for details.}");
}
/// Inline math with escaped braces must be removed from the cleaned text in
/// full, while the prose before and after it is kept.
#[test]
fn test_math_escape_inline_extraction() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{The family #{\mathcal F = \{W, R\} } is coherent.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
for range in &ranges {
let clean = range.extract_text(text);
assert!(
!clean.contains("\\mathcal"),
"Math content should be excluded, got: {clean:?}"
);
}
let all_text: String = ranges
.iter()
.map(|r| r.extract_text(text))
.collect::<Vec<_>>()
.join(" ");
assert!(
all_text.contains("The family"),
"Prose before math, got: {all_text:?}"
);
assert!(
all_text.contains("is coherent"),
"Prose after math, got: {all_text:?}"
);
Ok(())
}
/// For `\strong{...}` and `\em{...}` inside a list item, no brace may remain
/// in any cleaned text, while the words inside both commands are kept.
#[test]
fn test_inline_command_braces_excluded() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\li{
\strong{Explanation}: We know this since the \em{necessarily} modality.
}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
for range in &ranges {
let clean = range.extract_text(text);
assert!(
!clean.contains('{'),
"Opening brace should not leak, got: {clean:?}"
);
assert!(
!clean.contains('}'),
"Closing brace should not leak, got: {clean:?}"
);
}
let all_text: String = ranges
.iter()
.map(|r| r.extract_text(text))
.collect::<Vec<_>>()
.join(" ");
assert!(
all_text.contains("Explanation"),
"Content inside \\strong should be extracted, got: {all_text:?}"
);
assert!(
all_text.contains("necessarily"),
"Content inside \\em should be extracted, got: {all_text:?}"
);
Ok(())
}
/// Display math using the `\\` row separator inside a paragraph must not
/// leak into the cleaned text; the prose before and after it is kept.
#[test]
fn test_display_math_row_separator_extraction() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{We have
##{
a &= b \\
c &= \{d, e\}
}
so the result follows.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
for range in &ranges {
let clean = range.extract_text(text);
assert!(
!clean.contains("&="),
"Math content should not leak, got: {clean:?}"
);
}
let all_text: String = ranges
.iter()
.map(|r| r.extract_text(text))
.collect::<Vec<_>>()
.join(" ");
assert!(
all_text.contains("We have"),
"Prose before math, got: {all_text:?}"
);
assert!(
all_text.contains("result follows"),
"Prose after math, got: {all_text:?}"
);
Ok(())
}
/// A markdown-style `[alias](target)` link inside a paragraph must parse
/// without errors.
#[test]
fn test_markdown_link_parse() {
assert_no_errors(r"\p{See [frame](006j) for details.}");
}
/// For a markdown link, the alias must be kept as prose while the target and
/// the bracket syntax are removed; surrounding prose is unaffected.
#[test]
fn test_markdown_link_alias_extracted() -> Result<()> {
let mut extractor = forester_extractor()?;
let text = r"\p{See [frame](006j) for details.}";
let ranges = extractor.extract(text, "forester", &LatexExtras::default())?;
let all_text: String = ranges
.iter()
.map(|r| r.extract_text(text))
.collect::<Vec<_>>()
.join(" ");
assert!(
!all_text.contains("006j"),
"Link target should be excluded, got: {all_text:?}"
);
assert!(
!all_text.contains('['),
"Link bracket syntax should be excluded, got: {all_text:?}"
);
assert!(
all_text.contains("See"),
"Prose before link, got: {all_text:?}"
);
assert!(
all_text.contains("frame"),
"Link alias should be extracted as prose, got: {all_text:?}"
);
assert!(
all_text.contains("for details"),
"Prose after link, got: {all_text:?}"
);
Ok(())
}
/// A multi-word link alias (`De Morgan's laws`) must appear intact in the
/// extracted text, with the address `000g` dropped and neighbouring prose kept.
#[test]
fn test_markdown_link_alias_merged_into_prose() -> Result<()> {
    let mut extractor = forester_extractor()?;
    let input = r"\p{apply [De Morgan's laws](000g) to push}";
    let extras = LatexExtras::default();
    let ranges = extractor.extract(input, "forester", &extras)?;

    let pieces: Vec<_> = ranges.iter().map(|range| range.extract_text(input)).collect();
    let all_text: String = pieces.join(" ");

    assert!(
        all_text.contains("De Morgan's laws"),
        "Link alias should be merged into prose, got: {all_text:?}"
    );
    assert!(
        !all_text.contains("000g"),
        "Link address should be excluded, got: {all_text:?}"
    );
    assert!(
        all_text.contains("apply"),
        "Prose before link, got: {all_text:?}"
    );
    assert!(
        all_text.contains("to push"),
        "Prose after link, got: {all_text:?}"
    );
    Ok(())
}
/// A `\codeblock` whose body holds Lean macro syntax sits between two
/// paragraphs: none of the code may reach the extracted text, and both
/// paragraphs must still be extracted.
#[test]
fn test_codeblock_with_errors_no_leakage() -> Result<()> {
    let mut extractor = forester_extractor()?;
    let input = r"\p{Here is a recursor example.}
\codeblock{lean}{
macro_rules
| `(ex{ $n:num }) => `(Expr.const $n)
| `(ex{ $x:ident }) => pure $x
}
\p{This defines the syntax.}";
    let extras = LatexExtras::default();
    let ranges = extractor.extract(input, "forester", &extras)?;

    let pieces: Vec<String> = ranges
        .iter()
        .map(|range| range.extract_text(input).into_owned())
        .collect();
    let all_clean: String = pieces.join(" ");

    // Code fragments must be absent.
    assert!(
        !all_clean.contains("macro_rules"),
        "Code should not leak from \\codeblock, got: {all_clean:?}"
    );
    assert!(
        !all_clean.contains("Expr.const"),
        "Code should not leak from \\codeblock, got: {all_clean:?}"
    );
    // Prose on either side of the code block must be present.
    assert!(
        all_clean.contains("recursor example"),
        "Prose should be extracted, got: {all_clean:?}"
    );
    assert!(
        all_clean.contains("defines the syntax"),
        "Prose should be extracted, got: {all_clean:?}"
    );
    Ok(())
}
/// The argument of an inline `\code{...}` must be left out of the extracted
/// text, while the prose before and after it is kept.
#[test]
fn test_inline_code_content_excluded() -> Result<()> {
    let mut extractor = forester_extractor()?;
    // The misspellings are matched verbatim by the assertions below; keep them.
    let input = r"\p{Call \code{teh_function} to begin the proccess.}";
    let extras = LatexExtras::default();
    let ranges = extractor.extract(input, "forester", &extras)?;

    let pieces: Vec<String> = ranges
        .iter()
        .map(|range| range.extract_text(input).into_owned())
        .collect();
    let all_clean: String = pieces.join(" ");

    assert!(
        !all_clean.contains("teh_function"),
        "Inline \\code content should not be checked, got: {all_clean:?}"
    );
    assert!(
        all_clean.contains("Call"),
        "Prose before \\code should be extracted, got: {all_clean:?}"
    );
    assert!(
        all_clean.contains("proccess"),
        "Prose after \\code should be extracted, got: {all_clean:?}"
    );
    Ok(())
}
/// Two `\p` blocks separated by display math, where the first has no closing
/// punctuation, must come back as exactly one range that holds both pieces of
/// prose and none of the math.
#[test]
fn test_prose_block_continuation_merges() -> Result<()> {
    let mut extractor = forester_extractor()?;
    let input = r"\p{Here is something} ##{x = 5} \p{continuation of the idea.}";
    let extras = LatexExtras::default();
    let ranges = extractor.extract(input, "forester", &extras)?;

    let range_count = ranges.len();
    assert_eq!(
        range_count,
        1,
        "continuation should merge, got: {ranges:?}"
    );

    let merged = ranges[0].extract_text(input);
    assert!(merged.contains("Here is something"), "got: {merged:?}");
    assert!(
        merged.contains("continuation of the idea"),
        "got: {merged:?}"
    );
    assert!(
        !merged.contains("x = 5"),
        "math stays excluded, got: {merged:?}"
    );
    Ok(())
}
/// Two adjacent `\p` blocks that each hold a complete sentence must yield two
/// separate ranges.
#[test]
fn test_prose_blocks_not_merged_when_separate_sentences() -> Result<()> {
    let mut extractor = forester_extractor()?;
    let input = r"\p{First complete sentence.} \p{Second complete sentence.}";
    let extras = LatexExtras::default();
    let ranges = extractor.extract(input, "forester", &extras)?;

    let range_count = ranges.len();
    assert_eq!(
        range_count,
        2,
        "distinct sentences stay separate, got: {ranges:?}"
    );
    Ok(())
}
/// Two complete-sentence `\p` blocks wrapped in `% lang-check-begin block` /
/// `% lang-check-end` comment directives must yield a single range.
#[test]
fn test_block_directive_forces_merge() -> Result<()> {
    let mut extractor = forester_extractor()?;
    let input = "% lang-check-begin block\n\\p{First complete sentence.}\n\\p{Second complete sentence.}\n% lang-check-end";
    let extras = LatexExtras::default();
    let ranges = extractor.extract(input, "forester", &extras)?;

    let range_count = ranges.len();
    assert_eq!(
        range_count,
        1,
        "block directive forces one merged block, got: {ranges:?}"
    );
    Ok(())
}
/// The bracketed identifier of `\subtree[006s]{...}` must not appear in the
/// extracted text; the nested `\title` and `\p` contents must.
#[test]
fn test_subtree_bracket_id_excluded() -> Result<()> {
    let mut extractor = forester_extractor()?;
    let input = r"\subtree[006s]{
\taxon{Definition}
\title{Local truth}
\p{Some prose about local truth.}
}";
    let extras = LatexExtras::default();
    let ranges = extractor.extract(input, "forester", &extras)?;

    let pieces: Vec<String> = ranges
        .iter()
        .map(|range| range.extract_text(input).into_owned())
        .collect();
    let all_clean: String = pieces.join(" ");

    assert!(
        !all_clean.contains("006s"),
        "Subtree ID should not appear in prose, got: {all_clean:?}"
    );
    // Capitalised match comes from the title, lowercase from the paragraph.
    assert!(
        all_clean.contains("Local truth"),
        "Title should be extracted, got: {all_clean:?}"
    );
    assert!(
        all_clean.contains("local truth"),
        "Paragraph should be extracted, got: {all_clean:?}"
    );
    Ok(())
}
/// A parenthesised abbreviation inside `\title{...}` must survive extraction
/// together with the rest of the title text.
#[test]
fn test_paren_group_in_title_preserved() -> Result<()> {
    let mut extractor = forester_extractor()?;
    let input = r"\title{Negation Normal Form (NNF)}";
    let extras = LatexExtras::default();
    let ranges = extractor.extract(input, "forester", &extras)?;

    let pieces: Vec<String> = ranges
        .iter()
        .map(|range| range.extract_text(input).into_owned())
        .collect();
    let all_text: String = pieces.join(" ");

    assert!(
        all_text.contains("(NNF)"),
        "Parenthesised abbreviation should be preserved, got: {all_text:?}"
    );
    assert!(
        all_text.contains("Negation Normal Form"),
        "Title text should be extracted, got: {all_text:?}"
    );
    Ok(())
}
/// A parenthesised abbreviation in the middle of a `\p{...}` paragraph must be
/// kept, and the text following it must still be extracted.
#[test]
fn test_paren_group_in_paragraph_preserved() -> Result<()> {
    let mut extractor = forester_extractor()?;
    let input = r"\p{A formula is in Negation Normal Form (NNF) if negation is only applied to literals.}";
    let extras = LatexExtras::default();
    let ranges = extractor.extract(input, "forester", &extras)?;

    let pieces: Vec<String> = ranges
        .iter()
        .map(|range| range.extract_text(input).into_owned())
        .collect();
    let all_text: String = pieces.join(" ");

    assert!(
        all_text.contains("(NNF)"),
        "Parenthesised abbreviation should be preserved, got: {all_text:?}"
    );
    assert!(
        all_text.contains("if negation"),
        "Text after parens should be extracted, got: {all_text:?}"
    );
    Ok(())
}
/// A link alias wrapped in an inline command (`[\em{some text}](...)`): the
/// inner words are extracted, the `\em` command name and link target are not.
#[test]
fn test_markdown_link_inline_command_stripped() -> Result<()> {
    let mut extractor = forester_extractor()?;
    let input = r"\p{See [\em{some text}](link-to-ignore) for details.}";
    let extras = LatexExtras::default();
    let ranges = extractor.extract(input, "forester", &extras)?;

    let pieces: Vec<String> = ranges
        .iter()
        .map(|range| range.extract_text(input).into_owned())
        .collect();
    let all_text: String = pieces.join(" ");

    assert!(
        all_text.contains("some text"),
        "Inner text of inline command in link alias should be prose, got: {all_text:?}"
    );
    assert!(
        !all_text.contains("\\em"),
        "Command name should be stripped, got: {all_text:?}"
    );
    assert!(
        !all_text.contains("link-to-ignore"),
        "Link target should be excluded, got: {all_text:?}"
    );
    Ok(())
}
/// A link alias wrapped in two nested inline commands (`\strong{\em{...}}`):
/// the innermost word is extracted and neither command name leaks.
#[test]
fn test_markdown_link_nested_commands_stripped() -> Result<()> {
    let mut extractor = forester_extractor()?;
    let input = r"\p{See [\strong{\em{nested}}](addr) here.}";
    let extras = LatexExtras::default();
    let ranges = extractor.extract(input, "forester", &extras)?;

    let pieces: Vec<String> = ranges
        .iter()
        .map(|range| range.extract_text(input).into_owned())
        .collect();
    let all_text: String = pieces.join(" ");

    assert!(
        all_text.contains("nested"),
        "Deeply nested text should be extracted, got: {all_text:?}"
    );
    assert!(
        !all_text.contains("\\strong"),
        "Outer command should be stripped, got: {all_text:?}"
    );
    assert!(
        !all_text.contains("\\em"),
        "Inner command should be stripped, got: {all_text:?}"
    );
    Ok(())
}
/// A link whose alias is wrapped in `\em` must not split its paragraph: exactly
/// one range is expected, holding the text before, inside and after the link.
#[test]
fn test_markdown_link_with_command_single_range() -> Result<()> {
    let mut extractor = forester_extractor()?;
    let input = r"\p{formulas are in [\em{Negation Normal Form}](000e) which in short}";
    let extras = LatexExtras::default();
    let ranges = extractor.extract(input, "forester", &extras)?;

    let range_count = ranges.len();
    assert_eq!(
        range_count,
        1,
        "Link with inline command should not split the paragraph, got {} ranges",
        range_count
    );

    // Safe to index: the assertion above guarantees one element.
    let all_text = ranges[0].extract_text(input);
    assert!(
        all_text.contains("Negation Normal Form"),
        "Link alias text should be present, got: {all_text:?}"
    );
    assert!(
        all_text.contains("formulas are in"),
        "Text before link, got: {all_text:?}"
    );
    assert!(
        all_text.contains("which in short"),
        "Text after link, got: {all_text:?}"
    );
    Ok(())
}
}