use tree_sitter::Node;
use super::{ProseRange, shared};
/// Command names (spelled with their leading `@`) that are treated as
/// structural: a `command` node whose `command_name` text matches one of
/// these contributes nothing to the extracted prose.
const STRUCTURAL_COMMANDS: &[&str] = &[
    "@author",
    "@date",
    "@import",
    "@ref",
    "@tag",
    "@id",
    "@class",
];
/// Syntax-node kinds that are never descended into while collecting prose:
/// a node of one of these kinds is dropped together with its whole subtree.
const SKIP_KINDS: &[&str] = &[
    // Math.
    "inline_math",
    "display_math",
    // Code.
    "code_block",
    "code_span",
    // Non-prose markup.
    "comment",
    "command_name",
    "link_url",
];
/// Extracts the prose ranges of a parsed tinylang document.
///
/// Walks the syntax tree under `root`, gathering the byte span of every
/// prose-bearing `text` node, and then hands those spans to
/// [`shared::merge_ranges`] together with the tinylang-specific gap
/// callbacks ([`strip_tinylang_noise`] and [`collect_math_exclusions`]).
///
/// * `text` - the full source the tree was parsed from.
/// * `root` - the node to start the walk at.
pub fn extract(text: &str, root: Node) -> Vec<ProseRange> {
    let mut spans: Vec<(usize, usize)> = Vec::new();
    // Start with skipping disabled; the walk itself decides what to drop.
    collect_prose_nodes(root, text, false, &mut spans);

    shared::merge_ranges(&spans, text, strip_tinylang_noise, collect_math_exclusions)
}
/// Reports whether a `command` node names one of [`STRUCTURAL_COMMANDS`].
///
/// Only the first direct child of kind `command_name` is considered; its
/// source text is compared verbatim against the list. A node without such a
/// child is not structural.
fn is_structural_command(node: Node, text: &str) -> bool {
    let mut walker = node.walk();
    let name_node = node
        .children(&mut walker)
        .find(|child| child.kind() == "command_name");

    name_node.map_or(false, |found| {
        let name = &text[found.start_byte()..found.end_byte()];
        STRUCTURAL_COMMANDS.iter().any(|&known| known == name)
    })
}
/// Recursively gathers the byte spans of prose-bearing `text` nodes.
///
/// * Nodes whose kind is listed in [`SKIP_KINDS`] are dropped with their
///   whole subtree.
/// * A `command` node is dropped when `skip` is set or when it is a
///   structural command; otherwise its children are visited with skipping
///   turned off.
/// * A `text` node is a leaf for this walk: its non-empty `(start, end)` byte
///   span is appended to `out` unless `skip` is set.
/// * Every other node simply forwards `skip` to its children.
fn collect_prose_nodes(node: Node, text: &str, skip: bool, out: &mut Vec<(usize, usize)>) {
    let kind = node.kind();
    if SKIP_KINDS.iter().any(|&excluded| excluded == kind) {
        return;
    }

    match kind {
        "command" => {
            if skip || is_structural_command(node, text) {
                return;
            }
            let mut walker = node.walk();
            for arg in node.children(&mut walker) {
                collect_prose_nodes(arg, text, false, out);
            }
        }
        "text" => {
            if skip {
                return;
            }
            let span = (node.start_byte(), node.end_byte());
            // Zero-width nodes carry no prose.
            if span.0 < span.1 {
                out.push(span);
            }
        }
        _ => {
            let mut walker = node.walk();
            for descendant in node.children(&mut walker) {
                collect_prose_nodes(descendant, text, skip, out);
            }
        }
    }
}
/// Records the byte ranges of `$…$` and `$$…$$` math spans found in `gap`.
///
/// Each span is pushed onto `out` as a half-open `(start, end)` pair of byte
/// offsets, shifted by `gap_offset`, and includes its delimiters.
///
/// * Display math (`$$`) runs up to and including the next `$$`. When no
///   closing `$$` exists the span runs to the end of `gap`, so the range
///   always ends on a character boundary and never leaves a trailing byte
///   of the unterminated math outside the exclusion.
/// * Inline math (`$`) runs up to and including the next `$`, but stops in
///   front of a newline if one comes first (the newline is not part of the
///   span).
///
/// The scan is byte-based; `$` and `\n` are ASCII, so they can never match
/// inside a multi-byte UTF-8 sequence.
fn collect_math_exclusions(gap: &str, gap_offset: usize, out: &mut Vec<(usize, usize)>) {
    let bytes = gap.as_bytes();
    let len = bytes.len();
    let mut i = 0;
    while i < len {
        if bytes[i] != b'$' {
            i += 1;
            continue;
        }
        let exc_start = i;
        if bytes.get(i + 1) == Some(&b'$') {
            // Display math: look for the closing `$$` after the opener.
            let body_start = i + 2;
            let closing = bytes[body_start..]
                .windows(2)
                .position(|pair| pair[0] == b'$' && pair[1] == b'$');
            i = match closing {
                Some(rel) => body_start + rel + 2,
                // Unterminated: swallow the remainder of the gap.
                None => len,
            };
        } else {
            // Inline math: ends at the next `$`, or just before a newline.
            i += 1;
            while i < len && bytes[i] != b'$' && bytes[i] != b'\n' {
                i += 1;
            }
            if i < len && bytes[i] == b'$' {
                i += 1;
            }
        }
        out.push((gap_offset + exc_start, gap_offset + i));
    }
}
/// Returns `gap` with tinylang markup removed, leaving only what should be
/// read as ordinary text.
///
/// Rules, applied left to right:
///
/// * `$$…$$` display math becomes a single space. An unterminated `$$`
///   consumes the rest of `gap`, so no character of the math body leaks
///   into the result.
/// * `$…$` inline math and `` `…` `` code spans become a single space. The
///   scan for the closing delimiter does not stop at newlines; without a
///   closing delimiter the rest of `gap` is consumed.
/// * `@name` commands (an `@` followed by an ASCII letter, then ASCII
///   alphanumerics, `-` or `_`) are dropped, together with a directly
///   following `{…}` argument.
/// * `//` comments are replaced by a newline; the comment text up to (not
///   including) the line's own newline is dropped.
/// * `*`, `_` and `#` characters are dropped.
/// * Everything else is copied through unchanged.
fn strip_tinylang_noise(gap: &str) -> String {
    let chars: Vec<char> = gap.chars().collect();
    let n = chars.len();
    let mut result = String::with_capacity(gap.len());
    let mut i = 0;
    while i < n {
        let current = chars[i];
        let next = chars.get(i + 1).copied();
        match current {
            '$' if next == Some('$') => {
                i += 2;
                while i + 1 < n && !(chars[i] == '$' && chars[i + 1] == '$') {
                    i += 1;
                }
                // Step over the closing `$$`, or to the very end when the
                // math is unterminated.
                i = if i + 1 < n { i + 2 } else { n };
                result.push(' ');
            }
            '$' | '`' => {
                // The opening character doubles as the closing delimiter.
                i += 1;
                while i < n && chars[i] != current {
                    i += 1;
                }
                if i < n {
                    i += 1;
                }
                result.push(' ');
            }
            '@' if next.map_or(false, |c| c.is_ascii_alphabetic()) => {
                i += 1;
                while i < n
                    && (chars[i].is_ascii_alphanumeric() || chars[i] == '-' || chars[i] == '_')
                {
                    i += 1;
                }
                if i < n && chars[i] == '{' {
                    // NOTE(review): presumably returns the index just past
                    // the matching `}` — confirm against `shared`.
                    i = shared::skip_balanced_chars(&chars, i + 1, '{', '}');
                }
            }
            '/' if next == Some('/') => {
                result.push('\n');
                // Leave the terminating newline for the next iteration.
                while i < n && chars[i] != '\n' {
                    i += 1;
                }
            }
            '*' | '_' | '#' => i += 1,
            _ => {
                result.push(current);
                i += 1;
            }
        }
    }
    result
}
#[cfg(test)]
mod tests {
    use crate::prose::ProseExtractor;
    use crate::prose::latex::LatexExtras;
    use anyhow::Result;

    /// Runs the tinylang extractor over `text` and concatenates the text of
    /// every returned range, in order.
    fn extracted_prose(text: &str) -> Result<String> {
        let language: tree_sitter::Language = crate::tinylang_ts::LANGUAGE.into();
        let mut extractor = ProseExtractor::new(language)?;
        let ranges = extractor.extract(text, "tinylang", &LatexExtras::default())?;
        let joined = ranges
            .iter()
            .map(|range| range.extract_text(text))
            .collect::<String>();
        Ok(joined)
    }

    /// Plain text yields at least one range containing the sentence.
    #[test]
    fn test_tinylang_basic_extraction() -> Result<()> {
        let language: tree_sitter::Language = crate::tinylang_ts::LANGUAGE.into();
        let mut extractor = ProseExtractor::new(language)?;
        let text = "This is a simple sentence.\n";
        let ranges = extractor.extract(text, "tinylang", &LatexExtras::default())?;
        assert!(!ranges.is_empty(), "Should extract prose from plain text");
        let prose = ranges[0].extract_text(text);
        assert!(
            prose.contains("simple sentence"),
            "Prose should contain 'simple sentence', got: {:?}",
            prose
        );
        Ok(())
    }

    /// Fenced code is dropped while the surrounding prose survives.
    #[test]
    fn test_tinylang_code_excluded() -> Result<()> {
        let all_prose =
            extracted_prose("Before code.\n\n~~~\nfn main() {}\n~~~\n\nAfter code.\n")?;
        assert!(
            !all_prose.contains("fn main"),
            "Code block content should not appear in prose, got: {:?}",
            all_prose
        );
        assert!(
            all_prose.contains("Before code"),
            "Prose before code should be extracted, got: {:?}",
            all_prose
        );
        Ok(())
    }

    /// Arguments of structural commands are not treated as prose.
    #[test]
    fn test_tinylang_structural_commands_excluded() -> Result<()> {
        let all_prose =
            extracted_prose("@author{Jane Doe}\n@date{2025-01-01}\n\nSome prose text here.\n")?;
        assert!(
            !all_prose.contains("Jane Doe"),
            "Structural command args should not be in prose, got: {:?}",
            all_prose
        );
        assert!(
            all_prose.contains("prose text here"),
            "Regular prose should be extracted, got: {:?}",
            all_prose
        );
        Ok(())
    }

    /// Inline math is dropped while the surrounding prose survives.
    #[test]
    fn test_tinylang_math_excluded() -> Result<()> {
        let all_prose = extracted_prose("The formula $E = mc^2$ is famous.\n")?;
        assert!(
            !all_prose.contains("mc^2"),
            "Inline math should not be in prose, got: {:?}",
            all_prose
        );
        assert!(
            all_prose.contains("formula"),
            "Prose around math should be extracted, got: {:?}",
            all_prose
        );
        Ok(())
    }

    /// Line comments do not show up in the extracted prose.
    #[test]
    fn test_tinylang_comment_excluded() -> Result<()> {
        let all_prose = extracted_prose("Visible text.\n// This is a comment\nMore text.\n")?;
        assert!(
            !all_prose.contains("This is a comment"),
            "Comments should not be in prose, got: {:?}",
            all_prose
        );
        Ok(())
    }

    /// Arguments of non-structural commands are extracted as prose.
    #[test]
    fn test_tinylang_prose_command_included() -> Result<()> {
        let all_prose = extracted_prose("@title{My Great Document}\n\nSome text.\n")?;
        assert!(
            all_prose.contains("Great Document"),
            "Prose command args should be extracted, got: {:?}",
            all_prose
        );
        Ok(())
    }
}