use tree_sitter::Node;
use super::ProseRange;
const SKIP_DIRECTIVE_TYPES: &[&str] = &[
"code-block",
"code",
"sourcecode",
"math",
"raw",
"csv-table",
"include",
"image",
"figure",
"toctree",
"only",
"highlight",
"literalinclude",
];
pub fn extract(text: &str, root: Node) -> Vec<ProseRange> {
let mut ranges = Vec::new();
collect_prose_ranges(root, text, &mut ranges, false);
ranges
}
fn collect_prose_ranges(node: Node, text: &str, out: &mut Vec<ProseRange>, skip: bool) {
let kind = node.kind();
if kind == "paragraph" || kind == "title" {
if !skip {
let mut exclusions = Vec::new();
collect_exclusions(node, &mut exclusions);
out.push(ProseRange {
start_byte: node.start_byte(),
end_byte: node.end_byte(),
exclusions,
});
}
return;
}
if kind == "directive" {
let should_skip = is_skip_directive(node, text);
let mut cursor = node.walk();
for child in node.children(&mut cursor) {
collect_prose_ranges(child, text, out, should_skip);
}
return;
}
let mut cursor = node.walk();
for child in node.children(&mut cursor) {
collect_prose_ranges(child, text, out, skip);
}
}
fn is_skip_directive(node: Node, text: &str) -> bool {
let mut cursor = node.walk();
for child in node.children(&mut cursor) {
if child.kind() == "type" {
let name = &text[child.start_byte()..child.end_byte()];
return SKIP_DIRECTIVE_TYPES.contains(&name);
}
}
false
}
fn collect_exclusions(node: Node, out: &mut Vec<(usize, usize)>) {
let kind = node.kind();
if kind == "literal" || kind == "interpreted_text" {
out.push((node.start_byte(), node.end_byte()));
return;
}
let mut cursor = node.walk();
for child in node.children(&mut cursor) {
collect_exclusions(child, out);
}
}
#[cfg(test)]
mod tests {
use crate::prose::ProseExtractor;
use crate::prose::latex::LatexExtras;
use anyhow::Result;
fn rst_extractor() -> Result<ProseExtractor> {
let language: tree_sitter::Language = tree_sitter_rst::LANGUAGE.into();
ProseExtractor::new(language)
}
#[test]
fn test_rst_basic_extraction() -> Result<()> {
let mut extractor = rst_extractor()?;
let text = "My Title\n========\n\nThis is a paragraph.\n";
let ranges = extractor.extract(text, "rst", &LatexExtras::default())?;
let all_prose: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
all_prose.contains("My Title"),
"Title should be extracted, got: {all_prose:?}"
);
assert!(
all_prose.contains("This is a paragraph"),
"Paragraph should be extracted, got: {all_prose:?}"
);
Ok(())
}
#[test]
fn test_rst_code_block_excluded() -> Result<()> {
let mut extractor = rst_extractor()?;
let text =
"Some text.\n\n.. code-block:: python\n\n def hello():\n pass\n\nMore text.\n";
let ranges = extractor.extract(text, "rst", &LatexExtras::default())?;
let all_prose: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
all_prose.contains("Some text"),
"Paragraph before code should be extracted, got: {all_prose:?}"
);
assert!(
all_prose.contains("More text"),
"Paragraph after code should be extracted, got: {all_prose:?}"
);
assert!(
!all_prose.contains("def hello"),
"Code block content should not be in prose, got: {all_prose:?}"
);
Ok(())
}
#[test]
fn test_rst_math_excluded() -> Result<()> {
let mut extractor = rst_extractor()?;
let text = "Before math.\n\n.. math::\n\n E = mc^2\n\nAfter math.\n";
let ranges = extractor.extract(text, "rst", &LatexExtras::default())?;
let all_prose: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
all_prose.contains("Before math"),
"Paragraph before math should be extracted, got: {all_prose:?}"
);
assert!(
!all_prose.contains("mc^2"),
"Math directive content should not be in prose, got: {all_prose:?}"
);
Ok(())
}
#[test]
fn test_rst_inline_code_excluded() -> Result<()> {
let mut extractor = rst_extractor()?;
let text = "Use ``some_function()`` to do things.\n";
let ranges = extractor.extract(text, "rst", &LatexExtras::default())?;
let all_prose: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
all_prose.contains("Use"),
"Text around inline code should be extracted, got: {all_prose:?}"
);
assert!(
!all_prose.contains("some_function"),
"Inline code should be excluded, got: {all_prose:?}"
);
Ok(())
}
#[test]
fn test_rst_list_items_extracted() -> Result<()> {
let mut extractor = rst_extractor()?;
let text = "- First item\n- Second item\n";
let ranges = extractor.extract(text, "rst", &LatexExtras::default())?;
let all_prose: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
all_prose.contains("First item"),
"List items should be extracted, got: {all_prose:?}"
);
assert!(
all_prose.contains("Second item"),
"List items should be extracted, got: {all_prose:?}"
);
Ok(())
}
}