use tree_sitter::Node;
use super::ProseRange;
use super::latex::{self, LatexExtras};
/// Blanks out every R code chunk in a Sweave document.
///
/// A chunk begins at a line accepted by `is_chunk_start` (`<<...>>=`) and runs
/// through the first following line accepted by `is_chunk_end` (`@`). If no
/// end marker is found, the chunk extends to the end of the input. Every byte
/// of the header, body and end-marker lines is replaced with an ASCII space,
/// while `\n` bytes are left alone, so the result has exactly the same byte
/// length and line structure as `text` and byte offsets remain comparable.
fn preprocess(text: &str) -> String {
    // Work on an owned byte buffer so no `unsafe` is needed to edit in place.
    let mut bytes = text.as_bytes().to_vec();
    let len = bytes.len();

    let mut pos = 0;
    let mut in_chunk = false;
    while pos < len {
        let line_end = memchr_newline(&bytes, pos);
        let line = &bytes[pos..line_end];

        let blank_line = if in_chunk {
            // The end marker itself is blanked too, then prose resumes.
            in_chunk = !is_chunk_end(line);
            true
        } else {
            in_chunk = is_chunk_start(line);
            in_chunk
        };

        if blank_line {
            // `pos..line_end` never contains a newline, so a plain fill
            // preserves the line structure.
            bytes[pos..line_end].fill(b' ');
        }

        // Step over the newline; at end of input this simply ends the loop.
        pos = line_end + 1;
    }

    // Only whole lines are overwritten, and only with ASCII spaces, so no
    // multi-byte sequence can be left half-replaced.
    String::from_utf8(bytes).expect("blanking whole lines with ASCII spaces keeps valid UTF-8")
}
/// Returns the index of the first `\n` in `bytes` at or after `start`.
///
/// When there is no newline from `start` onwards the result is `bytes.len()`.
/// A `start` beyond the end of the slice is handed back unchanged.
fn memchr_newline(bytes: &[u8], start: usize) -> usize {
    match bytes.get(start..) {
        Some(tail) => tail
            .iter()
            .position(|&byte| byte == b'\n')
            .map_or(bytes.len(), |offset| start + offset),
        // `start` is past the end: nothing to scan.
        None => start,
    }
}
/// Reports whether `line` (without its trailing `\n`) is a Sweave chunk header.
///
/// A header starts with `<<`, and the first `>>=` after that opening marker
/// may only be followed by spaces, tabs or carriage returns. Anything between
/// `<<` and `>>=` (chunk name, options) is accepted, including nothing at all.
fn is_chunk_start(line: &[u8]) -> bool {
    let after_open = match line.strip_prefix(b"<<") {
        Some(rest) => rest,
        None => return false,
    };

    // Only the first `>>=` counts; whatever follows it must be blank.
    match after_open.windows(3).position(|window| window == b">>=") {
        Some(close) => after_open[close + 3..]
            .iter()
            .all(|&b| matches!(b, b' ' | b'\t' | b'\r')),
        None => false,
    }
}
/// Reports whether `line` (without its trailing `\n`) terminates a code chunk.
///
/// The line must begin with `@` and contain nothing afterwards except spaces,
/// tabs or carriage returns.
fn is_chunk_end(line: &[u8]) -> bool {
    match line.split_first() {
        Some((&b'@', trailing)) => trailing
            .iter()
            .all(|byte| matches!(*byte, b' ' | b'\t' | b'\r')),
        _ => false,
    }
}
/// Extracts prose ranges from a Sweave document.
///
/// The R code chunks are first blanked out by `preprocess`, the blanked text
/// is parsed again with the LaTeX grammar, and the LaTeX extractor runs on
/// that fresh tree. `root` is accepted but deliberately unused.
///
/// # Panics
///
/// Panics if the LaTeX grammar cannot be installed on the parser or if the
/// parser yields no tree for the preprocessed text.
pub fn extract(text: &str, root: Node, latex_extras: &LatexExtras) -> Vec<ProseRange> {
    // NOTE(review): presumably the caller's tree was built from the raw text,
    // chunks included, which is why it is discarded here (confirm with caller).
    let _ = root;

    let blanked = preprocess(text);

    let latex_language = tree_sitter::Language::from(codebook_tree_sitter_latex::LANGUAGE);
    let mut latex_parser = tree_sitter::Parser::new();
    latex_parser
        .set_language(&latex_language)
        .expect("latex grammar");

    let blanked_tree = latex_parser
        .parse(&blanked, None)
        .expect("parse preprocessed sweave text");

    latex::extract(&blanked, blanked_tree.root_node(), latex_extras)
}
// Unit tests for the chunk-blanking preprocessor and its line classifiers,
// followed by end-to-end extraction tests that go through `ProseExtractor`.
#[cfg(test)]
mod tests {
use super::*;
use crate::prose::ProseExtractor;
use anyhow::Result;
// A single chunk: header and body disappear, surrounding prose stays, and the
// byte length of the text is unchanged.
#[test]
fn test_preprocess_blanks_r_chunks() {
let text = "before\n<<setup>>=\nlibrary(x)\n@\nafter\n";
let result = preprocess(text);
assert!(result.contains("before"));
assert!(result.contains("after"));
assert!(!result.contains("library"));
assert!(!result.contains("<<setup>>="));
assert_eq!(text.len(), result.len(), "byte length must be preserved");
}
// Two separate chunks are both blanked; the prose between them survives.
#[test]
fn test_preprocess_multiple_chunks() {
let text = "text1\n<<a>>=\ncode1\n@\ntext2\n<<b>>=\ncode2\n@\ntext3\n";
let result = preprocess(text);
assert!(result.contains("text1"));
assert!(result.contains("text2"));
assert!(result.contains("text3"));
assert!(!result.contains("code1"));
assert!(!result.contains("code2"));
}
// A header carrying a name and comma-separated options is still recognised,
// and the options themselves are blanked along with the code.
#[test]
fn test_preprocess_chunk_with_options() {
let text = "start\n<<my-chunk, echo=FALSE, fig=TRUE>>=\nplot(x)\n@\nend\n";
let result = preprocess(text);
assert!(result.contains("start"));
assert!(result.contains("end"));
assert!(!result.contains("plot"));
assert!(!result.contains("echo=FALSE"));
}
// Blanking must not add or remove any newline.
#[test]
fn test_preprocess_preserves_newlines() {
let text = "line1\n<<a>>=\ncode\n@\nline2\n";
let result = preprocess(text);
let orig_newlines = text.chars().filter(|&c| c == '\n').count();
let result_newlines = result.chars().filter(|&c| c == '\n').count();
assert_eq!(orig_newlines, result_newlines, "newline count must match");
}
// Header recognition: empty name, plain name, options and trailing blank are
// accepted; an unclosed `<<`, ordinary text and the empty line are rejected.
#[test]
fn test_is_chunk_start() {
assert!(is_chunk_start(b"<<>>="));
assert!(is_chunk_start(b"<<setup>>="));
assert!(is_chunk_start(b"<<my-chunk, echo=FALSE>>="));
assert!(is_chunk_start(b"<<a>>= "));
assert!(!is_chunk_start(b"<< not closed"));
assert!(!is_chunk_start(b"regular text"));
assert!(!is_chunk_start(b""));
}
// End-marker recognition: a lone `@`, optionally followed by blanks, is
// accepted; `@` followed by text, a non-leading `@` and the empty line are not.
#[test]
fn test_is_chunk_end() {
assert!(is_chunk_end(b"@"));
assert!(is_chunk_end(b"@ "));
assert!(is_chunk_end(b"@\t"));
assert!(!is_chunk_end(b"@ text after"));
assert!(!is_chunk_end(b"not @"));
assert!(!is_chunk_end(b""));
}
// Text with no chunk header at all must come back byte-for-byte identical.
#[test]
fn test_preprocess_no_chunks() {
let text = "Just regular LaTeX content.\n\\section{Hello}\nSome text.\n";
let result = preprocess(text);
assert_eq!(text, result, "text without R chunks should be unchanged");
}
// Builds a `ProseExtractor` from whatever language `resolve_ts_language`
// returns for the "sweave" identifier.
fn sweave_extractor() -> Result<ProseExtractor> {
let language = crate::languages::resolve_ts_language("sweave");
ProseExtractor::new(language)
}
// End to end: prose on both sides of an R chunk is reported. The returned
// ranges are used to slice the ORIGINAL text, not the preprocessed copy.
#[test]
fn test_sweave_basic_extraction() -> Result<()> {
let mut extractor = sweave_extractor()?;
let text = r"\documentclass{article}
\begin{document}
This is a paragraph in a Sweave document.
<<setup, echo=FALSE>>=
library(ggplot2)
x <- rnorm(100)
@
Another paragraph after the R chunk.
\end{document}
";
let ranges = extractor.extract(text, "sweave", &LatexExtras::default())?;
let extracted: Vec<&str> = ranges
.iter()
.map(|r| &text[r.start_byte..r.end_byte])
.collect();
assert!(
extracted
.iter()
.any(|t| t.contains("paragraph in a Sweave")),
"Should extract prose text before R chunk, got: {extracted:?}"
);
assert!(
extracted
.iter()
.any(|t| t.contains("Another paragraph after")),
"Should extract prose text after R chunk, got: {extracted:?}"
);
Ok(())
}
// End to end: the R code inside a chunk never shows up in the concatenated
// prose, while the prose before and after the chunk does.
#[test]
fn test_sweave_r_chunk_excluded() -> Result<()> {
let mut extractor = sweave_extractor()?;
let text = r"\documentclass{article}
\begin{document}
Some prose before code.
<<my-chunk, fig=TRUE>>=
plot(1:10)
summary(lm(y ~ x))
@
Some prose after code.
\end{document}
";
let ranges = extractor.extract(text, "sweave", &LatexExtras::default())?;
let all_prose: String = ranges.iter().map(|r| r.extract_text(text)).collect();
assert!(
!all_prose.contains("plot(1:10)"),
"R code should NOT appear in prose, got: {all_prose:?}"
);
assert!(
!all_prose.contains("summary(lm"),
"R code should NOT appear in prose, got: {all_prose:?}"
);
assert!(
all_prose.contains("prose before code"),
"Prose before R chunk should be extracted, got: {all_prose:?}"
);
assert!(
all_prose.contains("prose after code"),
"Prose after R chunk should be extracted, got: {all_prose:?}"
);
Ok(())
}
}