use julienne::{HtmlChunker, MarkdownChunker, XmlChunker};
/// Chunks the `fixtures/sample.md` fixture and checks that some chunk starts
/// with the `# Title` heading, that some chunk contains a `rust` fence opener,
/// and that every chunk's text equals the input slice at its reported byte range.
#[test]
fn markdown_chunker_preserves_headings_lists_and_fenced_code() {
    let source = include_str!("fixtures/sample.md");
    // NOTE(review): the arguments are presumably (size limit, overlap) — confirm
    // against `MarkdownChunker::new`.
    let splitter = MarkdownChunker::new(45, 15).unwrap();
    let chunks = splitter.split_chunks(source);

    let heading_leads_a_chunk = chunks.iter().any(|c| c.text.starts_with("# Title"));
    assert!(heading_leads_a_chunk);

    let fence_opener_present = chunks.iter().any(|c| c.text.contains("```rust"));
    assert!(fence_opener_present);

    // Offsets must index back into the original input, not a normalised copy.
    for chunk in &chunks {
        let span = chunk.start_byte..chunk.end_byte;
        assert_eq!(&source[span], chunk.text);
    }
}
/// Feeds a heading, a three-function fenced `rust` block and a tail paragraph
/// to a chunker built with `new(24, 0)`, then checks that the chunk holding
/// `fn alpha` starts with the opening fence, ends with the closing fence, and
/// matches the input at its reported byte range.
#[test]
fn markdown_chunker_preserves_large_fenced_blocks_as_units() {
    let source =
        "# Title\n\n```rust\nfn alpha() {}\nfn beta() {}\nfn gamma() {}\n```\n\nTail paragraph.";
    let splitter = MarkdownChunker::new(24, 0).unwrap();
    let chunks = splitter.split_chunks(source);

    let code_block = chunks
        .iter()
        .find(|candidate| candidate.text.contains("fn alpha"))
        .expect("fenced code block should be emitted");

    // Both fence markers must sit at the edges of the same chunk.
    let opens_with_fence = code_block.text.starts_with("```rust");
    assert!(opens_with_fence);
    let closes_with_fence = code_block.text.ends_with("```");
    assert!(closes_with_fence);

    let span = code_block.start_byte..code_block.end_byte;
    assert_eq!(&source[span], code_block.text);
}
/// Uses input whose `python` fence is never closed and checks that the chunk
/// containing the first `print` line starts at the fence opener, also contains
/// the second `print` line, and matches the input at its reported byte range.
#[test]
fn markdown_chunker_handles_unclosed_fenced_blocks_as_one_block() {
    let source = "# Title\n\n```python\nprint('alpha')\nprint('beta')\n";
    let splitter = MarkdownChunker::new(18, 0).unwrap();
    let chunks = splitter.split_chunks(source);

    let code_block = chunks
        .iter()
        .find(|candidate| candidate.text.contains("print('alpha')"))
        .expect("unclosed fenced code block should still be emitted");

    let opens_with_fence = code_block.text.starts_with("```python");
    assert!(opens_with_fence);
    // The second statement must travel in the same chunk as the first.
    let keeps_second_line = code_block.text.contains("print('beta')");
    assert!(keeps_second_line);

    let span = code_block.start_byte..code_block.end_byte;
    assert_eq!(&source[span], code_block.text);
}
/// Chunks a document whose headings and ordered-list items are separated by
/// single newlines only, then checks that each heading and each list item
/// appears in at least one chunk and that every chunk matches the input at its
/// reported byte range.
#[test]
fn markdown_chunker_splits_adjacent_headings_and_ordered_lists() {
    let source = "# One\nIntro paragraph.\n## Two\n1. first\n2. second\n## Three\nTail.";
    let splitter = MarkdownChunker::new(28, 0).unwrap();
    let chunks = splitter.split_chunks(source);

    // True when at least one emitted chunk contains `needle`.
    let appears_in_some_chunk =
        |needle: &str| chunks.iter().any(|chunk| chunk.text.contains(needle));

    assert!(appears_in_some_chunk("# One"));
    assert!(appears_in_some_chunk("## Two"));
    assert!(appears_in_some_chunk("1. first"));
    assert!(appears_in_some_chunk("2. second"));
    assert!(appears_in_some_chunk("## Three"));

    for chunk in &chunks {
        let span = chunk.start_byte..chunk.end_byte;
        assert_eq!(&source[span], chunk.text);
    }
}
/// Wraps a heading and paragraph in leading and trailing blank lines and checks
/// that the first chunk starts exactly at the heading, carries no newline at
/// either edge, still matches the input at its reported byte range, and
/// includes the paragraph text.
#[test]
fn markdown_chunker_trims_blank_lines_without_losing_source_offsets() {
    let source = "\n\n# Title\n\nParagraph with body.\n\n";
    let splitter = MarkdownChunker::new(30, 0).unwrap();
    let chunks = splitter.split_chunks(source);

    // Indexing panics (and so fails the test) if no chunk was produced.
    let first = &chunks[0];

    assert!(first.text.starts_with("# Title"));
    // The start offset must point at the heading in the untrimmed input.
    let heading_offset = source.find("# Title").unwrap();
    assert_eq!(first.start_byte, heading_offset);

    let leads_with_newline = first.text.starts_with('\n');
    assert!(!leads_with_newline);
    let trails_with_newline = first.text.ends_with('\n');
    assert!(!trails_with_newline);

    assert_eq!(&source[first.start_byte..first.end_byte], first.text);
    assert!(first.text.contains("Paragraph with body."));
}
/// Chunks the `fixtures/sample.html` fixture with `HtmlChunker` and checks that
/// an `<h1>` tag and the complete `<li>One</li>` element each appear inside
/// some chunk, and that every chunk matches the input at its reported byte range.
#[test]
fn html_chunker_uses_block_tag_boundaries() {
    let source = include_str!("fixtures/sample.html");
    let splitter = HtmlChunker::new(35, 10).unwrap();
    let chunks = splitter.split_chunks(source);

    let heading_tag_found = chunks.iter().any(|c| c.text.contains("<h1>"));
    assert!(heading_tag_found);

    // The list item must survive intact, opening and closing tag together.
    let list_item_intact = chunks.iter().any(|c| c.text.contains("<li>One</li>"));
    assert!(list_item_intact);

    for chunk in &chunks {
        let span = chunk.start_byte..chunk.end_byte;
        assert_eq!(&source[span], chunk.text);
    }
}
/// Chunks a single-line `<section>` document with `XmlChunker` and checks that
/// a `<table>` opener and the complete `<pre>code</pre>` element each appear
/// inside some chunk, and that every chunk matches the input at its reported
/// byte range.
#[test]
fn xml_chunker_uses_block_boundaries_without_extraction_scope() {
    let source =
        "<section><h1>Title</h1><table><tr><td>Cell</td></tr></table><pre>code</pre></section>";
    let splitter = XmlChunker::new(32, 8).unwrap();
    let chunks = splitter.split_chunks(source);

    // True when at least one emitted chunk contains `needle`.
    let appears_in_some_chunk =
        |needle: &str| chunks.iter().any(|chunk| chunk.text.contains(needle));

    assert!(appears_in_some_chunk("<table>"));
    assert!(appears_in_some_chunk("<pre>code</pre>"));

    for chunk in &chunks {
        let span = chunk.start_byte..chunk.end_byte;
        assert_eq!(&source[span], chunk.text);
    }
}
/// Splits four short paragraphs with `MarkdownChunker::new(16, 8)` via
/// `split_text` and checks that at least two chunks come back and that some
/// pair of consecutive chunks both contain `Beta`.
#[test]
fn structure_overlap_carries_previous_peer_only_when_it_fits() {
    let source = "Alpha.\n\nBeta.\n\nGamma.\n\nDelta.";
    // NOTE(review): the second argument is presumably the overlap budget —
    // confirm against `MarkdownChunker::new`.
    let splitter = MarkdownChunker::new(16, 8).unwrap();
    let pieces = splitter.split_text(source);

    assert!(pieces.len() >= 2);

    // Pair every piece with its immediate successor and look for a repeated "Beta".
    let beta_repeated_in_neighbours = pieces
        .iter()
        .zip(pieces.iter().skip(1))
        .any(|(earlier, later)| earlier.contains("Beta") && later.contains("Beta"));
    assert!(beta_repeated_in_neighbours);
}
/// Chunks the `fixtures/sample.rs` and `fixtures/sample.py` fixtures with
/// `CodeChunker` and checks that `fn add_one` shows up in a Rust chunk and
/// `class Greeter` shows up in a Python chunk. Compiled only when the `code`
/// feature is enabled.
#[cfg(feature = "code")]
#[test]
fn code_chunker_chunks_rust_and_python_ast_nodes() {
    use julienne::{CodeChunker, CodeLanguage};

    let rust_source = include_str!("fixtures/sample.rs");
    let python_source = include_str!("fixtures/sample.py");

    // Both languages are chunked before anything is asserted.
    let rust_chunker = CodeChunker::new(CodeLanguage::Rust, 80, 20).unwrap();
    let rust_chunks = rust_chunker.try_split_chunks(rust_source).unwrap();

    let python_chunker = CodeChunker::new(CodeLanguage::Python, 80, 20).unwrap();
    let python_chunks = python_chunker.try_split_chunks(python_source).unwrap();

    let rust_function_found = rust_chunks.iter().any(|c| c.text.contains("fn add_one"));
    assert!(rust_function_found);

    let python_class_found = python_chunks
        .iter()
        .any(|c| c.text.contains("class Greeter"));
    assert!(python_class_found);
}
/// Passes syntactically broken Rust to `CodeChunker::try_split_chunks` and
/// checks that the call returns `ChunkError::ParseFailure`. Compiled only when
/// the `code` feature is enabled.
#[cfg(feature = "code")]
#[test]
fn code_chunker_reports_parser_errors() {
    use julienne::{ChunkError, CodeChunker, CodeLanguage};

    let chunker = CodeChunker::new(CodeLanguage::Rust, 80, 0).unwrap();
    let outcome = chunker.try_split_chunks("fn broken( {");

    // An `Ok` here fails the test with the message below.
    let failure = outcome.expect_err("syntax errors must be explicit");
    assert!(matches!(failure, ChunkError::ParseFailure { .. }));
}
/// Builds a `CodeChunker` for `CodeLanguage::Other("javascript")` and checks
/// that `try_split_chunks` returns `ChunkError::UnsupportedLanguage`. Compiled
/// only when the `code` feature is enabled.
#[cfg(feature = "code")]
#[test]
fn code_chunker_reports_unsupported_languages() {
    use julienne::{ChunkError, CodeChunker, CodeLanguage};

    // Construction itself is expected to succeed; only the split call should fail.
    let chunker = CodeChunker::new(CodeLanguage::Other("javascript"), 80, 0).unwrap();
    let outcome = chunker.try_split_chunks("function main() {}");

    let failure = outcome.expect_err("unsupported languages must be explicit");
    assert!(matches!(failure, ChunkError::UnsupportedLanguage { .. }));
}
/// Builds a Rust `CodeChunker` with `new(.., 8, 0)`, feeds it `fn tiny() {}`,
/// and checks that `try_split_chunks` returns
/// `ChunkError::OversizedSemanticUnit`. Compiled only when the `code` feature
/// is enabled.
#[cfg(feature = "code")]
#[test]
fn code_chunker_reports_oversized_leaf_nodes() {
    use julienne::{ChunkError, CodeChunker, CodeLanguage};

    // NOTE(review): 8 is presumably a size limit smaller than the 12-byte
    // function below — confirm against `CodeChunker::new`.
    let chunker = CodeChunker::new(CodeLanguage::Rust, 8, 0).unwrap();
    let outcome = chunker.try_split_chunks("fn tiny() {}");

    let failure = outcome.expect_err("oversized semantic leaves must be explicit");
    assert!(matches!(failure, ChunkError::OversizedSemanticUnit { .. }));
}
/// Chunks a small `impl` block whose method carries a doc comment and checks
/// that some chunk contains both the `/// Says hello.` comment and the
/// `fn greet` method. Compiled only when the `code` feature is enabled.
#[cfg(feature = "code")]
#[test]
fn code_chunker_preserves_attached_comments_and_methods() {
    use julienne::{CodeChunker, CodeLanguage};

    let source = "struct Greeter;\n\nimpl Greeter {\n /// Says hello.\n fn greet(&self) -> &'static str {\n \"hello\"\n }\n}\n";
    let chunker = CodeChunker::new(CodeLanguage::Rust, 120, 0).unwrap();
    let chunks = chunker.try_split_chunks(source).unwrap();

    // The comment and the method it documents must land in the same chunk.
    let comment_stays_with_method = chunks.iter().any(|chunk| {
        let text = &chunk.text;
        text.contains("/// Says hello.") && text.contains("fn greet")
    });
    assert!(comment_stays_with_method);
}