julienne 0.1.0

Range-preserving Rust text chunkers for retrieval and embedding pipelines
Documentation
use julienne::{HtmlChunker, MarkdownChunker, XmlChunker};

/// Splits the `fixtures/sample.md` fixture with `MarkdownChunker::new(45, 15)`.
///
/// Requires that some chunk begins with `# Title`, that some chunk contains a
/// `rust`-tagged opening code fence, and that every chunk's `text` equals the
/// input slice addressed by its `start_byte..end_byte` range.
#[test]
fn markdown_chunker_preserves_headings_lists_and_fenced_code() {
    let source = include_str!("fixtures/sample.md");
    let splitter = MarkdownChunker::new(45, 15).unwrap();
    let pieces = splitter.split_chunks(source);

    let has_title = pieces
        .iter()
        .any(|piece| piece.text.starts_with("# Title"));
    let has_rust_fence = pieces.iter().any(|piece| piece.text.contains("```rust"));
    assert!(has_title);
    assert!(has_rust_fence);

    // Each chunk must map back onto the exact bytes it was cut from.
    pieces.iter().for_each(|piece| {
        let slice = &source[piece.start_byte..piece.end_byte];
        assert_eq!(slice, piece.text);
    });
}

/// Checks that a fenced code block comes back as one chunk.
///
/// The chunker is built with `MarkdownChunker::new(24, 0)` (NOTE(review):
/// presumably a size limit smaller than the fenced block — confirm against
/// the constructor). The chunk containing `fn alpha` must start with the
/// opening fence, end with the closing fence, and match its source range.
#[test]
fn markdown_chunker_preserves_large_fenced_blocks_as_units() {
    let source = concat!(
        "# Title\n\n",
        "```rust\n",
        "fn alpha() {}\n",
        "fn beta() {}\n",
        "fn gamma() {}\n",
        "```\n\n",
        "Tail paragraph.",
    );
    let splitter = MarkdownChunker::new(24, 0).unwrap();
    let pieces = splitter.split_chunks(source);

    let code_block = pieces
        .iter()
        .find(|piece| piece.text.contains("fn alpha"))
        .expect("fenced code block should be emitted");

    assert!(code_block.text.starts_with("```rust"));
    assert!(code_block.text.ends_with("```"));

    let slice = &source[code_block.start_byte..code_block.end_byte];
    assert_eq!(slice, code_block.text);
}

/// Checks handling of a code fence that is opened but never closed.
///
/// The chunk containing `print('alpha')` must start at the `python` fence
/// marker, also contain `print('beta')`, and match its source byte range.
#[test]
fn markdown_chunker_handles_unclosed_fenced_blocks_as_one_block() {
    let source = "# Title\n\n```python\nprint('alpha')\nprint('beta')\n";
    let splitter = MarkdownChunker::new(18, 0).unwrap();
    let pieces = splitter.split_chunks(source);

    let code_block = pieces
        .iter()
        .find(|piece| piece.text.contains("print('alpha')"))
        .expect("unclosed fenced code block should still be emitted");

    let opens_with_fence = code_block.text.starts_with("```python");
    let keeps_second_line = code_block.text.contains("print('beta')");
    assert!(opens_with_fence);
    assert!(keeps_second_line);

    let slice = &source[code_block.start_byte..code_block.end_byte];
    assert_eq!(slice, code_block.text);
}

/// Splits a document whose headings, paragraph, and ordered list items sit on
/// consecutive lines with no blank lines between them.
///
/// Each heading and each list item must appear in at least one chunk, and
/// every chunk's `text` must equal the input slice at its byte range.
#[test]
fn markdown_chunker_splits_adjacent_headings_and_ordered_lists() {
    let source = concat!(
        "# One\n",
        "Intro paragraph.\n",
        "## Two\n",
        "1. first\n",
        "2. second\n",
        "## Three\n",
        "Tail.",
    );
    let splitter = MarkdownChunker::new(28, 0).unwrap();
    let pieces = splitter.split_chunks(source);

    let texts: Vec<_> = pieces.iter().map(|piece| piece.text).collect();
    // True when at least one chunk text contains `needle`.
    let appears = |needle: &str| texts.iter().any(|text| text.contains(needle));

    assert!(appears("# One"));
    assert!(appears("## Two"));
    assert!(appears("1. first"));
    assert!(appears("2. second"));
    assert!(appears("## Three"));

    pieces.iter().for_each(|piece| {
        let slice = &source[piece.start_byte..piece.end_byte];
        assert_eq!(slice, piece.text);
    });
}

/// Checks the first chunk of an input wrapped in leading and trailing blank
/// lines.
///
/// The first chunk must start at the byte offset of `# Title` in the input,
/// carry no newline at either end, contain the paragraph text, and equal the
/// input slice at its `start_byte..end_byte` range.
#[test]
fn markdown_chunker_trims_blank_lines_without_losing_source_offsets() {
    let source = "\n\n# Title\n\nParagraph with body.\n\n";
    let splitter = MarkdownChunker::new(30, 0).unwrap();
    let pieces = splitter.split_chunks(source);

    let head = &pieces[0];
    let title_offset = source.find("# Title").unwrap();

    assert!(head.text.starts_with("# Title"));
    assert_eq!(head.start_byte, title_offset);

    // Surrounding blank lines must not leak into the chunk text.
    assert!(!head.text.starts_with('\n'));
    assert!(!head.text.ends_with('\n'));

    let slice = &source[head.start_byte..head.end_byte];
    assert_eq!(slice, head.text);
    assert!(head.text.contains("Paragraph with body."));
}

/// Splits the `fixtures/sample.html` fixture with `HtmlChunker::new(35, 10)`.
///
/// Requires that some chunk contains `<h1>`, that some chunk contains the
/// complete `<li>One</li>` element, and that every chunk's `text` equals the
/// input slice at its byte range.
#[test]
fn html_chunker_uses_block_tag_boundaries() {
    let source = include_str!("fixtures/sample.html");
    let splitter = HtmlChunker::new(35, 10).unwrap();
    let pieces = splitter.split_chunks(source);

    let has_heading = pieces.iter().any(|piece| piece.text.contains("<h1>"));
    let has_list_item = pieces
        .iter()
        .any(|piece| piece.text.contains("<li>One</li>"));
    assert!(has_heading);
    assert!(has_list_item);

    pieces.iter().for_each(|piece| {
        let slice = &source[piece.start_byte..piece.end_byte];
        assert_eq!(slice, piece.text);
    });
}

/// Splits a single-line document of nested elements with
/// `XmlChunker::new(32, 8)`.
///
/// Requires that some chunk contains `<table>`, that some chunk contains the
/// complete `<pre>code</pre>` element, and that every chunk's `text` equals
/// the input slice at its byte range.
#[test]
fn xml_chunker_uses_block_boundaries_without_extraction_scope() {
    let source =
        "<section><h1>Title</h1><table><tr><td>Cell</td></tr></table><pre>code</pre></section>";
    let splitter = XmlChunker::new(32, 8).unwrap();
    let pieces = splitter.split_chunks(source);

    let has_table = pieces.iter().any(|piece| piece.text.contains("<table>"));
    let has_pre = pieces
        .iter()
        .any(|piece| piece.text.contains("<pre>code</pre>"));
    assert!(has_table);
    assert!(has_pre);

    pieces.iter().for_each(|piece| {
        let slice = &source[piece.start_byte..piece.end_byte];
        assert_eq!(slice, piece.text);
    });
}

/// Runs `split_text` over four short paragraphs with
/// `MarkdownChunker::new(16, 8)`.
///
/// Requires at least two outputs and at least one adjacent pair of outputs in
/// which both members contain `Beta`.
#[test]
fn structure_overlap_carries_previous_peer_only_when_it_fits() {
    let source = "Alpha.\n\nBeta.\n\nGamma.\n\nDelta.";
    let splitter = MarkdownChunker::new(16, 8).unwrap();
    let pieces = splitter.split_text(source);

    assert!(pieces.len() > 1);

    // `windows(2)` only ever yields two-element slices; the fallback arm is
    // there purely to make the match exhaustive.
    let beta_repeats = pieces.windows(2).any(|pair| match pair {
        [earlier, later] => earlier.contains("Beta") && later.contains("Beta"),
        _ => false,
    });
    assert!(beta_repeats);
}

/// Chunks the Rust and Python fixtures with `CodeChunker` (only built with the
/// `code` feature).
///
/// Requires that some Rust chunk contains `fn add_one` and that some Python
/// chunk contains `class Greeter`.
#[cfg(feature = "code")]
#[test]
fn code_chunker_chunks_rust_and_python_ast_nodes() {
    use julienne::{CodeChunker, CodeLanguage};

    let rust_source = include_str!("fixtures/sample.rs");
    let rust_pieces = CodeChunker::new(CodeLanguage::Rust, 80, 20)
        .unwrap()
        .try_split_chunks(rust_source)
        .unwrap();
    let found_add_one = rust_pieces
        .iter()
        .any(|piece| piece.text.contains("fn add_one"));

    let python_source = include_str!("fixtures/sample.py");
    let python_pieces = CodeChunker::new(CodeLanguage::Python, 80, 20)
        .unwrap()
        .try_split_chunks(python_source)
        .unwrap();
    let found_greeter = python_pieces
        .iter()
        .any(|piece| piece.text.contains("class Greeter"));

    assert!(found_add_one);
    assert!(found_greeter);
}

/// Feeds the malformed snippet `fn broken( {` to a Rust `CodeChunker` and
/// requires `try_split_chunks` to fail with `ChunkError::ParseFailure`.
#[cfg(feature = "code")]
#[test]
fn code_chunker_reports_parser_errors() {
    use julienne::{ChunkError, CodeChunker, CodeLanguage};

    let outcome = CodeChunker::new(CodeLanguage::Rust, 80, 0)
        .unwrap()
        .try_split_chunks("fn broken( {");
    let failure = outcome.expect_err("syntax errors must be explicit");

    assert!(matches!(failure, ChunkError::ParseFailure { .. }));
}

/// Builds a `CodeChunker` for `CodeLanguage::Other("javascript")` and requires
/// `try_split_chunks` to fail with `ChunkError::UnsupportedLanguage`.
#[cfg(feature = "code")]
#[test]
fn code_chunker_reports_unsupported_languages() {
    use julienne::{ChunkError, CodeChunker, CodeLanguage};

    let outcome = CodeChunker::new(CodeLanguage::Other("javascript"), 80, 0)
        .unwrap()
        .try_split_chunks("function main() {}");
    let failure = outcome.expect_err("unsupported languages must be explicit");

    assert!(matches!(failure, ChunkError::UnsupportedLanguage { .. }));
}

/// Chunks `fn tiny() {}` with `CodeChunker::new(CodeLanguage::Rust, 8, 0)` and
/// requires `try_split_chunks` to fail with
/// `ChunkError::OversizedSemanticUnit`.
#[cfg(feature = "code")]
#[test]
fn code_chunker_reports_oversized_leaf_nodes() {
    use julienne::{ChunkError, CodeChunker, CodeLanguage};

    let outcome = CodeChunker::new(CodeLanguage::Rust, 8, 0)
        .unwrap()
        .try_split_chunks("fn tiny() {}");
    let failure = outcome.expect_err("oversized semantic leaves must be explicit");

    assert!(matches!(failure, ChunkError::OversizedSemanticUnit { .. }));
}

/// Chunks a small `impl` block whose method carries a doc comment.
///
/// Requires that at least one chunk contains both the `/// Says hello.`
/// comment and the `fn greet` signature it is attached to.
#[cfg(feature = "code")]
#[test]
fn code_chunker_preserves_attached_comments_and_methods() {
    use julienne::{CodeChunker, CodeLanguage};

    let source = concat!(
        "struct Greeter;\n",
        "\n",
        "impl Greeter {\n",
        "    /// Says hello.\n",
        "    fn greet(&self) -> &'static str {\n",
        "        \"hello\"\n",
        "    }\n",
        "}\n",
    );
    let pieces = CodeChunker::new(CodeLanguage::Rust, 120, 0)
        .unwrap()
        .try_split_chunks(source)
        .unwrap();

    // The comment and the method must land in the same chunk.
    let comment_stays_with_method = pieces.iter().any(|piece| {
        ["/// Says hello.", "fn greet"]
            .iter()
            .all(|needle| piece.text.contains(*needle))
    });
    assert!(comment_stays_with_method);
}