julienne 0.1.0

Range-preserving Rust text chunkers for retrieval and embedding pipelines
Documentation
use julienne::{
    CharacterTextSplitter, RecursiveCharacterTextSplitter, SemchunkSplitter, SentenceChunker,
};

/// Asserts that every chunk's recorded offsets are consistent with `input`.
///
/// Checked per chunk, in iteration order:
/// - `start_byte..end_byte` is a valid slice range of `input` and that slice equals `text`;
/// - `start_char` and `end_char` equal the number of `char`s preceding `start_byte` and
///   `end_byte` respectively;
/// - `measured_length` equals the number of `char`s in `text`.
///
/// An empty `chunks` iterator passes without checking anything; callers that expect output
/// should assert on the chunk count themselves.
///
/// # Panics
///
/// Panics on the first chunk that violates one of these invariants. Every message includes the
/// zero-based index of the offending chunk so a failure can be traced to a specific chunk.
fn assert_offsets<'a>(input: &'a str, chunks: impl IntoIterator<Item = julienne::TextChunk<'a>>) {
    // Number of `char`s in `input` that precede the given byte offset.
    let chars_before = |byte_offset: usize| input[..byte_offset].chars().count();

    for (index, chunk) in chunks.into_iter().enumerate() {
        // `str::get` returns `None` for out-of-range or mid-character offsets, which lets the
        // failure name the chunk instead of surfacing as a bare slice-index panic.
        let source_slice = input
            .get(chunk.start_byte..chunk.end_byte)
            .unwrap_or_else(|| {
                panic!(
                    "chunk {}: byte range {}..{} is not a valid slice of the input",
                    index, chunk.start_byte, chunk.end_byte
                )
            });
        assert_eq!(
            source_slice, chunk.text,
            "chunk {}: chunk text must be a source slice",
            index
        );
        assert_eq!(
            chars_before(chunk.start_byte),
            chunk.start_char,
            "chunk {}: start_char must equal the char count before start_byte",
            index
        );
        assert_eq!(
            chars_before(chunk.end_byte),
            chunk.end_char,
            "chunk {}: end_char must equal the char count before end_byte",
            index
        );
        assert_eq!(
            chunk.text.chars().count(),
            chunk.measured_length,
            "chunk {}: measured_length must equal the char count of the chunk text",
            index
        );
    }
}

/// Splits text containing multi-byte characters (Greek letters and an emoji) on single spaces
/// and verifies that more than one chunk comes back and that every chunk's offsets match the
/// source.
///
/// The numeric constructor arguments are presumably chunk size (18) and overlap (8) — confirm
/// against `CharacterTextSplitter::new`.
#[test]
fn character_offsets_survive_unicode_and_overlap() {
    let source = "alpha βeta gamma δelta emoji 😀 tail";
    let space_splitter = CharacterTextSplitter::new(" ", 18, 8);
    let chunks = space_splitter.split_chunks(source);

    // A single chunk would not exercise offsets of later chunks at all.
    assert!(chunks.len() > 1);
    assert_offsets(source, chunks);
}

/// Splits on the regex `\s+` and verifies that at least one chunk still contains a separator
/// exactly as written in the source (a double space or a tab), and that every chunk's offsets
/// match the source.
#[test]
fn character_regex_offsets_preserve_original_separators() {
    let source = "foo  bar\tbaz\nqux";
    let whitespace_splitter = CharacterTextSplitter::builder()
        .separator_regex(r"\s+")
        .chunk_size(10)
        .chunk_overlap(0)
        .strip_whitespace(true)
        .build()
        .unwrap();
    let chunks = whitespace_splitter.split_chunks(source);

    // Separators from the source that a merged chunk is expected to carry verbatim.
    let raw_separators = ["  ", "\t"];
    let some_chunk_keeps_separator = chunks.iter().any(|piece| {
        raw_separators
            .iter()
            .any(|separator| piece.text.contains(*separator))
    });

    assert!(some_chunk_keeps_separator);
    assert_offsets(source, chunks);
}

/// Runs the recursive splitter over paragraph-separated text that includes a non-ASCII
/// character and verifies that more than one chunk comes back and that every chunk's offsets
/// match the source.
///
/// The constructor arguments are presumably chunk size (28) and overlap (8) — confirm against
/// `RecursiveCharacterTextSplitter::new`.
#[test]
fn recursive_offsets_preserve_kept_separators() {
    let source = "Intro.\n\nSection one has Unicode café.\n\nSection two ends.";
    let recursive_splitter = RecursiveCharacterTextSplitter::new(28, 8);
    let chunks = recursive_splitter.split_chunks(source);

    // The source is longer than one chunk, so splitting must actually happen.
    assert!(chunks.len() > 1);
    assert_offsets(source, chunks);
}

/// Chunks whitespace-padded text by sentence and verifies that the chunks carry no leading or
/// trailing spaces while their offsets still point into the original, untrimmed source.
#[test]
fn sentence_offsets_trim_without_losing_source_coordinates() {
    let input = "  First sentence. Second sentence? Third sentence!  ";
    let chunker = SentenceChunker::builder()
        .chunk_size(28)
        .chunk_overlap(0)
        .min_characters_per_sentence(1)
        .build()
        .expect("sentence chunker configuration used by this test should be valid");

    let chunks = chunker.split_chunks(input);

    // `Iterator::all` is true for an empty iterator and `assert_offsets` checks nothing when
    // given no chunks, so without this guard the test would pass on empty output.
    assert!(
        !chunks.is_empty(),
        "sentence chunker produced no chunks for non-empty input"
    );
    assert!(chunks.iter().all(|chunk| !chunk.text.starts_with(' ')));
    assert!(chunks.iter().all(|chunk| !chunk.text.ends_with(' ')));
    assert_offsets(input, chunks);
}

/// Runs the semchunk splitter over whitespace-padded text with mixed punctuation and verifies
/// that more than one chunk comes back, that no chunk begins with a space, and that every
/// chunk's offsets match the original source.
///
/// The constructor arguments are presumably chunk size (16) and overlap (6) — confirm against
/// `SemchunkSplitter::new`.
#[test]
fn semchunk_offsets_cover_delimiter_reattachment_and_trimmed_input() {
    let source = "  Alpha. Beta, gamma; delta! Epsilon?  ";
    let semchunk = SemchunkSplitter::new(16, 6);
    let chunks = semchunk.split_chunks(source);

    assert!(chunks.len() > 1);
    // Only the leading side is checked here; trailing whitespace is not asserted on.
    assert!(!chunks.iter().any(|piece| piece.text.starts_with(' ')));
    assert_offsets(source, chunks);
}