// julienne 0.1.0
//
// Range-preserving Rust text chunkers for retrieval and embedding pipelines.
// Documentation
use std::sync::Arc;

use julienne::{
    CharacterTextSplitter, ChunkError, EmbedderHandle, RecursiveCharacterTextSplitter,
    SemanticChunker, SemchunkSplitter, SentenceChunker,
};

/// Building a `CharacterTextSplitter` with the unparsable separator regex
/// `"["` must fail in `build()` with `ChunkError::InvalidConfiguration`.
#[test]
fn invalid_regex_is_reported_at_builder_time() {
    let build_result = CharacterTextSplitter::builder()
        .separator_regex("[")
        .chunk_size(100)
        .chunk_overlap(0)
        .build();

    // Pattern-match instead of `expect_err` so the built splitter type is not
    // required to implement `Debug`.
    let err = if let Err(err) = build_result {
        err
    } else {
        panic!("invalid regex must fail during builder validation")
    };

    assert!(matches!(err, ChunkError::InvalidConfiguration { .. }));
}

/// A `chunk_overlap` equal to `chunk_size` (10 and 10) must make the
/// `SemchunkSplitter` builder fail with `ChunkError::InvalidConfiguration`.
#[test]
fn invalid_chunk_configuration_is_reported_at_builder_time() {
    let build_result = SemchunkSplitter::builder()
        .chunk_size(10)
        .chunk_overlap(10)
        .build();

    // Discard the `Ok` side first so no `Debug` bound is needed on the splitter.
    let err = build_result
        .err()
        .unwrap_or_else(|| panic!("overlap equal to chunk size is invalid"));

    assert!(matches!(err, ChunkError::InvalidConfiguration { .. }));
}

/// Each splitter builder must succeed when given only `chunk_size` and
/// `chunk_overlap` (plus a separator for the character splitter), and the
/// resulting splitter must return a non-empty `split_text` result for a short
/// sample.
#[test]
fn builders_preserve_default_construction_paths() {
    let words = "one two three";
    let sentences = "One sentence. Two sentence.";

    // All builders run before any splitting, so a construction failure is
    // reported ahead of the output checks.
    let by_character = CharacterTextSplitter::builder()
        .separator(" ")
        .chunk_size(20)
        .chunk_overlap(2)
        .build()
        .unwrap();
    let by_recursion = RecursiveCharacterTextSplitter::builder()
        .chunk_size(20)
        .chunk_overlap(2)
        .build()
        .unwrap();
    let by_sentence = SentenceChunker::builder()
        .chunk_size(20)
        .chunk_overlap(2)
        .build()
        .unwrap();
    let by_semchunk = SemchunkSplitter::builder()
        .chunk_size(20)
        .chunk_overlap(2)
        .build()
        .unwrap();
    let by_semantics = SemanticChunker::builder()
        .chunk_size(20)
        .chunk_overlap(2)
        .build()
        .unwrap();

    assert!(!by_character.split_text(words).is_empty());
    assert!(!by_recursion.split_text(words).is_empty());
    assert!(!by_sentence.split_text(sentences).is_empty());
    assert!(!by_semchunk.split_text(sentences).is_empty());
    assert!(!by_semantics.split_text(sentences).is_empty());
}

/// An `Err` returned by the configured batch embedder must come back out of
/// `try_split_chunks` as `ChunkError::EmbeddingFailure`.
#[test]
fn semantic_batch_embedder_errors_are_propagated() {
    let text = "Alpha topic here. Beta topic here. Gamma topic here.";

    // Stub provider that fails on every batch it is handed.
    let failing_embedder: EmbedderHandle =
        Arc::new(|_batch: &[&str]| Err(ChunkError::embedding_failure("provider unavailable")));

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedder(failing_embedder)
        .build()
        .unwrap();

    let outcome = chunker.try_split_chunks(text);
    let err = outcome.expect_err("embedder failure must propagate");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}

/// `try_chunks` must surface a failing batch embedder as
/// `ChunkError::EmbeddingFailure`, the same way `try_split_chunks` is expected
/// to.
#[test]
fn semantic_try_chunks_uses_fallible_api_contract() {
    let text = "Alpha topic here. Beta topic here. Gamma topic here.";

    // Stub provider that fails on every batch it is handed.
    let failing_embedder: EmbedderHandle =
        Arc::new(|_batch: &[&str]| Err(ChunkError::embedding_failure("provider unavailable")));

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedder(failing_embedder)
        .build()
        .unwrap();

    let outcome = chunker.try_chunks(text);
    let err = outcome.expect_err("try_chunks must propagate fallible embedder errors");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}

/// A batch embedder that answers with one zero-length vector per input must
/// make `try_split_chunks` fail with `ChunkError::EmbeddingFailure`.
#[test]
fn semantic_batch_embedder_rejects_empty_embedding_vectors() {
    let text = "Alpha topic here. Beta topic here. Gamma topic here.";

    // Right number of embeddings, but each one has no components.
    let hollow_embedder: EmbedderHandle =
        Arc::new(|inputs: &[&str]| Ok(inputs.iter().map(|_| Vec::new()).collect()));

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedder(hollow_embedder)
        .build()
        .unwrap();

    let outcome = chunker.try_split_chunks(text);
    let err = outcome.expect_err("empty embedding vectors must be explicit failures");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}

/// A batch embedder whose second embedding has a different length from the
/// others must make `try_split_chunks` fail with
/// `ChunkError::EmbeddingFailure`.
#[test]
fn semantic_batch_embedder_rejects_inconsistent_embedding_dimensions() {
    let text = "Alpha topic here. Beta topic here. Gamma topic here.";

    // Two components everywhere except batch position 1, which gets only one.
    // Batches shorter than two items therefore stay consistent.
    let ragged_embedder: EmbedderHandle = Arc::new(|inputs: &[&str]| {
        Ok((0..inputs.len())
            .map(|position| {
                if position == 1 {
                    vec![1.0]
                } else {
                    vec![1.0, 0.0]
                }
            })
            .collect())
    });

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedder(ragged_embedder)
        .build()
        .unwrap();

    let outcome = chunker.try_split_chunks(text);
    let err = outcome.expect_err("inconsistent embedding dimensions must be explicit failures");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}

/// A batch embedder that returns exactly one embedding no matter how many
/// inputs it receives must make `try_split_chunks` fail with
/// `ChunkError::EmbeddingFailure`.
#[test]
fn semantic_batch_embedder_rejects_wrong_embedding_count() {
    let text = "Alpha topic here. Beta topic here. Gamma topic here.";

    let short_embedder: EmbedderHandle = Arc::new(|_batch: &[&str]| {
        // A single embedding regardless of the batch size.
        let lone_embedding = vec![1.0, 0.0];
        Ok(vec![lone_embedding])
    });

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedder(short_embedder)
        .build()
        .unwrap();

    let outcome = chunker.try_split_chunks(text);
    let err = outcome.expect_err("wrong embedding count must be an explicit failure");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}

/// A batch embedder that puts a `NaN` component into its first embedding must
/// make `try_split_chunks` fail with `ChunkError::EmbeddingFailure`.
#[test]
fn semantic_batch_embedder_rejects_non_finite_embedding_values() {
    let embedder: EmbedderHandle = Arc::new(|inputs: &[&str]| {
        let mut embeddings = vec![vec![1.0, 0.0]; inputs.len()];
        // Poison the first embedding when there is one. Using `first_mut`
        // instead of `embeddings[0]` keeps the stub from panicking with an
        // out-of-bounds index if it is ever handed an empty batch.
        if let Some(first) = embeddings.first_mut() {
            *first = vec![f32::NAN, 0.0];
        }
        Ok(embeddings)
    });
    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedder(embedder)
        .build()
        .unwrap();

    let err = chunker
        .try_split_chunks("Alpha topic here. Beta topic here. Gamma topic here.")
        .expect_err("non-finite embedding values must be explicit failures");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}

/// With an `embedding_fn` that returns an empty vector for every input,
/// `try_split_chunks` must fail with `ChunkError::EmbeddingFailure`.
#[test]
fn semantic_embedding_fn_try_split_rejects_empty_embedding_vectors() {
    let text = "Alpha topic here. Beta topic here. Gamma topic here.";

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(1)
        .min_characters_per_sentence(1)
        // Per-input embedding callback that never produces any components.
        .embedding_fn(Arc::new(|_| Vec::new()))
        .build()
        .unwrap();

    let outcome = chunker.try_split_chunks(text);
    let err = outcome
        .expect_err("legacy embedding_fn must use the explicit fallible contract in try APIs");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}

/// With `skip_window(1)` and a reconnect threshold of `0.9`, `try_split_text`
/// must return a single chunk that contains both the aside sentences and the
/// trailing "Alpha" sentences.
#[test]
fn semantic_batch_embedder_applies_skip_window_reconnect() {
    let text = "Alpha one. Alpha two. Aside one. Aside two. Alpha three. Alpha four.";

    // Stub embedder: batch positions 0, 1, 4 and 5 share the vector [1, 0] and
    // every other position gets the orthogonal [0, 1]. A two-item batch gets
    // two identical [1, 0] vectors.
    // NOTE(review): the two-item case presumably serves the reconnect
    // comparison; confirm against the chunker implementation.
    let embedder: EmbedderHandle = Arc::new(|inputs: &[&str]| {
        let embeddings = if inputs.len() == 2 {
            vec![vec![1.0, 0.0], vec![1.0, 0.0]]
        } else {
            (0..inputs.len())
                .map(|position| {
                    if matches!(position, 0 | 1 | 4 | 5) {
                        vec![1.0, 0.0]
                    } else {
                        vec![0.0, 1.0]
                    }
                })
                .collect()
        };
        Ok(embeddings)
    });

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(1)
        .skip_window(1)
        .reconnect_similarity_threshold(0.9)
        .max_aside_length(200)
        .min_characters_per_sentence(1)
        .embedder(embedder)
        .build()
        .unwrap();

    let chunks = chunker.try_split_text(text).unwrap();

    assert_eq!(
        chunks.len(),
        1,
        "batch reconnect must merge across the aside"
    );
    let merged = &chunks[0];
    assert!(merged.contains("Aside one"));
    assert!(merged.contains("Alpha four"));
}

/// The infallible `split_text` entry point must also use the configured batch
/// embedder: with the reconnect settings below it has to return one merged
/// chunk.
#[test]
fn semantic_split_text_uses_configured_batch_embedder() {
    let text = "Alpha one. Alpha two. Aside one. Aside two. Alpha three. Alpha four.";

    // Stub embedder: a two-item batch gets two identical [1, 0] vectors.
    // Otherwise batch positions 0, 1, 4 and 5 get [1, 0] and the remaining
    // positions get the orthogonal [0, 1].
    let embedder: EmbedderHandle = Arc::new(|inputs: &[&str]| match inputs.len() {
        2 => Ok(vec![vec![1.0, 0.0], vec![1.0, 0.0]]),
        _ => Ok(inputs
            .iter()
            .enumerate()
            .map(|(position, _)| {
                let on_topic = matches!(position, 0 | 1 | 4 | 5);
                if on_topic {
                    vec![1.0, 0.0]
                } else {
                    vec![0.0, 1.0]
                }
            })
            .collect()),
    });

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(1)
        .skip_window(1)
        .reconnect_similarity_threshold(0.9)
        .max_aside_length(200)
        .min_characters_per_sentence(1)
        .embedder(embedder)
        .build()
        .unwrap();

    let chunks = chunker.split_text(text);

    assert_eq!(
        chunks.len(),
        1,
        "plain split_text must use the configured batch embedder"
    );
    let merged = &chunks[0];
    assert!(merged.contains("Aside one"));
    assert!(merged.contains("Alpha four"));
}

/// A `SemchunkSplitter` configured with a custom `length_fn` must be
/// cloneable, and the clone must split the same text into the same result as
/// the original.
#[test]
fn configured_splitters_are_cloneable() {
    let sample = "one two three";

    // Length measured in whitespace-separated words, never less than one.
    let word_count =
        Arc::new(|text: &str| std::cmp::max(text.split_whitespace().count(), 1));

    let original = SemchunkSplitter::builder()
        .chunk_size(20)
        .chunk_overlap(2)
        .length_fn(word_count)
        .build()
        .unwrap();
    let duplicate = original.clone();

    assert_eq!(original.split_text(sample), duplicate.split_text(sample));
}