use std::sync::Arc;
use julienne::{
CharacterTextSplitter, ChunkError, EmbedderHandle, RecursiveCharacterTextSplitter,
SemanticChunker, SemchunkSplitter, SentenceChunker,
};
/// Building a `CharacterTextSplitter` whose separator regex is the unparsable
/// pattern `[` must fail inside `build()` with
/// `ChunkError::InvalidConfiguration`.
#[test]
fn invalid_regex_is_reported_at_builder_time() {
    let build_result = CharacterTextSplitter::builder()
        .separator_regex("[")
        .chunk_size(100)
        .chunk_overlap(0)
        .build();

    // A successful build means validation was skipped, which is the failure
    // this test exists to catch.
    let err = if let Err(rejection) = build_result {
        rejection
    } else {
        panic!("invalid regex must fail during builder validation")
    };

    assert!(matches!(err, ChunkError::InvalidConfiguration { .. }));
}
/// Building a `SemchunkSplitter` with `chunk_overlap` equal to `chunk_size`
/// (both 10) must fail inside `build()` with
/// `ChunkError::InvalidConfiguration`.
#[test]
fn invalid_chunk_configuration_is_reported_at_builder_time() {
    let attempt = SemchunkSplitter::builder()
        .chunk_size(10)
        .chunk_overlap(10)
        .build();

    // Only the error side is interesting; an `Ok` here is a test failure.
    let err = if let Err(rejection) = attempt {
        rejection
    } else {
        panic!("overlap equal to chunk size is invalid")
    };

    assert!(matches!(err, ChunkError::InvalidConfiguration { .. }));
}
/// Every splitter builder must build successfully when given only
/// `chunk_size(20)` and `chunk_overlap(2)` (plus a single-space separator for
/// `CharacterTextSplitter`), and the resulting splitter must return a
/// non-empty result from `split_text`.
#[test]
fn builders_preserve_default_construction_paths() {
    let words = "one two three";
    let sentences = "One sentence. Two sentence.";

    // Build all five splitters first so a construction failure is reported
    // before any splitting assertion runs.
    let character = CharacterTextSplitter::builder()
        .separator(" ")
        .chunk_size(20)
        .chunk_overlap(2)
        .build()
        .unwrap();
    let recursive = RecursiveCharacterTextSplitter::builder()
        .chunk_size(20)
        .chunk_overlap(2)
        .build()
        .unwrap();
    let sentence = SentenceChunker::builder()
        .chunk_size(20)
        .chunk_overlap(2)
        .build()
        .unwrap();
    let semchunk = SemchunkSplitter::builder()
        .chunk_size(20)
        .chunk_overlap(2)
        .build()
        .unwrap();
    let semantic = SemanticChunker::builder()
        .chunk_size(20)
        .chunk_overlap(2)
        .build()
        .unwrap();

    let character_chunks = character.split_text(words);
    assert!(!character_chunks.is_empty());

    let recursive_chunks = recursive.split_text(words);
    assert!(!recursive_chunks.is_empty());

    let sentence_chunks = sentence.split_text(sentences);
    assert!(!sentence_chunks.is_empty());

    let semchunk_chunks = semchunk.split_text(sentences);
    assert!(!semchunk_chunks.is_empty());

    let semantic_chunks = semantic.split_text(sentences);
    assert!(!semantic_chunks.is_empty());
}
/// An `Err` returned by the configured batch embedder must surface from
/// `try_split_chunks` as `ChunkError::EmbeddingFailure`.
#[test]
fn semantic_batch_embedder_errors_are_propagated() {
    let text = "Alpha topic here. Beta topic here. Gamma topic here.";

    // Stub embedder that fails on every call.
    let failing_embedder: EmbedderHandle =
        Arc::new(|_batch: &[&str]| Err(ChunkError::embedding_failure("provider unavailable")));

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedder(failing_embedder)
        .build()
        .unwrap();

    let outcome = chunker.try_split_chunks(text);
    let err = outcome.expect_err("embedder failure must propagate");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}
/// `try_chunks` must return the batch embedder's failure as
/// `ChunkError::EmbeddingFailure` instead of producing a successful result.
#[test]
fn semantic_try_chunks_uses_fallible_api_contract() {
    let text = "Alpha topic here. Beta topic here. Gamma topic here.";

    // Stub embedder that reports the provider as unavailable on every call.
    let unavailable_provider: EmbedderHandle =
        Arc::new(|_batch: &[&str]| Err(ChunkError::embedding_failure("provider unavailable")));

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedder(unavailable_provider)
        .build()
        .unwrap();

    let outcome = chunker.try_chunks(text);
    let err = outcome.expect_err("try_chunks must propagate fallible embedder errors");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}
/// A batch embedder that answers every input with an empty vector must make
/// `try_split_chunks` fail with `ChunkError::EmbeddingFailure`.
#[test]
fn semantic_batch_embedder_rejects_empty_embedding_vectors() {
    let text = "Alpha topic here. Beta topic here. Gamma topic here.";

    // Stub embedder: the right number of vectors, each with zero dimensions.
    let zero_dim_embedder: EmbedderHandle = Arc::new(|inputs: &[&str]| {
        let blank_vectors = vec![Vec::new(); inputs.len()];
        Ok(blank_vectors)
    });

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedder(zero_dim_embedder)
        .build()
        .unwrap();

    let outcome = chunker.try_split_chunks(text);
    let err = outcome.expect_err("empty embedding vectors must be explicit failures");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}
/// A batch embedder whose second vector has a different length from the others
/// must make `try_split_chunks` fail with `ChunkError::EmbeddingFailure`.
#[test]
fn semantic_batch_embedder_rejects_inconsistent_embedding_dimensions() {
    let text = "Alpha topic here. Beta topic here. Gamma topic here.";

    // Stub embedder: two-dimensional vectors everywhere, except that the
    // second entry (when the batch has one) is shrunk to a single dimension.
    let ragged_embedder: EmbedderHandle = Arc::new(|inputs: &[&str]| {
        let mut vectors = vec![vec![1.0, 0.0]; inputs.len()];
        if let Some(second) = vectors.get_mut(1) {
            *second = vec![1.0];
        }
        Ok(vectors)
    });

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedder(ragged_embedder)
        .build()
        .unwrap();

    let outcome = chunker.try_split_chunks(text);
    let err = outcome.expect_err("inconsistent embedding dimensions must be explicit failures");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}
/// A batch embedder that always returns exactly one vector, regardless of how
/// many inputs it was given, must make `try_split_chunks` fail with
/// `ChunkError::EmbeddingFailure`.
#[test]
fn semantic_batch_embedder_rejects_wrong_embedding_count() {
    let text = "Alpha topic here. Beta topic here. Gamma topic here.";

    // Stub embedder: ignores the batch and hands back a single vector.
    let single_vector_embedder: EmbedderHandle = Arc::new(|_batch: &[&str]| {
        let only_vector = vec![1.0, 0.0];
        Ok(vec![only_vector])
    });

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedder(single_vector_embedder)
        .build()
        .unwrap();

    let outcome = chunker.try_split_chunks(text);
    let err = outcome.expect_err("wrong embedding count must be an explicit failure");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}
/// A batch embedder that returns a NaN component in its first vector must make
/// `try_split_chunks` fail with `ChunkError::EmbeddingFailure`.
#[test]
fn semantic_batch_embedder_rejects_non_finite_embedding_values() {
    let embedder: EmbedderHandle = Arc::new(|inputs: &[&str]| {
        let mut embeddings = vec![vec![1.0, 0.0]; inputs.len()];
        // Poison the first vector with NaN. The write is guarded so that an
        // empty batch yields an empty result instead of panicking inside the
        // stub with an out-of-bounds index.
        if let Some(first) = embeddings.first_mut() {
            *first = vec![f32::NAN, 0.0];
        }
        Ok(embeddings)
    });
    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedder(embedder)
        .build()
        .unwrap();
    let err = chunker
        .try_split_chunks("Alpha topic here. Beta topic here. Gamma topic here.")
        .expect_err("non-finite embedding values must be explicit failures");
    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}
/// A chunker configured through `embedding_fn` with a function that returns an
/// empty vector for every input must make `try_split_chunks` fail with
/// `ChunkError::EmbeddingFailure`.
#[test]
fn semantic_embedding_fn_try_split_rejects_empty_embedding_vectors() {
    let text = "Alpha topic here. Beta topic here. Gamma topic here.";

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(1)
        .min_characters_per_sentence(1)
        // Per-input embedding function that always yields zero dimensions.
        .embedding_fn(Arc::new(|_| Vec::new()))
        .build()
        .unwrap();

    let outcome = chunker.try_split_chunks(text);
    let err = outcome
        .expect_err("legacy embedding_fn must use the explicit fallible contract in try APIs");

    assert!(matches!(err, ChunkError::EmbeddingFailure { .. }));
}
/// With `skip_window(1)`, `reconnect_similarity_threshold(0.9)` and
/// `max_aside_length(200)`, `try_split_text` must return the six-sentence
/// sample as one chunk that still contains both the aside and the final
/// "Alpha" sentence.
///
/// The stub embedder returns two identical `[1.0, 0.0]` vectors for a batch of
/// exactly two inputs. For any other batch size it assigns `[1.0, 0.0]` to
/// positions 0, 1, 4 and 5 and `[0.0, 1.0]` to every other position.
#[test]
fn semantic_batch_embedder_applies_skip_window_reconnect() {
    let text = "Alpha one. Alpha two. Aside one. Aside two. Alpha three. Alpha four.";

    let embedder: EmbedderHandle = Arc::new(|inputs: &[&str]| {
        let vectors = if inputs.len() == 2 {
            vec![vec![1.0, 0.0], vec![1.0, 0.0]]
        } else {
            (0..inputs.len())
                .map(|position| {
                    if matches!(position, 0 | 1 | 4 | 5) {
                        vec![1.0, 0.0]
                    } else {
                        vec![0.0, 1.0]
                    }
                })
                .collect()
        };
        Ok(vectors)
    });

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(1)
        .skip_window(1)
        .reconnect_similarity_threshold(0.9)
        .max_aside_length(200)
        .min_characters_per_sentence(1)
        .embedder(embedder)
        .build()
        .unwrap();

    let pieces = chunker.try_split_text(text).unwrap();

    assert_eq!(
        pieces.len(),
        1,
        "batch reconnect must merge across the aside"
    );
    assert!(pieces[0].contains("Aside one"));
    assert!(pieces[0].contains("Alpha four"));
}
/// The infallible `split_text` entry point must go through the configured
/// batch embedder: with the reconnect settings below, the six-sentence sample
/// must come back as one chunk containing both the aside and the final "Alpha"
/// sentence.
///
/// The stub embedder returns two identical `[1.0, 0.0]` vectors for a batch of
/// exactly two inputs. For any other batch size it assigns `[1.0, 0.0]` to
/// positions 0, 1, 4 and 5 and `[0.0, 1.0]` to every other position.
#[test]
fn semantic_split_text_uses_configured_batch_embedder() {
    let text = "Alpha one. Alpha two. Aside one. Aside two. Alpha three. Alpha four.";

    let embedder: EmbedderHandle = Arc::new(|inputs: &[&str]| {
        let batch_len = inputs.len();
        let vectors = if batch_len == 2 {
            vec![vec![1.0, 0.0], vec![1.0, 0.0]]
        } else {
            let mut assigned = Vec::with_capacity(batch_len);
            for slot in 0..batch_len {
                assigned.push(match slot {
                    0 | 1 | 4 | 5 => vec![1.0, 0.0],
                    _ => vec![0.0, 1.0],
                });
            }
            assigned
        };
        Ok(vectors)
    });

    let chunker = SemanticChunker::builder()
        .chunk_size(10_000)
        .chunk_overlap(0)
        .window_size(1)
        .skip_window(1)
        .reconnect_similarity_threshold(0.9)
        .max_aside_length(200)
        .min_characters_per_sentence(1)
        .embedder(embedder)
        .build()
        .unwrap();

    let pieces = chunker.split_text(text);

    assert_eq!(
        pieces.len(),
        1,
        "plain split_text must use the configured batch embedder"
    );
    assert!(pieces[0].contains("Aside one"));
    assert!(pieces[0].contains("Alpha four"));
}
/// A `SemchunkSplitter` built with a custom `length_fn` must be cloneable, and
/// the clone must split the same input into the same result as the original.
#[test]
fn configured_splitters_are_cloneable() {
    let sample = "one two three";

    // Length measured in whitespace-separated words, never less than one.
    let word_count = Arc::new(|text: &str| {
        let words = text.split_whitespace().count();
        words.max(1)
    });

    let splitter = SemchunkSplitter::builder()
        .chunk_size(20)
        .chunk_overlap(2)
        .length_fn(word_count)
        .build()
        .unwrap();
    let twin = splitter.clone();

    let from_original = splitter.split_text(sample);
    let from_clone = twin.split_text(sample);
    assert_eq!(from_original, from_clone);
}