use julienne::{
ByteSizer, CharSizer, CharacterTextSplitter, ChunkConfig, ChunkError, ChunkSizer,
FunctionSizer, SemanticChunker, SemchunkSplitter, SentenceChunker, TokenBoundaryProvider,
TokenChunker, TokenSpan, WordSizer,
};
/// Checks the character, byte, and word sizers against a sample that contains a
/// multi-byte character, so the three measurements all differ.
#[test]
fn built_in_sizers_are_deterministic() {
    let sample = "café au lait";

    // 'é' is two bytes in UTF-8, which is why the byte count exceeds the char count.
    let char_count = CharSizer.size(sample);
    let byte_count = ByteSizer.size(sample);
    let word_count = WordSizer.size(sample);

    assert_eq!(char_count, 12);
    assert_eq!(byte_count, 13);
    assert_eq!(word_count, 3);
}
/// Verifies that a closure wrapped via `FunctionSizer::from` is what `size` evaluates.
#[test]
fn closure_sizer_adapter_preserves_custom_length_functions() {
    // Custom length: number of comma-separated fields (commas plus one).
    let count_fields = |text: &str| text.matches(',').count() + 1;
    let sizer = FunctionSizer::from(count_fields);

    assert_eq!(sizer.size("a,b,c"), 3);
}
/// Confirms that `ChunkConfig` is parameterised by its sizer: the explicit type
/// annotations must compile, and each config measures with the sizer it was given.
#[test]
fn chunk_config_type_encodes_sizer() {
    let phrase = "one two";

    let by_chars: ChunkConfig<CharSizer> = ChunkConfig::new(100, 10, CharSizer);
    let by_words: ChunkConfig<WordSizer> = ChunkConfig::new(30, 5, WordSizer);

    // Same text, different units: seven characters versus two words.
    assert_eq!(by_chars.sizer.size(phrase), 7);
    assert_eq!(by_words.sizer.size(phrase), 2);
}
/// Builds every splitter with `WordSizer` and checks that the resulting type carries
/// the sizer as a type parameter and that the expected chunks come out.
///
/// All four splitters are constructed before any assertion runs, so a builder
/// failure surfaces before the output comparisons.
#[test]
fn splitters_type_encode_active_sizer() {
    let by_separator: CharacterTextSplitter<WordSizer> = CharacterTextSplitter::builder()
        .separator(" ")
        .chunk_size(3)
        .chunk_overlap(1)
        .sizer(WordSizer)
        .build()
        .unwrap();
    let by_sentence: SentenceChunker<WordSizer> = SentenceChunker::builder()
        .chunk_size(4)
        .chunk_overlap(2)
        .min_characters_per_sentence(1)
        .sizer(WordSizer)
        .build()
        .unwrap();
    let by_semchunk: SemchunkSplitter<WordSizer> = SemchunkSplitter::builder()
        .chunk_size(4)
        .chunk_overlap(1)
        .sizer(WordSizer)
        .build()
        .unwrap();
    let by_semantics: SemanticChunker<WordSizer> = SemanticChunker::builder()
        .chunk_size(4)
        .chunk_overlap(2)
        .window_size(0)
        .min_characters_per_sentence(1)
        .sizer(WordSizer)
        .build()
        .unwrap();

    // The sentence-based splitters share one input and one expected result.
    let prose = "One two. Three four. Five six.";
    let expected_sentence_chunks = vec!["One two. Three four.", "Three four. Five six."];

    let separator_chunks = by_separator.split_text("one two three four");
    assert_eq!(separator_chunks, vec!["one two three", "three four"]);

    let sentence_chunks = by_sentence.split_text(prose);
    assert_eq!(sentence_chunks, expected_sentence_chunks);

    let semchunk_chunks = by_semchunk.split_text("one two three four five");
    assert_eq!(semchunk_chunks, vec!["one two three four", "four five"]);

    let semantic_chunks = by_semantics.split_text(prose);
    assert_eq!(semantic_chunks, expected_sentence_chunks);
}
#[derive(Clone)]
struct WhitespaceTokenProvider;
impl TokenBoundaryProvider for WhitespaceTokenProvider {
fn token_spans(&self, input: &str) -> Result<Vec<TokenSpan>, julienne::ChunkError> {
let mut spans = Vec::new();
let mut in_token = None;
for (idx, ch) in input.char_indices() {
if ch.is_whitespace() {
if let Some(start) = in_token.take() {
spans.push(TokenSpan {
start_byte: start,
end_byte: idx,
});
}
} else if in_token.is_none() {
in_token = Some(idx);
}
}
if let Some(start) = in_token {
spans.push(TokenSpan {
start_byte: start,
end_byte: input.len(),
});
}
Ok(spans)
}
}
/// Test provider that ignores its input and always reports the single span 1..99.
#[derive(Clone)]
struct InvalidTokenProvider;

impl TokenBoundaryProvider for InvalidTokenProvider {
    /// Always succeeds with one fixed span covering bytes 1..99.
    fn token_spans(&self, _input: &str) -> Result<Vec<TokenSpan>, ChunkError> {
        let fixed_span = TokenSpan {
            start_byte: 1,
            end_byte: 99,
        };
        Ok(vec![fixed_span])
    }
}
/// Test provider that ignores its input and always reports one zero-length span at offset 0.
#[derive(Clone)]
struct EmptyTokenProvider;

impl TokenBoundaryProvider for EmptyTokenProvider {
    /// Always succeeds with a single span whose start and end are both byte 0.
    fn token_spans(&self, _input: &str) -> Result<Vec<TokenSpan>, ChunkError> {
        let zero_length = TokenSpan {
            start_byte: 0,
            end_byte: 0,
        };
        Ok(vec![zero_length])
    }
}
/// Test provider that ignores its input and always reports the single span 1..2.
///
/// When the input begins with a multi-byte character, byte 1 is not a UTF-8
/// character boundary.
#[derive(Clone)]
struct NonBoundaryTokenProvider;

impl TokenBoundaryProvider for NonBoundaryTokenProvider {
    /// Always succeeds with one fixed span covering bytes 1..2.
    fn token_spans(&self, _input: &str) -> Result<Vec<TokenSpan>, ChunkError> {
        let second_byte_only = TokenSpan {
            start_byte: 1,
            end_byte: 2,
        };
        Ok(vec![second_byte_only])
    }
}
/// Test provider that ignores its input and always reports two spans sharing byte 2.
#[derive(Clone)]
struct OverlappingTokenProvider;

impl TokenBoundaryProvider for OverlappingTokenProvider {
    /// Always succeeds with the spans 0..3 and 2..4, in that order.
    fn token_spans(&self, _input: &str) -> Result<Vec<TokenSpan>, ChunkError> {
        let first = TokenSpan {
            start_byte: 0,
            end_byte: 3,
        };
        // Starts before `first` ends, so the two spans overlap.
        let second = TokenSpan {
            start_byte: 2,
            end_byte: 4,
        };
        Ok(vec![first, second])
    }
}
/// Test provider whose tokenisation always fails.
#[derive(Clone)]
struct FailingTokenProvider;

impl TokenBoundaryProvider for FailingTokenProvider {
    /// Always returns an error built with `ChunkError::invalid_configuration`.
    fn token_spans(&self, _input: &str) -> Result<Vec<TokenSpan>, ChunkError> {
        let failure = ChunkError::invalid_configuration("tokenizer unavailable");
        Err(failure)
    }
}
/// Splits six whitespace-separated words with a window of 3 tokens and an overlap of 1,
/// then checks chunk texts, measured lengths, and that every chunk's byte range maps
/// back onto the original input.
#[test]
fn token_chunker_uses_exact_token_windows_and_overlap() {
    let input = "one two three four five six";
    let chunker = TokenChunker::new(WhitespaceTokenProvider, 3, 1).unwrap();
    let chunks = chunker.try_split_chunks(input).unwrap();

    let texts: Vec<_> = chunks.iter().map(|chunk| chunk.text).collect();
    assert_eq!(texts, vec!["one two three", "three four five", "five six"]);

    let lengths: Vec<_> = chunks.iter().map(|chunk| chunk.measured_length).collect();
    assert_eq!(lengths, vec![3, 3, 2]);

    // The reported byte offsets must slice the source to exactly the chunk text.
    for chunk in &chunks {
        let sliced = &input[chunk.start_byte..chunk.end_byte];
        assert_eq!(sliced, chunk.text);
    }
}
/// A provider span ending at byte 99 must be rejected for the short input used here,
/// and the rejection must be an `InvalidConfiguration` error.
#[test]
fn token_chunker_rejects_invalid_provider_spans() {
    let chunker = TokenChunker::new(InvalidTokenProvider, 2, 0).unwrap();

    let outcome = chunker.try_split_chunks("café");
    let rejection = outcome.expect_err("invalid provider spans must be explicit errors");

    assert!(matches!(rejection, ChunkError::InvalidConfiguration { .. }));
}
/// A zero-length provider span must be rejected with an `InvalidConfiguration` error.
#[test]
fn token_chunker_rejects_empty_token_spans() {
    let chunker = TokenChunker::new(EmptyTokenProvider, 2, 0).unwrap();

    let outcome = chunker.try_split_chunks("cafe");
    let rejection = outcome.expect_err("empty token spans must be explicit errors");

    assert!(matches!(rejection, ChunkError::InvalidConfiguration { .. }));
}
/// A provider span that starts inside a multi-byte character must be rejected with an
/// `InvalidConfiguration` error.
#[test]
fn token_chunker_rejects_non_utf8_boundary_spans() {
    let chunker = TokenChunker::new(NonBoundaryTokenProvider, 2, 0).unwrap();

    // The input opens with a multi-byte character, so the provider's byte 1 is mid-character.
    let outcome = chunker.try_split_chunks("éclair");
    let rejection = outcome.expect_err("token spans must be on UTF-8 boundaries");

    assert!(matches!(rejection, ChunkError::InvalidConfiguration { .. }));
}
/// Provider spans that overlap each other must be rejected with an
/// `InvalidConfiguration` error.
#[test]
fn token_chunker_rejects_overlapping_spans() {
    let chunker = TokenChunker::new(OverlappingTokenProvider, 2, 0).unwrap();

    let outcome = chunker.try_split_chunks("cafe");
    let rejection = outcome.expect_err("overlapping token spans must be explicit errors");

    assert!(matches!(rejection, ChunkError::InvalidConfiguration { .. }));
}
/// An error returned by the token provider must surface from `try_split_chunks`
/// as an `InvalidConfiguration` error.
#[test]
fn token_chunker_propagates_provider_errors() {
    let chunker = TokenChunker::new(FailingTokenProvider, 2, 0).unwrap();

    let outcome = chunker.try_split_chunks("cafe");
    let propagated = outcome.expect_err("provider failures must propagate");

    assert!(matches!(propagated, ChunkError::InvalidConfiguration { .. }));
}
/// Checks that the tiktoken-backed provider reports spans that can be mapped back
/// onto the source text.
///
/// Each span must be non-empty and must start and end on UTF-8 character boundaries
/// inside `input`. The previous assertion compared a slice of `input` with the
/// identical slice produced by `span_text`, so it could only fail by panicking.
#[cfg(feature = "tiktoken-rs")]
#[test]
fn tiktoken_provider_returns_source_aligned_token_spans() {
    use julienne::token::tiktoken::TiktokenBoundaryProvider;

    let provider = TiktokenBoundaryProvider::new(tiktoken_rs::cl100k_base().unwrap());
    let input = "hello world";
    let spans = provider.token_spans(input).unwrap();
    assert!(!spans.is_empty());

    for span in spans {
        let (start, end) = (span.start_byte, span.end_byte);

        // Empty or inverted spans cannot correspond to any source text.
        assert!(
            start < end,
            "token span {}..{} is empty or inverted",
            start,
            end
        );
        // `is_char_boundary` is false for offsets past the end, so this also
        // verifies that the span lies within `input`.
        assert!(
            input.is_char_boundary(start) && input.is_char_boundary(end),
            "token span {}..{} is not on UTF-8 boundaries of the input",
            start,
            end
        );
        // Slicing cannot panic after the checks above; the helper must return
        // exactly the bytes the span covers.
        assert_eq!(span_text(input, span).len(), end - start);
    }
}
/// Runs the Hugging Face-backed provider over a two-word input using a minimal
/// word-level tokenizer, and checks that the two reported spans slice out the
/// two words.
#[cfg(feature = "tokenizers")]
#[test]
fn huggingface_provider_returns_source_aligned_token_spans() {
    use julienne::token::huggingface::HuggingFaceBoundaryProvider;
    use tokenizers::models::wordlevel::WordLevel;
    use tokenizers::pre_tokenizers::whitespace::WhitespaceSplit;
    use tokenizers::Tokenizer;

    // Three-entry vocabulary: the unknown marker plus the two words in the input.
    let vocab = [
        ("[UNK]".to_string(), 0),
        ("hello".to_string(), 1),
        ("world".to_string(), 2),
    ]
    .into_iter()
    .collect();
    let word_level = WordLevel::builder()
        .vocab(vocab)
        .unk_token("[UNK]".to_string())
        .build()
        .unwrap();

    let mut tokenizer = Tokenizer::new(word_level);
    tokenizer.with_pre_tokenizer(Some(WhitespaceSplit));
    let provider = HuggingFaceBoundaryProvider::new(tokenizer);

    let input = "hello world";
    let spans = provider.token_spans(input).unwrap();
    assert_eq!(spans.len(), 2);

    let (first, second) = (&spans[0], &spans[1]);
    assert_eq!(&input[first.start_byte..first.end_byte], "hello");
    assert_eq!(&input[second.start_byte..second.end_byte], "world");
}
/// Returns the slice of `input` covered by `span`'s byte range.
///
/// Panics, like any `str` range index, if the range is out of bounds or not on
/// UTF-8 character boundaries.
#[cfg(feature = "tiktoken-rs")]
fn span_text(input: &str, span: TokenSpan) -> &str {
    let TokenSpan {
        start_byte,
        end_byte,
    } = span;
    &input[start_byte..end_byte]
}
/// Exercises the sizers that are only compiled with the `unicode-segmentation` feature.
#[cfg(feature = "unicode-segmentation")]
#[test]
fn unicode_segmentation_sizers_are_feature_gated() {
    use julienne::{GraphemeSizer, UnicodeWordSizer};

    // 'e' followed by a combining acute accent is expected to measure as one unit.
    let combining_sequence = "e\u{301}";
    assert_eq!(GraphemeSizer.size(combining_sequence), 1);

    let mixed_script = "hello, 世界";
    assert_eq!(UnicodeWordSizer.size(mixed_script), 3);
}