use julienne::{
CharacterTextSplitter, RecursiveCharacterTextSplitter, SemchunkSplitter, SentenceChunker,
};
/// Asserts that every chunk's reported coordinates agree with `input`.
///
/// For each chunk, in iteration order, this checks that:
/// - `input[start_byte..end_byte]` equals the chunk's `text`;
/// - `start_char` and `end_char` equal the number of `char`s in `input` that
///   precede `start_byte` and `end_byte` respectively;
/// - `measured_length` equals the number of `char`s in the chunk's `text`.
///
/// # Panics
///
/// Panics on the first mismatch; the message names the zero-based index of the
/// failing chunk. Slicing `input` also panics if a reported byte offset is out
/// of range or does not fall on a `char` boundary.
fn assert_offsets<'a>(input: &'a str, chunks: impl IntoIterator<Item = julienne::TextChunk<'a>>) {
    for (index, chunk) in chunks.into_iter().enumerate() {
        let (start, end) = (chunk.start_byte, chunk.end_byte);
        assert_eq!(
            &input[start..end],
            chunk.text,
            "chunk text must be a source slice (chunk {}, bytes {}..{})",
            index,
            start,
            end
        );
        assert_eq!(
            input[..start].chars().count(),
            chunk.start_char,
            "start_char must count the chars before start_byte (chunk {}, start_byte {})",
            index,
            start
        );
        assert_eq!(
            input[..end].chars().count(),
            chunk.end_char,
            "end_char must count the chars before end_byte (chunk {}, end_byte {})",
            index,
            end
        );
        // This check presupposes that the splitter under test measures length
        // in characters rather than bytes or tokens.
        assert_eq!(
            chunk.text.chars().count(),
            chunk.measured_length,
            "measured_length must equal the chunk's char count (chunk {})",
            index
        );
    }
}
/// Splits a string containing multi-byte characters (Greek letters and an
/// emoji) on single spaces and verifies that more than one chunk is produced
/// and that every chunk's offsets match the source text.
#[test]
fn character_offsets_survive_unicode_and_overlap() {
    let source = "alpha βeta gamma δelta emoji 😀 tail";
    // Arguments are presumably (separator, chunk size, overlap) — confirm
    // against the `CharacterTextSplitter::new` signature.
    let space_splitter = CharacterTextSplitter::new(" ", 18, 8);
    let pieces = space_splitter.split_chunks(source);

    let piece_count = pieces.len();
    assert!(piece_count > 1);
    assert_offsets(source, pieces);
}
/// Splits on the regex `\s+` and verifies that at least one chunk still
/// contains a space or a tab from the input, and that every chunk's offsets
/// match the source text.
#[test]
fn character_regex_offsets_preserve_original_separators() {
    let source = "foo bar\tbaz\nqux";
    let regex_splitter = CharacterTextSplitter::builder()
        .separator_regex(r"\s+")
        .chunk_size(10)
        .chunk_overlap(0)
        .strip_whitespace(true)
        .build()
        .unwrap();
    let pieces = regex_splitter.split_chunks(source);

    // At least one chunk must carry an interior space or tab.
    let keeps_inner_whitespace = pieces
        .iter()
        .any(|piece| piece.text.contains(|c: char| matches!(c, ' ' | '\t')));
    assert!(keeps_inner_whitespace);
    assert_offsets(source, pieces);
}
/// Runs the recursive splitter over paragraphs separated by blank lines (one
/// containing a non-ASCII character) and verifies that more than one chunk is
/// produced and that every chunk's offsets match the source text.
#[test]
fn recursive_offsets_preserve_kept_separators() {
    let source = "Intro.\n\nSection one has Unicode café.\n\nSection two ends.";
    // Arguments are presumably (chunk size, overlap) — confirm against the
    // `RecursiveCharacterTextSplitter::new` signature.
    let recursive_splitter = RecursiveCharacterTextSplitter::new(28, 8);
    let pieces = recursive_splitter.split_chunks(source);

    let piece_count = pieces.len();
    assert!(piece_count > 1);
    assert_offsets(source, pieces);
}
/// Chunks an input with leading and trailing spaces and verifies that no
/// chunk starts or ends with a space while every chunk's offsets still match
/// the untrimmed source text.
#[test]
fn sentence_offsets_trim_without_losing_source_coordinates() {
    let input = "  First sentence. Second sentence? Third sentence!  ";
    let chunker = SentenceChunker::builder()
        .chunk_size(28)
        .chunk_overlap(0)
        .min_characters_per_sentence(1)
        .build()
        .expect("sentence chunker configuration should be valid");
    let chunks = chunker.split_chunks(input);

    // Every check below is an `all(..)` or a per-chunk loop, so without this
    // guard an empty result would make the whole test pass vacuously.
    assert!(
        !chunks.is_empty(),
        "chunker returned no chunks for non-empty input"
    );
    assert!(
        chunks.iter().all(|chunk| !chunk.text.starts_with(' ')),
        "chunk text must not start with a space"
    );
    assert!(
        chunks.iter().all(|chunk| !chunk.text.ends_with(' ')),
        "chunk text must not end with a space"
    );
    assert_offsets(input, chunks);
}
/// Runs the semchunk splitter over a space-padded input with several kinds of
/// punctuation and verifies that more than one chunk is produced, that no
/// chunk starts with a space, and that every chunk's offsets match the source
/// text.
#[test]
fn semchunk_offsets_cover_delimiter_reattachment_and_trimmed_input() {
    let source = "  Alpha. Beta, gamma; delta! Epsilon?  ";
    // Arguments are presumably (chunk size, overlap) — confirm against the
    // `SemchunkSplitter::new` signature.
    let sem_splitter = SemchunkSplitter::new(16, 6);
    let pieces = sem_splitter.split_chunks(source);

    let piece_count = pieces.len();
    assert!(piece_count > 1);

    let any_leading_space = pieces.iter().any(|piece| piece.text.starts_with(' '));
    assert!(!any_leading_space);
    assert_offsets(source, pieces);
}