// Demo: split a sample paragraph with `chunkedrs` twice — once with only a
// token limit and once with an additional overlap setting — then print the
// resulting chunks and the byte ranges that consecutive chunks share.
fn main() {
    // Returns at most `max_chars` characters of `s` as a borrowed slice, plus a
    // flag telling whether anything was cut off. Counting characters (not
    // bytes) keeps the flag consistent with the preview for multi-byte text
    // such as the em dashes in the sample paragraph.
    fn preview(s: &str, max_chars: usize) -> (&str, bool) {
        match s.char_indices().nth(max_chars) {
            // `cut` is the byte offset of the first character past the limit,
            // so it is always a valid slice boundary.
            Some((cut, _)) => (&s[..cut], true),
            None => (s, false),
        }
    }

    let text = "\
        The solar system formed approximately 4.6 billion years ago from the gravitational \
        collapse of a giant molecular cloud. The vast majority of the system's mass is in \
        the Sun, with most of the remaining mass contained in Jupiter. The four inner planets \
        — Mercury, Venus, Earth, and Mars — are terrestrial planets composed primarily of \
        rock and metal. The four outer planets are giant planets, being substantially more \
        massive than the terrestrials. The two largest, Jupiter and Saturn, are gas giants \
        composed mainly of hydrogen and helium. The two outermost planets, Uranus and Neptune, \
        are ice giants composed largely of substances with relatively high melting points \
        compared with hydrogen and helium. All eight planets have roughly circular orbits \
        that lie near the plane of Earth's orbit, called the ecliptic.";

    let max_tokens = 40;
    let overlap_tokens = 10;

    // Same input and token limit; only the overlap setting differs.
    let no_overlap = chunkedrs::chunk(text).max_tokens(max_tokens).split();
    let with_overlap = chunkedrs::chunk(text)
        .max_tokens(max_tokens)
        .overlap(overlap_tokens)
        .split();

    println!("max_tokens: {max_tokens}, overlap: {overlap_tokens}\n");
    println!(
        "without overlap: {} chunks | with overlap: {} chunks\n",
        no_overlap.len(),
        with_overlap.len()
    );

    println!("--- chunks with overlap ---\n");
    for chunk in &with_overlap {
        println!(
            "chunk {} | tokens: {:>2} | bytes: {}..{}",
            chunk.index, chunk.token_count, chunk.start_byte, chunk.end_byte,
        );
        // Only show an ellipsis when the preview really dropped characters.
        let (preview, truncated) = preview(&chunk.content, 80);
        let ellipsis = if truncated { "..." } else { "" };
        println!(" \"{preview}{ellipsis}\"\n");
    }

    println!("--- overlap verification ---\n");
    // Walk adjacent pairs; `windows(2)` yields nothing for fewer than two chunks.
    for (i, pair) in with_overlap.windows(2).enumerate() {
        let (current, next) = (&pair[0], &pair[1]);
        // Two neighbours overlap when the next one starts before the current ends.
        if next.start_byte < current.end_byte {
            let shared_bytes = current.end_byte - next.start_byte;
            println!(
                "chunks {} and {} share {} bytes (bytes {}..{})",
                i,
                i + 1,
                shared_bytes,
                next.start_byte,
                current.end_byte,
            );
            // NOTE(review): assumes the chunk offsets index into `text` and fall
            // on UTF-8 character boundaries; the slice panics otherwise — confirm
            // against the chunkedrs documentation.
            let shared = &text[next.start_byte..current.end_byte];
            let (preview, _) = preview(shared, 60);
            println!(" shared: \"{preview}\"\n");
        }
    }
}