julienne 0.1.0

Range-preserving Rust text chunkers for retrieval and embedding pipelines
Documentation
use std::sync::Arc;

use julienne::{ChunkSizer, FunctionSizer, SemchunkSplitter};

/// Example entry point: measures a string with a custom word-count sizer, then
/// splits a short sentence with a `SemchunkSplitter` that uses the same
/// word-count function as its length measure, printing each resulting chunk.
fn main() {
    // A sizer backed by a plain closure: the "size" of a text is the number
    // of whitespace-separated words it contains.
    let sizer = FunctionSizer::from(|text: &str| text.split_whitespace().count());
    let measured = sizer.size("one two three");
    println!("custom size: {}", measured);

    // The splitter is given an equivalent word-counting closure as its length
    // function. NOTE(review): chunk_size/chunk_overlap are presumably expressed
    // in the units returned by `length_fn` (words here) — confirm against the
    // crate documentation.
    let splitter = SemchunkSplitter::builder()
        .chunk_size(6)
        .chunk_overlap(2)
        .length_fn(Arc::new(|text: &str| text.split_whitespace().count()))
        .build()
        .expect("valid semchunk config");

    // Print every chunk on its own line, in the order the splitter yields them.
    let input = "one two three four five six seven eight";
    for chunk in splitter.split_text(input) {
        println!("{chunk}");
    }
}