//! julienne 0.1.0
//!
//! Range-preserving Rust text chunkers for retrieval and embedding pipelines.
//! Documentation
use std::sync::Arc;

use julienne::SemanticChunker;

/// Builds a toy two-component embedding from keyword hit counts.
///
/// The input is lowercased, then the non-overlapping substring occurrences of
/// each keyword are counted and totalled per topic:
///
/// * component 0 — `sql`, `database`, `query`, `vector`
/// * component 1 — `weather`, `rain`, `storm`, `forecast`
///
/// The returned vector always has exactly two elements.
fn keyword_embedding(text: &str) -> Vec<f32> {
    let haystack = text.to_lowercase();

    // Matching is by substring, so "forecasts" still counts as a hit for
    // "forecast", while "queries" does not contain "query".
    let topic_score = |keywords: &[&str]| -> f32 {
        keywords
            .iter()
            .map(|keyword| haystack.matches(keyword).count() as f32)
            .sum()
    };

    let database_score = topic_score(&["sql", "database", "query", "vector"]);
    let weather_score = topic_score(&["weather", "rain", "storm", "forecast"]);

    vec![database_score, weather_score]
}

/// Splits a short two-topic sample text with a `SemanticChunker` that uses
/// `keyword_embedding` as its embedding function, then prints every returned
/// chunk on its own line.
///
/// # Panics
///
/// Panics with "valid semantic chunker config" if `build()` does not yield a
/// chunker.
fn main() {
    // Two sentences about databases followed by two about weather.
    let sample = "SQL queries use vector indexes. Database rows store embeddings. \
         Weather forecasts predict rain. Storm alerts warn cities.";

    let builder = SemanticChunker::builder()
        .chunk_size(500)
        .chunk_overlap(50)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedding_fn(Arc::new(keyword_embedding));
    let chunker = builder.build().expect("valid semantic chunker config");

    for piece in chunker.split_text(sample) {
        println!("{piece}");
    }
}