use std::sync::Arc;
use julienne::SemanticChunker;
/// Maps `text` onto a two-component keyword-frequency vector.
///
/// The input is lowercased first, so matching is case-insensitive. Keywords
/// are counted as substrings, which means "forecasts" contributes to the
/// "forecast" count.
///
/// Returns `[database_hits, weather_hits]`, where each component is the total
/// number of keyword occurrences for that group.
fn keyword_embedding(text: &str) -> Vec<f32> {
    const DATABASE_TERMS: [&str; 4] = ["sql", "database", "query", "vector"];
    const WEATHER_TERMS: [&str; 4] = ["weather", "rain", "storm", "forecast"];

    let haystack = text.to_lowercase();

    // Totals the substring hits in `haystack` across one keyword group.
    let score = |terms: &[&str]| -> f32 {
        terms
            .iter()
            .map(|term| haystack.matches(*term).count() as f32)
            .sum::<f32>()
    };

    vec![score(&DATABASE_TERMS), score(&WEATHER_TERMS)]
}
/// Demo entry point: builds a `SemanticChunker` that embeds text with
/// `keyword_embedding`, splits a four-sentence sample, and prints every
/// resulting chunk on its own line.
///
/// # Panics
///
/// Panics with "valid semantic chunker config" if the builder rejects the
/// configuration.
fn main() {
    // Two sentences about databases followed by two about weather.
    let sample = "SQL queries use vector indexes. Database rows store embeddings. \
                  Weather forecasts predict rain. Storm alerts warn cities.";

    let builder = SemanticChunker::builder()
        .chunk_size(500)
        .chunk_overlap(50)
        .window_size(2)
        .min_characters_per_sentence(1)
        .embedding_fn(Arc::new(keyword_embedding));
    let chunker = builder.build().expect("valid semantic chunker config");

    for piece in chunker.split_text(sample) {
        println!("{piece}");
    }
}