code_chunker/sizer.rs
1//! Chunk-size measurement strategy.
2
/// Strategy for measuring how large a chunk is when comparing it against a
/// size budget.
///
/// A `ChunkSizer` is consulted by [`CodeChunker`](crate::CodeChunker) for two
/// decisions: whether a node fits within `max_chunk_size`, and whether atomic
/// chunks should be merged.
///
/// The default implementation is [`ByteSizer`], which measures byte length.
/// To size chunks in tokens instead, supply a tokenizer-backed sizer; that
/// lets the budget match your embedding model's actual context limit rather
/// than approximating it with bytes.
///
/// # Units
///
/// `max_chunk_size` carries no unit of its own: it is interpreted in whatever
/// unit the sizer returns (bytes for `ByteSizer`, tokens for a
/// tokenizer-backed sizer).
///
/// Implementors must be `Send + Sync`.
pub trait ChunkSizer: Send + Sync {
    /// Measure `text` and return its size, expressed in this sizer's unit.
    fn size(&self, text: &str) -> usize;
}
17
/// Default sizer: returns the byte length of the chunk text.
///
/// This is a zero-sized unit type with no configuration, so every value is
/// interchangeable with every other. It derives the common comparison and
/// hashing traits so it can be compared, used as a map/set key, or embedded
/// in types that derive those traits themselves.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct ByteSizer;
21
22impl ChunkSizer for ByteSizer {
23 fn size(&self, text: &str) -> usize {
24 text.len()
25 }
26}