Skip to main content

solo_storage/document/
chunk.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Split a document's text into chunks for embedding.
4//!
5//! ## Strategy
6//!
7//! 1. Split the input on paragraph boundaries (`\n\n` or a Markdown-style
8//!    heading line).
9//! 2. Accumulate paragraphs into a chunk until the running token count
10//!    reaches `target_tokens`. Emit, then start the next chunk with the
11//!    last ~`overlap_tokens` worth of characters from the just-emitted
12//!    chunk (to preserve context across boundaries).
13//! 3. If a single paragraph itself exceeds `target_tokens * 1.5`, slide
14//!    a window across it, preferring to break on sentence-ending
15//!    punctuation (`.`, `!`, `?`, newline) within the last ~10% of the
16//!    window.
17//!
18//! All offsets are byte offsets into the original `text`. They MUST land
19//! on UTF-8 character boundaries — the implementation walks
20//! `text.char_indices()` to guarantee that.
21//!
22//! Token counting is approximated as `chars / 4`. This is good enough for
23//! English; for non-Latin scripts it under-estimates by ~2x, which means
24//! chunks may come out a bit larger than expected. The approximation
25//! lives in [`approx_token_count`] and is intentionally not pluggable —
26//! the writer-actor (P3) re-derives `token_count` per chunk from the
27//! same fn so there's no drift between the chunker and the persisted
28//! metadata.
29
30/// Configuration for [`chunk_text`].
31///
32/// Field defaults (500 / 50) come from the v0.7.0 plan; values were chosen
33/// to keep each chunk well under the 8K-token context of typical embedder
34/// models while still capturing a meaningful semantic unit.
35#[derive(Debug, Clone, PartialEq, Eq)]
36pub struct ChunkConfig {
37    /// Target tokens per chunk (approximation: chars/4).
38    pub target_tokens: u32,
39    /// Tokens of overlap between consecutive chunks. Should be < target.
40    pub overlap_tokens: u32,
41}
42
43impl Default for ChunkConfig {
44    fn default() -> Self {
45        Self {
46            target_tokens: 500,
47            overlap_tokens: 50,
48        }
49    }
50}
51
52/// One chunk's specification.
53///
54/// The writer-actor (P3) materializes a `ChunkSpec` into a
55/// [`solo_core::DocumentChunk`] by allocating a fresh `ChunkId`, setting
56/// `doc_id`, assigning `chunk_index`, and stamping `created_at_ms`. Holding
57/// those concerns out of the chunker keeps it a pure function from
58/// (text, config) → list of substrings + offsets.
59#[derive(Debug, Clone, PartialEq, Eq)]
60pub struct ChunkSpec {
61    /// The chunk's text content (slice from the original document).
62    pub content: String,
63    /// Byte offset in the original document where this chunk starts.
64    pub start_offset: u32,
65    /// Byte offset in the original document where this chunk ends (exclusive).
66    pub end_offset: u32,
67    /// Approximate token count (chars/4) for this chunk's content.
68    pub token_count: u32,
69}
70
71/// Approximate token count: 1 token ≈ 4 characters (English heuristic).
72pub(crate) fn approx_token_count(text: &str) -> u32 {
73    let chars = text.chars().count();
74    // Saturating to u32 is fine — texts > 17 GB chars are out of scope.
75    u32::try_from(chars / 4).unwrap_or(u32::MAX)
76}
77
78/// Split `text` into chunks per `config`.
79///
80/// Contracts (enforced by tests):
81///
82///   * Empty input → empty `Vec`.
83///   * Text whose token count ≤ `target * 1.5` → exactly one chunk
84///     spanning the whole text.
85///   * Otherwise → N ≥ 2 chunks. Each chunk's content is a contiguous
86///     byte-slice of `text` (no synthesis). `start_offset` /
87///     `end_offset` fall on UTF-8 char boundaries.
88///   * Offsets are monotonically increasing across the returned `Vec`:
89///     for consecutive chunks `start[i+1] < end[i]` (overlap) and
90///     `end[i+1] > end[i]` (forward progress).
91pub fn chunk_text(text: &str, config: &ChunkConfig) -> Vec<ChunkSpec> {
92    if text.is_empty() {
93        return Vec::new();
94    }
95    let target = config.target_tokens.max(1);
96    let overlap = config.overlap_tokens.min(target.saturating_sub(1));
97    // A single-chunk emit if the whole text comfortably fits.
98    let total_tokens = approx_token_count(text);
99    if total_tokens <= target.saturating_mul(3) / 2 {
100        return vec![ChunkSpec {
101            content: text.to_string(),
102            start_offset: 0,
103            end_offset: u32::try_from(text.len()).unwrap_or(u32::MAX),
104            token_count: total_tokens,
105        }];
106    }
107
108    let paragraphs = split_paragraphs(text);
109    let oversize_threshold = target.saturating_mul(3) / 2;
110
111    let mut chunks: Vec<ChunkSpec> = Vec::new();
112    let mut cursor_start: usize = 0; // byte offset of the chunk currently being assembled
113    let mut cursor_end: usize = 0;   // byte offset just past the last paragraph appended
114    let mut cursor_tokens: u32 = 0;
115
116    for p in &paragraphs {
117        let p_tokens = approx_token_count(&text[p.start..p.end]);
118
119        // Oversized paragraph — flush whatever we have, then slide-window
120        // across the paragraph itself.
121        if p_tokens >= oversize_threshold {
122            if cursor_end > cursor_start {
123                push_chunk(&mut chunks, text, cursor_start, cursor_end);
124            }
125            slide_window(&mut chunks, text, p.start, p.end, target, overlap);
126            cursor_start = window_overlap_start(text, p.end, overlap);
127            cursor_end = cursor_start;
128            cursor_tokens = 0;
129            continue;
130        }
131
132        // Would adding this paragraph overshoot the target? Flush + restart.
133        // The "would overshoot" check is `cursor_tokens + p_tokens > target * 1.5`
134        // so we keep paragraphs whole when feasible.
135        if cursor_end > cursor_start && cursor_tokens + p_tokens > oversize_threshold {
136            push_chunk(&mut chunks, text, cursor_start, cursor_end);
137            cursor_start = window_overlap_start(text, cursor_end, overlap);
138            // cursor_end intentionally NOT reset here — it's overwritten
139            // unconditionally below at `cursor_end = p.end`.
140        }
141
142        // Append the paragraph to the current chunk window.
143        cursor_end = p.end;
144        cursor_tokens = approx_token_count(&text[cursor_start..cursor_end]);
145
146        // If we now sit exactly at or above target, flush — but only when we
147        // haven't already; staying within the [target/2, target*1.5] band.
148        if cursor_tokens >= target {
149            push_chunk(&mut chunks, text, cursor_start, cursor_end);
150            cursor_start = window_overlap_start(text, cursor_end, overlap);
151            cursor_end = cursor_start;
152            // cursor_tokens recomputed at the top of the next iteration if needed
153        }
154    }
155
156    // Trailing chunk: anything pending after the last paragraph.
157    if cursor_end > cursor_start {
158        push_chunk(&mut chunks, text, cursor_start, cursor_end);
159    }
160
161    chunks
162}
163
164/// A single paragraph window into the source text.
165#[derive(Debug, Clone, Copy)]
166struct Paragraph {
167    start: usize,
168    /// Exclusive byte offset; includes the trailing paragraph separator
169    /// so that consecutive paragraphs concatenate to the original text
170    /// without gaps.
171    end: usize,
172}
173
174/// Split on `\n\n` (paragraph) boundaries. Each paragraph's `[start, end)`
175/// includes any blank-line separator that immediately follows, so
176/// concatenating all paragraphs reconstructs `text` byte-for-byte.
177fn split_paragraphs(text: &str) -> Vec<Paragraph> {
178    let bytes = text.as_bytes();
179    let n = bytes.len();
180    let mut out = Vec::new();
181    let mut start = 0usize;
182    let mut i = 0usize;
183    while i < n {
184        // Find the next "\n\n" (or end-of-string).
185        if i + 1 < n && bytes[i] == b'\n' && bytes[i + 1] == b'\n' {
186            // Skip past the full run of newlines so the next paragraph
187            // doesn't start with whitespace it can never trim away (the
188            // chunker preserves byte offsets exactly).
189            let mut j = i + 2;
190            while j < n && bytes[j] == b'\n' {
191                j += 1;
192            }
193            out.push(Paragraph { start, end: j });
194            start = j;
195            i = j;
196            continue;
197        }
198        i += 1;
199    }
200    if start < n {
201        out.push(Paragraph { start, end: n });
202    }
203    out
204}
205
206/// Compute the start of an overlap window of approximately `overlap` tokens
207/// (≈ `overlap * 4` chars) ending at `end`. The returned position is
208/// guaranteed to be a UTF-8 char boundary.
209fn window_overlap_start(text: &str, end: usize, overlap: u32) -> usize {
210    if overlap == 0 || end == 0 {
211        return end;
212    }
213    let target_chars = (overlap as usize) * 4;
214    let mut count = 0usize;
215    // Iterate char-indices in reverse from `end`.
216    let prefix = &text[..end];
217    for (idx, _ch) in prefix.char_indices().rev() {
218        count += 1;
219        if count > target_chars {
220            return idx;
221        }
222    }
223    0
224}
225
226fn push_chunk(out: &mut Vec<ChunkSpec>, text: &str, start: usize, end: usize) {
227    debug_assert!(start < end, "push_chunk: empty range [{start},{end})");
228    debug_assert!(text.is_char_boundary(start), "start {start} not on char boundary");
229    debug_assert!(text.is_char_boundary(end), "end {end} not on char boundary");
230    let slice = &text[start..end];
231    out.push(ChunkSpec {
232        content: slice.to_string(),
233        start_offset: u32::try_from(start).unwrap_or(u32::MAX),
234        end_offset: u32::try_from(end).unwrap_or(u32::MAX),
235        token_count: approx_token_count(slice),
236    });
237}
238
239/// Slide a fixed-token window across `text[range_start..range_end]`,
240/// preferring sentence-ending punctuation in the last ~10% of the window.
241///
242/// Used when a single paragraph is too large to fit a single chunk.
243fn slide_window(
244    out: &mut Vec<ChunkSpec>,
245    text: &str,
246    range_start: usize,
247    range_end: usize,
248    target: u32,
249    overlap: u32,
250) {
251    let target_chars = (target as usize) * 4;
252    let mut window_start = range_start;
253    while window_start < range_end {
254        // Provisional end at target_chars; then nudge to the nearest
255        // sentence-ending punctuation if one exists in the last 10%.
256        let mut chars_seen = 0usize;
257        let mut window_end = range_end;
258        let suffix = &text[window_start..range_end];
259        for (idx, _ch) in suffix.char_indices() {
260            chars_seen += 1;
261            if chars_seen >= target_chars {
262                window_end = window_start + idx;
263                break;
264            }
265        }
266        // If the natural end is within ~10% of the target, that's fine.
267        // Otherwise, look back through the last 10% of the window for a
268        // sentence-ending punctuation char.
269        if window_end < range_end {
270            let lookback = target_chars / 10;
271            let snap = find_sentence_break(text, window_start, window_end, lookback);
272            window_end = snap;
273        }
274        if window_end <= window_start {
275            // Defensive: don't loop forever. Force at least one char.
276            let next = text[window_start..range_end]
277                .char_indices()
278                .next()
279                .map(|(_, c)| window_start + c.len_utf8())
280                .unwrap_or(range_end);
281            window_end = next;
282        }
283        push_chunk(out, text, window_start, window_end);
284        if window_end >= range_end {
285            break;
286        }
287        let next_start = window_overlap_start(text, window_end, overlap);
288        // Guarantee monotonic progress (avoid infinite loop on pathological text).
289        window_start = next_start.max(window_start + 1);
290        // Re-align to char boundary going forward — `+1` may land mid-char.
291        while window_start < range_end && !text.is_char_boundary(window_start) {
292            window_start += 1;
293        }
294    }
295}
296
297/// Within `[window_end - lookback, window_end)`, find the byte offset just
298/// past the last `.`, `!`, `?`, or `\n`. If none, return `window_end` unchanged.
299fn find_sentence_break(
300    text: &str,
301    window_start: usize,
302    window_end: usize,
303    lookback_chars: usize,
304) -> usize {
305    let bytes = text.as_bytes();
306    // Determine the byte offset of the start of the look-back region.
307    let look_start = {
308        let prefix = &text[window_start..window_end];
309        let mut count = 0usize;
310        let mut start_idx = 0usize;
311        for (idx, _ch) in prefix.char_indices().rev() {
312            count += 1;
313            if count >= lookback_chars {
314                start_idx = window_start + idx;
315                break;
316            }
317        }
318        if count < lookback_chars {
319            window_start
320        } else {
321            start_idx
322        }
323    };
324    // Walk back from window_end looking for terminal punctuation.
325    let mut i = window_end;
326    while i > look_start {
327        let prev = match text[..i].char_indices().next_back() {
328            Some((idx, _)) => idx,
329            None => break,
330        };
331        let ch = bytes[prev];
332        if ch == b'.' || ch == b'!' || ch == b'?' || ch == b'\n' {
333            return i; // include the punctuation char
334        }
335        i = prev;
336    }
337    window_end
338}
339
340// ---------------------------------------------------------------------------
341// Tests
342// ---------------------------------------------------------------------------
343
344#[cfg(test)]
345mod tests {
346    use super::*;
347
348    #[test]
349    fn chunk_empty_text_returns_empty_vec() {
350        let out = chunk_text("", &ChunkConfig::default());
351        assert!(out.is_empty());
352    }
353
354    #[test]
355    fn chunk_short_text_returns_single_chunk() {
356        // Way under target — should fit in one chunk.
357        let text = "Hello world. This is a tiny doc.";
358        let out = chunk_text(text, &ChunkConfig::default());
359        assert_eq!(out.len(), 1);
360        assert_eq!(out[0].content, text);
361        assert_eq!(out[0].start_offset, 0);
362        assert_eq!(out[0].end_offset as usize, text.len());
363    }
364
365    /// Build a multi-paragraph synthetic text large enough to force multiple
366    /// chunks. Each paragraph is ~200 chars (~50 tokens by chars/4), so
367    /// at target=500 / overlap=50 we expect roughly target/50 = 10
368    /// paragraphs per chunk → N chunks ≈ paragraph_count / 10.
369    fn synthetic_text(paragraph_count: usize, words_per_paragraph: usize) -> String {
370        let mut s = String::new();
371        for p in 0..paragraph_count {
372            for w in 0..words_per_paragraph {
373                if w > 0 {
374                    s.push(' ');
375                }
376                s.push_str(&format!("word{w:02}"));
377            }
378            s.push('.');
379            if p + 1 < paragraph_count {
380                s.push_str("\n\n");
381            }
382        }
383        s
384    }
385
386    #[test]
387    fn chunk_long_text_splits_into_multiple() {
388        // ~500 paragraphs of ~50 chars each = ~25 000 chars = ~6 250 tokens.
389        // At target=500, expect ~12+ chunks.
390        let text = synthetic_text(500, 8); // ~50 chars/paragraph
391        let cfg = ChunkConfig::default();
392        let out = chunk_text(&text, &cfg);
393        assert!(out.len() > 1, "expected multiple chunks, got {}", out.len());
394        // Every chunk's content must round-trip exactly with its offsets.
395        for c in &out {
396            let slice = &text[c.start_offset as usize..c.end_offset as usize];
397            assert_eq!(slice, c.content.as_str());
398        }
399    }
400
401    #[test]
402    fn chunk_respects_paragraph_boundaries() {
403        // A handful of well-separated paragraphs. Each is small (~30 chars)
404        // so the chunker should group them but NOT split mid-paragraph.
405        let text = synthetic_text(40, 6);
406        let cfg = ChunkConfig {
407            target_tokens: 50,
408            overlap_tokens: 5,
409        };
410        let out = chunk_text(&text, &cfg);
411        assert!(out.len() > 1);
412        // None of the chunk boundaries should fall in the middle of "wordNN"
413        // — they should land on or near \n\n boundaries.
414        for c in &out {
415            let last_char = c.content.chars().last().unwrap();
416            // The last char should be either '.' (sentence end) or '\n' or a digit
417            // (when the chunker had to cut mid-paragraph because the next
418            // paragraph would overshoot too much). For this size of synthetic
419            // input, the chunker should mostly land on sentence ends.
420            assert!(
421                last_char == '.' || last_char == '\n' || last_char.is_ascii_alphanumeric(),
422                "chunk ends mid-token at: {:?}",
423                &c.content[c.content.len().saturating_sub(20)..]
424            );
425        }
426    }
427
428    #[test]
429    fn chunk_target_size_band() {
430        // Most chunks should fall within [target/2, target*1.5] tokens.
431        let text = synthetic_text(300, 8);
432        let cfg = ChunkConfig {
433            target_tokens: 100,
434            overlap_tokens: 10,
435        };
436        let out = chunk_text(&text, &cfg);
437        assert!(out.len() >= 3, "need enough chunks to evaluate band");
438        // Excluding the last (trailing) chunk, every chunk must be in band.
439        let lower = cfg.target_tokens / 2;
440        let upper = cfg.target_tokens * 3 / 2;
441        for (i, c) in out.iter().enumerate().take(out.len() - 1) {
442            assert!(
443                c.token_count >= lower && c.token_count <= upper,
444                "chunk {i} out of band: token_count={} band=[{lower},{upper}]",
445                c.token_count,
446            );
447        }
448    }
449
450    #[test]
451    fn chunk_offsets_monotonic_with_overlap() {
452        let text = synthetic_text(200, 8);
453        let cfg = ChunkConfig {
454            target_tokens: 100,
455            overlap_tokens: 10,
456        };
457        let out = chunk_text(&text, &cfg);
458        assert!(out.len() >= 2);
459        for window in out.windows(2) {
460            let a = &window[0];
461            let b = &window[1];
462            // Forward progress
463            assert!(
464                b.end_offset > a.end_offset,
465                "end_offset must increase across chunks: {} -> {}",
466                a.end_offset,
467                b.end_offset
468            );
469            // Overlap: next chunk starts at or before previous chunk's end.
470            // With overlap_tokens=10, b.start_offset should typically be
471            // a.end_offset - (~40 chars). Allow equality for cases where
472            // the chunker can't find any overlap (rare).
473            assert!(
474                b.start_offset <= a.end_offset,
475                "next chunk should overlap or abut the previous: a.end={} b.start={}",
476                a.end_offset,
477                b.start_offset,
478            );
479        }
480    }
481
482    #[test]
483    fn chunk_utf8_safe_offsets() {
484        // Multi-byte chars (Japanese + accented Latin). The chunker must
485        // never panic and must produce offsets that land on char boundaries.
486        let para = "こんにちは世界。これは日本語のテストです。Caféの紅茶。".repeat(40);
487        let text = format!(
488            "{para}\n\n{}",
489            "Bonjour le monde. Voici un test en français.".repeat(40)
490        );
491        let cfg = ChunkConfig {
492            target_tokens: 80,
493            overlap_tokens: 10,
494        };
495        let out = chunk_text(&text, &cfg);
496        assert!(!out.is_empty());
497        for c in &out {
498            let s = c.start_offset as usize;
499            let e = c.end_offset as usize;
500            assert!(text.is_char_boundary(s), "start {s} not on char boundary");
501            assert!(text.is_char_boundary(e), "end {e} not on char boundary");
502            // The slice must equal the chunk content byte-for-byte.
503            assert_eq!(&text[s..e], c.content);
504        }
505    }
506
507    #[test]
508    fn chunk_very_large_text() {
509        // ~100KB text — should complete well under 1s.
510        let text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. ".repeat(1_800);
511        assert!(text.len() > 100_000);
512        let cfg = ChunkConfig::default();
513        let start = std::time::Instant::now();
514        let out = chunk_text(&text, &cfg);
515        let elapsed = start.elapsed();
516        assert!(out.len() > 10, "expected many chunks, got {}", out.len());
517        assert!(
518            elapsed < std::time::Duration::from_secs(2),
519            "100KB chunking too slow: {elapsed:?}"
520        );
521    }
522
523    #[test]
524    fn chunk_token_count_approximation() {
525        // 40 ASCII chars → 40/4 = 10 tokens.
526        assert_eq!(approx_token_count("0123456789".repeat(4).as_str()), 10);
527        // 12 multibyte chars → 12/4 = 3 tokens (counts CHARS not bytes).
528        assert_eq!(approx_token_count("あいうえおかきくけこさし"), 3);
529        // Empty → 0.
530        assert_eq!(approx_token_count(""), 0);
531    }
532
533    #[test]
534    fn chunk_oversized_paragraph_slides_window() {
535        // A single huge paragraph with no \n\n boundaries — the chunker
536        // must still split it into multiple chunks via sentence sliding.
537        let sentence = "This is a sentence with several words in it. ".to_string();
538        let mega_paragraph = sentence.repeat(200); // ~9000 chars ≈ 2250 tokens
539        let cfg = ChunkConfig {
540            target_tokens: 100,
541            overlap_tokens: 10,
542        };
543        let out = chunk_text(&mega_paragraph, &cfg);
544        assert!(
545            out.len() >= 3,
546            "expected oversized paragraph to be split, got {} chunks",
547            out.len()
548        );
549        // Offsets must still be monotonic + char-boundary safe.
550        for c in &out {
551            let s = c.start_offset as usize;
552            let e = c.end_offset as usize;
553            assert!(mega_paragraph.is_char_boundary(s));
554            assert!(mega_paragraph.is_char_boundary(e));
555            assert_eq!(&mega_paragraph[s..e], c.content);
556        }
557    }
558
559    #[test]
560    fn chunk_config_default_is_500_50() {
561        let c = ChunkConfig::default();
562        assert_eq!(c.target_tokens, 500);
563        assert_eq!(c.overlap_tokens, 50);
564    }
565
566    #[test]
567    fn chunk_text_offsets_cover_input_modulo_overlap() {
568        // The first chunk starts at 0, the last chunk ends at text.len().
569        let text = synthetic_text(80, 8);
570        let cfg = ChunkConfig::default();
571        let out = chunk_text(&text, &cfg);
572        assert!(!out.is_empty());
573        assert_eq!(out[0].start_offset, 0);
574        assert_eq!(out.last().unwrap().end_offset as usize, text.len());
575    }
576}