solo-storage 0.11.5

Solo: SQLite + SQLCipher persistence layer
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
// SPDX-License-Identifier: Apache-2.0

//! Split a document's text into chunks for embedding.
//!
//! ## Strategy
//!
//! 1. Split the input on paragraph boundaries (a run of two or more
//!    `\n`; Markdown headings get no special treatment here).
//! 2. Accumulate paragraphs into a chunk until the running token count
//!    reaches `target_tokens`. Emit, then start the next chunk with the
//!    last ~`overlap_tokens` worth of characters from the just-emitted
//!    chunk (to preserve context across boundaries).
//! 3. If a single paragraph itself exceeds `target_tokens * 1.5`, slide
//!    a window across it, preferring to break on sentence-ending
//!    punctuation (`.`, `!`, `?`, newline) within the last ~10% of the
//!    window.
//!
//! All offsets are byte offsets into the original `text`. They MUST land
//! on UTF-8 character boundaries — the implementation walks
//! `text.char_indices()` to guarantee that.
//!
//! Token counting is approximated as `chars / 4`. This is good enough for
//! English; for non-Latin scripts it under-estimates by ~2x, which means
//! chunks may come out a bit larger than expected. The approximation
//! lives in [`approx_token_count`] and is intentionally not pluggable —
//! the writer-actor (P3) re-derives `token_count` per chunk from the
//! same fn so there's no drift between the chunker and the persisted
//! metadata.

/// Tuning knobs for [`chunk_text`].
///
/// The defaults (500 target tokens / 50 overlap tokens) come from the
/// v0.7.0 plan; they were chosen to keep each chunk well under the
/// 8K-token context of typical embedder models while still capturing a
/// meaningful semantic unit.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkConfig {
    /// Desired size of each chunk, in approximate tokens (chars/4).
    /// `chunk_text` treats a value of 0 as 1.
    pub target_tokens: u32,
    /// Approximate tokens shared between consecutive chunks. Should be
    /// smaller than `target_tokens`; `chunk_text` clamps it to
    /// `target_tokens - 1`.
    pub overlap_tokens: u32,
}

impl Default for ChunkConfig {
    /// Returns the 500-token target / 50-token overlap configuration.
    fn default() -> Self {
        const DEFAULT_TARGET_TOKENS: u32 = 500;
        const DEFAULT_OVERLAP_TOKENS: u32 = 50;

        Self {
            target_tokens: DEFAULT_TARGET_TOKENS,
            overlap_tokens: DEFAULT_OVERLAP_TOKENS,
        }
    }
}

/// One chunk's specification, as produced by the chunker.
///
/// The writer-actor (P3) materializes a `ChunkSpec` into a
/// [`solo_core::DocumentChunk`] by allocating a fresh `ChunkId`, setting
/// `doc_id`, assigning `chunk_index`, and stamping `created_at_ms`. Holding
/// those concerns out of the chunker keeps it a pure function from
/// (text, config) → list of substrings + offsets.
///
/// Offsets are stored as `u32`; the chunker saturates them to `u32::MAX`
/// when a byte offset does not fit.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkSpec {
    /// The chunk's text content: an owned copy of
    /// `text[start_offset..end_offset]` from the original document.
    pub content: String,
    /// Byte offset in the original document where this chunk starts
    /// (inclusive).
    pub start_offset: u32,
    /// Byte offset in the original document where this chunk ends (exclusive).
    pub end_offset: u32,
    /// Approximate token count (chars/4, rounded down) for this chunk's
    /// content.
    pub token_count: u32,
}

/// Estimate the number of tokens in `text` using the English-centric
/// heuristic of one token per four characters.
///
/// Counts Unicode scalar values (not bytes) and rounds down, so fewer
/// than four characters yields 0.
pub(crate) fn approx_token_count(text: &str) -> u32 {
    const CHARS_PER_TOKEN: usize = 4;

    let tokens = text.chars().count() / CHARS_PER_TOKEN;
    // Clamp instead of failing: inputs large enough to overflow a u32
    // token count (> ~17 billion chars) are out of scope.
    match u32::try_from(tokens) {
        Ok(count) => count,
        Err(_) => u32::MAX,
    }
}

/// Break `text` into embedding-sized chunks according to `config`.
///
/// `config.target_tokens` is treated as at least 1, and
/// `config.overlap_tokens` is clamped to `target - 1`. Token counts use
/// the chars/4 approximation.
///
/// Contracts (enforced by tests):
///
///   * Empty input → empty `Vec`.
///   * Text whose token count ≤ `target * 1.5` → exactly one chunk
///     spanning the whole text.
///   * Otherwise the text is walked paragraph by paragraph. Each chunk's
///     content is a contiguous byte-slice of `text` (no synthesis), and
///     `start_offset` / `end_offset` fall on UTF-8 char boundaries.
///   * Offsets advance across the returned `Vec`: for consecutive chunks
///     `start[i+1] <= end[i]` (overlapping, or abutting when no overlap
///     is available) and `end[i+1] > end[i]` (forward progress).
pub fn chunk_text(text: &str, config: &ChunkConfig) -> Vec<ChunkSpec> {
    if text.is_empty() {
        return Vec::new();
    }

    let target = config.target_tokens.max(1);
    let overlap = config.overlap_tokens.min(target.saturating_sub(1));
    // 1.5 × target: the ceiling for a single chunk and the size at which
    // a lone paragraph is considered oversized.
    let oversize_threshold = target.saturating_mul(3) / 2;

    // Fast path: the whole document comfortably fits in one chunk.
    let whole_tokens = approx_token_count(text);
    if whole_tokens <= oversize_threshold {
        return vec![ChunkSpec {
            content: text.to_string(),
            start_offset: 0,
            end_offset: u32::try_from(text.len()).unwrap_or(u32::MAX),
            token_count: whole_tokens,
        }];
    }

    let mut chunks: Vec<ChunkSpec> = Vec::new();
    // The chunk under assembly is `text[pending_start..pending_end]`;
    // it is empty (nothing to flush) whenever the two are equal.
    let mut pending_start = 0usize;
    let mut pending_end = 0usize;
    let mut pending_tokens = 0u32;

    for para in split_paragraphs(text) {
        let para_tokens = approx_token_count(&text[para.start..para.end]);
        let has_pending = pending_end > pending_start;

        if para_tokens >= oversize_threshold {
            // The paragraph alone is too big: emit what has accumulated,
            // then let the sliding window carve the paragraph up.
            if has_pending {
                push_chunk(&mut chunks, text, pending_start, pending_end);
            }
            slide_window(&mut chunks, text, para.start, para.end, target, overlap);
            pending_start = window_overlap_start(text, para.end, overlap);
            pending_end = pending_start;
            pending_tokens = 0;
            continue;
        }

        // Keep paragraphs whole where feasible: if appending this one
        // would push the pending chunk past 1.5 × target, flush first and
        // seed the next chunk with the overlap tail.
        if has_pending && pending_tokens + para_tokens > oversize_threshold {
            push_chunk(&mut chunks, text, pending_start, pending_end);
            pending_start = window_overlap_start(text, pending_end, overlap);
            // `pending_end` is assigned unconditionally just below.
        }

        // Extend the pending chunk to cover this paragraph.
        pending_end = para.end;
        pending_tokens = approx_token_count(&text[pending_start..pending_end]);

        // Reached the target: emit and restart from the overlap tail.
        if pending_tokens >= target {
            push_chunk(&mut chunks, text, pending_start, pending_end);
            pending_start = window_overlap_start(text, pending_end, overlap);
            pending_end = pending_start;
            // `pending_tokens` is left stale on purpose: it is only read
            // while the pending range is non-empty, and it is recomputed
            // as soon as the next paragraph is appended.
        }
    }

    // Whatever is still pending after the final paragraph forms the
    // trailing chunk.
    if pending_end > pending_start {
        push_chunk(&mut chunks, text, pending_start, pending_end);
    }

    chunks
}

/// Byte range of one paragraph within the source text.
#[derive(Debug, Clone, Copy)]
struct Paragraph {
    /// Inclusive byte offset of the paragraph's first byte.
    start: usize,
    /// Exclusive byte offset; includes the trailing paragraph separator
    /// so that consecutive paragraphs concatenate to the original text
    /// without gaps.
    end: usize,
}

/// Partition `text` at blank-line (`\n\n`) boundaries.
///
/// Every run of two or more consecutive `\n` bytes is attached to the
/// paragraph that precedes it, so the returned ranges tile `text`
/// exactly: joining them in order reproduces the input byte-for-byte.
/// A single `\n` does not end a paragraph. Empty input yields no
/// paragraphs.
fn split_paragraphs(text: &str) -> Vec<Paragraph> {
    let bytes = text.as_bytes();
    let len = bytes.len();
    let mut paragraphs = Vec::new();
    let mut para_start = 0usize;
    let mut pos = 0usize;

    while pos < len {
        let at_break = bytes[pos] == b'\n' && bytes.get(pos + 1) == Some(&b'\n');
        if !at_break {
            pos += 1;
            continue;
        }
        // Swallow the entire newline run so the next paragraph does not
        // begin with separator bytes (offsets are preserved exactly, so
        // they could never be trimmed later).
        let run_end = bytes[pos..]
            .iter()
            .position(|&b| b != b'\n')
            .map_or(len, |offset| pos + offset);
        paragraphs.push(Paragraph {
            start: para_start,
            end: run_end,
        });
        para_start = run_end;
        pos = run_end;
    }

    // Text after the last separator (or the whole text if none was found).
    if para_start < len {
        paragraphs.push(Paragraph {
            start: para_start,
            end: len,
        });
    }
    paragraphs
}

/// Locate where an overlap window of roughly `overlap` tokens
/// (≈ `overlap * 4` chars) that finishes at byte offset `end` begins.
///
/// Walks backwards over whole characters, so the result is always a
/// UTF-8 char boundary. Returns `end` itself when `overlap` or `end` is
/// zero, and 0 when fewer than `overlap * 4 + 1` characters precede
/// `end`.
fn window_overlap_start(text: &str, end: usize, overlap: u32) -> usize {
    if overlap == 0 || end == 0 {
        return end;
    }
    let target_chars = (overlap as usize) * 4;
    // Step back `target_chars + 1` characters from `end`; the start of
    // that character is the window start.
    text[..end]
        .char_indices()
        .rev()
        .nth(target_chars)
        .map_or(0, |(idx, _)| idx)
}

/// Append the chunk covering `text[start..end]` to `out`.
///
/// `start` and `end` are byte offsets that must delimit a non-empty
/// range on UTF-8 char boundaries (checked in debug builds). Offsets
/// that do not fit in `u32` are stored as `u32::MAX`.
fn push_chunk(out: &mut Vec<ChunkSpec>, text: &str, start: usize, end: usize) {
    debug_assert!(start < end, "push_chunk: empty range [{start},{end})");
    debug_assert!(text.is_char_boundary(start), "start {start} not on char boundary");
    debug_assert!(text.is_char_boundary(end), "end {end} not on char boundary");

    let to_u32 = |offset: usize| u32::try_from(offset).unwrap_or(u32::MAX);
    let content = &text[start..end];
    let spec = ChunkSpec {
        content: content.to_string(),
        start_offset: to_u32(start),
        end_offset: to_u32(end),
        token_count: approx_token_count(content),
    };
    out.push(spec);
}

/// Carve `text[range_start..range_end]` into overlapping windows of about
/// `target` tokens each, appending every window to `out`.
///
/// Each window's cut point is nudged back to just after the last
/// sentence terminator found in the final ~10% of the window, when one
/// exists. Consecutive windows share roughly `overlap` tokens.
///
/// Used when a single paragraph is too large to fit a single chunk.
fn slide_window(
    out: &mut Vec<ChunkSpec>,
    text: &str,
    range_start: usize,
    range_end: usize,
    target: u32,
    overlap: u32,
) {
    let target_chars = (target as usize) * 4;
    let lookback = target_chars / 10;
    let mut window_start = range_start;

    while window_start < range_end {
        let remaining = &text[window_start..range_end];

        // Provisional cut at the start of the `target_chars`-th character;
        // if the remainder is shorter than that, take all of it.
        let mut window_end = remaining
            .char_indices()
            .nth(target_chars.saturating_sub(1))
            .map_or(range_end, |(offset, _)| window_start + offset);

        // Only a cut that falls short of the range end is worth snapping
        // to sentence-ending punctuation.
        if window_end < range_end {
            window_end = find_sentence_break(text, window_start, window_end, lookback);
        }

        // Defensive: an empty window would loop forever, so force the
        // window to contain at least one character.
        if window_end <= window_start {
            window_end = remaining
                .chars()
                .next()
                .map_or(range_end, |ch| window_start + ch.len_utf8());
        }

        push_chunk(out, text, window_start, window_end);
        if window_end >= range_end {
            break;
        }

        // Back up by the overlap, but always advance by at least one byte
        // so pathological input still terminates.
        let resume = window_overlap_start(text, window_end, overlap).max(window_start + 1);
        // The `+ 1` may land inside a multi-byte character; move forward
        // to the next char boundary (or the end of the range).
        window_start = (resume..range_end)
            .find(|&pos| text.is_char_boundary(pos))
            .unwrap_or(range_end);
    }
}

/// Find the best sentence-aligned cut point near the end of a window.
///
/// Inspects the last `lookback_chars` characters of
/// `text[window_start..window_end]` (at least the final character, and
/// never anything before `window_start`) and returns the byte offset
/// just past the last `.`, `!`, `?`, or `\n` found there. If the region
/// holds no such character, `window_end` is returned unchanged.
///
/// The result always satisfies `window_start <= result <= window_end`
/// and lies on a UTF-8 char boundary. In particular an empty window
/// (`window_start == window_end`) returns `window_end`; the scan never
/// escapes the window, even when `lookback_chars` is 0.
///
/// `window_start` and `window_end` must be char-boundary byte offsets
/// into `text` with `window_start <= window_end`; otherwise slicing
/// panics.
fn find_sentence_break(
    text: &str,
    window_start: usize,
    window_end: usize,
    lookback_chars: usize,
) -> usize {
    let window = &text[window_start..window_end];

    // A lookback of 0 still examines the window's final character.
    let span = lookback_chars.max(1);

    // Start of the look-back region: the `span`-th character counted from
    // the end of the window, or the window start when the window is
    // shorter than that (including when it is empty).
    let look_start = window
        .char_indices()
        .rev()
        .nth(span - 1)
        .map_or(window_start, |(idx, _)| window_start + idx);

    // Scan the region back-to-front; cut just after the first terminator
    // encountered, so the punctuation stays with the preceding text.
    text[look_start..window_end]
        .char_indices()
        .rev()
        .find(|&(_, ch)| matches!(ch, '.' | '!' | '?' | '\n'))
        .map_or(window_end, |(idx, ch)| look_start + idx + ch.len_utf8())
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn chunk_empty_text_returns_empty_vec() {
        let out = chunk_text("", &ChunkConfig::default());
        assert!(out.is_empty());
    }

    #[test]
    fn chunk_short_text_returns_single_chunk() {
        // Way under target — should fit in one chunk.
        let text = "Hello world. This is a tiny doc.";
        let out = chunk_text(text, &ChunkConfig::default());
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].content, text);
        assert_eq!(out[0].start_offset, 0);
        assert_eq!(out[0].end_offset as usize, text.len());
    }

    /// Build a multi-paragraph synthetic text large enough to force multiple
    /// chunks. Each paragraph is `words_per_paragraph` words of the form
    /// `wordNN` joined by single spaces and terminated by `.`, i.e.
    /// `7 * words_per_paragraph` chars (for fewer than 100 words).
    /// Paragraphs are separated by `\n\n`; the last one has no trailing
    /// separator.
    fn synthetic_text(paragraph_count: usize, words_per_paragraph: usize) -> String {
        let mut s = String::new();
        for p in 0..paragraph_count {
            for w in 0..words_per_paragraph {
                if w > 0 {
                    s.push(' ');
                }
                s.push_str(&format!("word{w:02}"));
            }
            s.push('.');
            if p + 1 < paragraph_count {
                s.push_str("\n\n");
            }
        }
        s
    }

    #[test]
    fn chunk_long_text_splits_into_multiple() {
        // 500 paragraphs of 56 chars each plus separators ≈ 29 000 chars
        // ≈ 7 250 tokens. At target=500, expect well over ten chunks.
        let text = synthetic_text(500, 8); // 56 chars/paragraph
        let cfg = ChunkConfig::default();
        let out = chunk_text(&text, &cfg);
        assert!(out.len() > 1, "expected multiple chunks, got {}", out.len());
        // Every chunk's content must round-trip exactly with its offsets.
        for c in &out {
            let slice = &text[c.start_offset as usize..c.end_offset as usize];
            assert_eq!(slice, c.content.as_str());
        }
    }

    #[test]
    fn chunk_respects_paragraph_boundaries() {
        // A handful of well-separated paragraphs. Each is small (42 chars)
        // so the chunker should group them but NOT split mid-paragraph.
        let text = synthetic_text(40, 6);
        let cfg = ChunkConfig {
            target_tokens: 50,
            overlap_tokens: 5,
        };
        let out = chunk_text(&text, &cfg);
        assert!(out.len() > 1);
        // None of the chunk boundaries should fall in the middle of "wordNN"
        // — they should land on or near \n\n boundaries.
        for c in &out {
            let last_char = c.content.chars().last().unwrap();
            // The last char should be either '.' (sentence end) or '\n' or an
            // ASCII alphanumeric (when the chunker had to cut mid-paragraph
            // because the next paragraph would overshoot too much). For this
            // size of synthetic input, the chunker should mostly land on
            // sentence ends.
            assert!(
                last_char == '.' || last_char == '\n' || last_char.is_ascii_alphanumeric(),
                "chunk ends mid-token at: {:?}",
                &c.content[c.content.len().saturating_sub(20)..]
            );
        }
    }

    #[test]
    fn chunk_target_size_band() {
        // Every chunk except the trailing one should fall within
        // [target/2, target*1.5] tokens.
        let text = synthetic_text(300, 8);
        let cfg = ChunkConfig {
            target_tokens: 100,
            overlap_tokens: 10,
        };
        let out = chunk_text(&text, &cfg);
        assert!(out.len() >= 3, "need enough chunks to evaluate band");
        // Excluding the last (trailing) chunk, every chunk must be in band.
        let lower = cfg.target_tokens / 2;
        let upper = cfg.target_tokens * 3 / 2;
        for (i, c) in out.iter().enumerate().take(out.len() - 1) {
            assert!(
                c.token_count >= lower && c.token_count <= upper,
                "chunk {i} out of band: token_count={} band=[{lower},{upper}]",
                c.token_count,
            );
        }
    }

    #[test]
    fn chunk_offsets_monotonic_with_overlap() {
        let text = synthetic_text(200, 8);
        let cfg = ChunkConfig {
            target_tokens: 100,
            overlap_tokens: 10,
        };
        let out = chunk_text(&text, &cfg);
        assert!(out.len() >= 2);
        for window in out.windows(2) {
            let a = &window[0];
            let b = &window[1];
            // Forward progress
            assert!(
                b.end_offset > a.end_offset,
                "end_offset must increase across chunks: {} -> {}",
                a.end_offset,
                b.end_offset
            );
            // Overlap: next chunk starts at or before previous chunk's end.
            // With overlap_tokens=10, b.start_offset should typically be
            // a.end_offset - (~40 chars). Allow equality for cases where
            // the chunker can't find any overlap (rare).
            assert!(
                b.start_offset <= a.end_offset,
                "next chunk should overlap or abut the previous: a.end={} b.start={}",
                a.end_offset,
                b.start_offset,
            );
        }
    }

    #[test]
    fn chunk_utf8_safe_offsets() {
        // Multi-byte chars (Japanese + accented Latin). The chunker must
        // never panic and must produce offsets that land on char boundaries.
        let para = "こんにちは世界。これは日本語のテストです。Caféの紅茶。".repeat(40);
        let text = format!(
            "{para}\n\n{}",
            "Bonjour le monde. Voici un test en français.".repeat(40)
        );
        let cfg = ChunkConfig {
            target_tokens: 80,
            overlap_tokens: 10,
        };
        let out = chunk_text(&text, &cfg);
        assert!(!out.is_empty());
        for c in &out {
            let s = c.start_offset as usize;
            let e = c.end_offset as usize;
            assert!(text.is_char_boundary(s), "start {s} not on char boundary");
            assert!(text.is_char_boundary(e), "end {e} not on char boundary");
            // The slice must equal the chunk content byte-for-byte.
            assert_eq!(&text[s..e], c.content);
        }
    }

    #[test]
    fn chunk_very_large_text() {
        // ~100KB text (57 chars × 1 800 repeats). Chunking is expected to be
        // fast; the assertion below allows up to 2s of wall-clock time.
        let text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. ".repeat(1_800);
        assert!(text.len() > 100_000);
        let cfg = ChunkConfig::default();
        let start = std::time::Instant::now();
        let out = chunk_text(&text, &cfg);
        let elapsed = start.elapsed();
        assert!(out.len() > 10, "expected many chunks, got {}", out.len());
        assert!(
            elapsed < std::time::Duration::from_secs(2),
            "100KB chunking too slow: {elapsed:?}"
        );
    }

    #[test]
    fn chunk_token_count_approximation() {
        // 40 ASCII chars → 40/4 = 10 tokens.
        assert_eq!(approx_token_count("0123456789".repeat(4).as_str()), 10);
        // 12 multibyte chars → 12/4 = 3 tokens (counts CHARS not bytes).
        assert_eq!(approx_token_count("あいうえおかきくけこさし"), 3);
        // Empty → 0.
        assert_eq!(approx_token_count(""), 0);
    }

    #[test]
    fn chunk_oversized_paragraph_slides_window() {
        // A single huge paragraph with no \n\n boundaries — the chunker
        // must still split it into multiple chunks via sentence sliding.
        let sentence = "This is a sentence with several words in it. ".to_string();
        let mega_paragraph = sentence.repeat(200); // 9000 chars ≈ 2250 tokens
        let cfg = ChunkConfig {
            target_tokens: 100,
            overlap_tokens: 10,
        };
        let out = chunk_text(&mega_paragraph, &cfg);
        assert!(
            out.len() >= 3,
            "expected oversized paragraph to be split, got {} chunks",
            out.len()
        );
        // Offsets must be char-boundary safe and round-trip with the content.
        for c in &out {
            let s = c.start_offset as usize;
            let e = c.end_offset as usize;
            assert!(mega_paragraph.is_char_boundary(s));
            assert!(mega_paragraph.is_char_boundary(e));
            assert_eq!(&mega_paragraph[s..e], c.content);
        }
    }

    #[test]
    fn chunk_config_default_is_500_50() {
        let c = ChunkConfig::default();
        assert_eq!(c.target_tokens, 500);
        assert_eq!(c.overlap_tokens, 50);
    }

    #[test]
    fn chunk_text_offsets_cover_input_modulo_overlap() {
        // The first chunk starts at 0, the last chunk ends at text.len().
        let text = synthetic_text(80, 8);
        let cfg = ChunkConfig::default();
        let out = chunk_text(&text, &cfg);
        assert!(!out.is_empty());
        assert_eq!(out[0].start_offset, 0);
        assert_eq!(out.last().unwrap().end_offset as usize, text.len());
    }
}