sqlite_graphrag/
chunking.rs

1//! Semantic chunking for embedding inputs (Markdown-aware, 512-token limit).
2//!
3//! Splits bodies using [`text_splitter::MarkdownSplitter`] with overlap so
4//! multi-chunk memories preserve context across chunk boundaries.
5
6// src/chunking.rs
7// Token-based chunking for E5 model (512 token limit)
8
9use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS};
10use text_splitter::{ChunkConfig, MarkdownSplitter};
11
/// Characters-per-token heuristic used to turn token budgets into character budgets.
///
/// Deliberately conservative: 2 chars/token reduces the risk of underestimating the
/// real token count in Markdown, code, and multilingual text. The previous value
/// (4 chars/token) allowed chunks that were too large for some real documents.
const CHARS_PER_TOKEN: usize = 2;

/// Maximum length of a single chunk (token limit × chars-per-token).
///
/// Note: `needs_chunking` and `split_into_chunks` compare this value against UTF-8
/// byte lengths, so for multibyte text the effective limit is tighter than the name
/// suggests.
pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;

/// Overlap between consecutive chunks (overlap tokens × chars-per-token), kept so
/// that context is preserved across chunk boundaries.
pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
24
/// A contiguous slice of a body string identified by byte offsets.
///
/// A `Chunk` stores offsets only and does not borrow the body; use [`chunk_text`]
/// to recover the text it refers to.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Byte offset of the first character (inclusive).
    pub start_offset: usize,
    /// Byte offset past the last character (exclusive).
    pub end_offset: usize,
    /// Token count for this chunk. The character-based splitters store an estimate
    /// (chars / `CHARS_PER_TOKEN`); [`split_into_chunks_by_token_offsets`] stores the
    /// number of token offsets assigned to the chunk.
    pub token_count_approx: usize,
}
35
36/// Returns `true` when `body` exceeds `CHUNK_SIZE_CHARS` and must be split.
37pub fn needs_chunking(body: &str) -> bool {
38    body.len() > CHUNK_SIZE_CHARS
39}
40
41/// Splits `body` into overlapping [`Chunk`]s using a character-based heuristic.
42///
43/// Short bodies (≤ `CHUNK_SIZE_CHARS`) are returned as a single chunk.
44/// Splits prefer paragraph breaks, then sentence-end punctuation, then word boundaries.
45///
46/// # Errors
47/// This function is infallible; it returns a `Vec` directly.
48pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
49    if !needs_chunking(body) {
50        return vec![Chunk {
51            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
52            start_offset: 0,
53            end_offset: body.len(),
54        }];
55    }
56
57    let mut chunks = Vec::with_capacity(body.len() / CHUNK_SIZE_CHARS + 1);
58    let mut start = 0usize;
59
60    while start < body.len() {
61        start = next_char_boundary(body, start);
62        let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
63        let end = if desired_end < body.len() {
64            find_split_boundary(body, start, desired_end)
65        } else {
66            desired_end
67        };
68
69        let end = if end <= start {
70            let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
71            if fallback > start {
72                fallback
73            } else {
74                body.len()
75            }
76        } else {
77            end
78        };
79
80        let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
81        chunks.push(Chunk {
82            start_offset: start,
83            end_offset: end,
84            token_count_approx,
85        });
86
87        if end >= body.len() {
88            break;
89        }
90
91        let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
92        start = if next_start >= end { end } else { next_start };
93    }
94
95    chunks
96}
97
98/// Splits `body` into [`Chunk`]s using pre-computed token byte-offsets.
99///
100/// Each element of `token_offsets` is a `(start, end)` byte range for one token.
101/// Respects `CHUNK_SIZE_TOKENS` and `CHUNK_OVERLAP_TOKENS` constants.
102/// Short bodies (≤ `CHUNK_SIZE_TOKENS` tokens) are returned as a single chunk.
103pub fn split_into_chunks_by_token_offsets(
104    body: &str,
105    token_offsets: &[(usize, usize)],
106) -> Vec<Chunk> {
107    if token_offsets.len() <= CHUNK_SIZE_TOKENS {
108        return vec![Chunk {
109            token_count_approx: token_offsets.len(),
110            start_offset: 0,
111            end_offset: body.len(),
112        }];
113    }
114
115    let mut chunks = Vec::with_capacity(token_offsets.len() / CHUNK_SIZE_TOKENS + 1);
116    let mut start_token = 0usize;
117
118    while start_token < token_offsets.len() {
119        let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
120
121        chunks.push(Chunk {
122            start_offset: if start_token == 0 {
123                0
124            } else {
125                token_offsets[start_token].0
126            },
127            end_offset: if end_token == token_offsets.len() {
128                body.len()
129            } else {
130                token_offsets[end_token - 1].1
131            },
132            token_count_approx: end_token - start_token,
133        });
134
135        if end_token == token_offsets.len() {
136            break;
137        }
138
139        let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
140        start_token = if next_start <= start_token {
141            end_token
142        } else {
143            next_start
144        };
145    }
146
147    chunks
148}
149
150/// Splits body into chunks using MarkdownSplitter with a real tokenizer.
151/// Respects Markdown semantic boundaries (H1-H6, paragraphs, blocks).
152/// For plain text without Markdown markers, falls back to paragraph and sentence breaks.
153///
154/// v1.0.76: the `tokenizer` parameter was removed. The chunker now uses the
155/// char-based heuristic (`CHARS_PER_TOKEN = 2`) which is the same heuristic
156/// the rest of the codebase uses for `Chunk::token_count_approx`.
157// expect_used (audited v1.0.97): overlap < chunk size is a const-derived
158// compile-time invariant (CHUNK_OVERLAP_TOKENS < CHUNK_SIZE_TOKENS).
159#[allow(clippy::expect_used)]
160pub fn split_into_chunks_hierarchical(body: &str) -> Vec<Chunk> {
161    if body.is_empty() {
162        return Vec::new();
163    }
164
165    // v1.0.76: text_splitter 0.30.1's `ChunkConfig::new` defaults to a
166    // char-count sizer when no explicit sizer is set. The default chunk
167    // size is in characters, not tokens, so we scale `CHUNK_SIZE_TOKENS`
168    // by `CHARS_PER_TOKEN` to keep the chunk size roughly equivalent.
169    let char_chunk_size = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
170    let char_overlap = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
171    let config = ChunkConfig::new(char_chunk_size)
172        .with_overlap(char_overlap)
173        .expect("compile-time invariant: CHUNK_OVERLAP must be smaller than chunk size");
174
175    let splitter = MarkdownSplitter::new(config);
176
177    let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
178
179    if items.is_empty() {
180        return vec![Chunk {
181            start_offset: 0,
182            end_offset: body.len(),
183            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
184        }];
185    }
186
187    items
188        .into_iter()
189        .map(|(start, text)| {
190            let end = start + text.len();
191            Chunk {
192                start_offset: start,
193                end_offset: end,
194                token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
195            }
196        })
197        .collect()
198}
199
200/// Returns the string slice of `body` described by `chunk`'s byte offsets.
201pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
202    &body[chunk.start_offset..chunk.end_offset]
203}
204
/// Picks where to end a chunk inside `body[start..desired_end]`.
///
/// Looks for the last paragraph break (`"\n\n"`), then the last sentence end
/// (`". "`), then the last space, and returns the byte offset just past the first
/// kind of separator that occurs. Falls back to `desired_end` when the window
/// contains none of them. `start` and `desired_end` must be char boundaries.
fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
    // Ordered from most to least preferred.
    const SEPARATORS: [&str; 3] = ["\n\n", ". ", " "];

    let window = &body[start..desired_end];
    SEPARATORS
        .iter()
        .find_map(|sep| window.rfind(*sep).map(|at| start + at + sep.len()))
        .unwrap_or(desired_end)
}
218
/// Returns the largest UTF-8 char boundary of `body` that is `<= idx`.
///
/// `idx` is clamped to `body.len()` first, so out-of-range indices map to the end
/// of the string.
fn previous_char_boundary(body: &str, idx: usize) -> usize {
    let clamped = idx.min(body.len());
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=clamped)
        .rev()
        .find(|&at| body.is_char_boundary(at))
        .unwrap_or(0)
}
226
/// Returns the smallest UTF-8 char boundary of `body` that is `>= idx`.
///
/// `idx` is clamped to `body.len()` first, so out-of-range indices map to the end
/// of the string.
fn next_char_boundary(body: &str, idx: usize) -> usize {
    let end = body.len();
    let clamped = idx.min(end);
    // `end` itself is always a boundary, so the search cannot come up empty.
    (clamped..=end)
        .find(|&at| body.is_char_boundary(at))
        .unwrap_or(end)
}
234
235/// Computes the mean of `chunk_embeddings` and L2-normalizes the result.
236///
237/// Returns a zero-vector of the active embedding dimensionality when the
238/// input is empty. When a single embedding is provided it is returned
239/// as-is (no copy).
240pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
241    if chunk_embeddings.is_empty() {
242        return vec![0.0f32; crate::constants::embedding_dim()];
243    }
244    if chunk_embeddings.len() == 1 {
245        return chunk_embeddings[0].clone();
246    }
247
248    let dim = chunk_embeddings[0].len();
249    let mut mean = vec![0.0f32; dim];
250    for emb in chunk_embeddings {
251        for (i, v) in emb.iter().enumerate() {
252            mean[i] += v;
253        }
254    }
255    let n = chunk_embeddings.len() as f32;
256    for v in &mut mean {
257        *v /= n;
258    }
259
260    let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
261    if norm > 1e-9 {
262        for v in &mut mean {
263            *v /= norm;
264        }
265    }
266    mean
267}
268
/// Budget assessment of a body for the auto-split and dry-run paths
/// (GAP-SG-05/06).
///
/// Produced by [`assess_body_budget`]; all fields describe the body as a whole.
#[derive(Debug, Clone, Copy)]
pub struct BodyBudget {
    /// Total body length in bytes.
    pub bytes: usize,
    /// Conservative cl100k token count of the whole body, as reported by
    /// `crate::tokenizer::count_tokens`.
    pub approx_tokens: usize,
    /// Number of embedding chunks the body splits into (see [`estimate_chunk_count`]).
    pub chunk_count: usize,
    /// Number of sub-memories auto-split would produce (1 when the body fits),
    /// i.e. the number of partitions returned by [`split_body_by_sections`].
    pub partition_count: usize,
    /// True when the body exceeds at least one single-memory budget and would
    /// be auto-split; set exactly when `partition_count > 1`.
    pub exceeds_limits: bool,
}
285
286/// Returns the number of embedding chunks `body` splits into, using the same
287/// hierarchical splitter the persistence path uses (GAP-SG-05).
288pub fn estimate_chunk_count(body: &str) -> usize {
289    split_into_chunks_hierarchical(body).len()
290}
291
292/// Returns `true` when `body` fits a single memory without auto-split: under
293/// the partition byte budget, the safe chunk ceiling, and the embedding request
294/// token ceiling.
295fn fits_single_partition(body: &str) -> bool {
296    body.len() <= crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES
297        && estimate_chunk_count(body) <= crate::constants::REMEMBER_MAX_SAFE_MULTI_CHUNKS
298        && crate::tokenizer::count_tokens(body) <= crate::constants::EMBEDDING_REQUEST_MAX_TOKENS
299}
300
301/// Assesses `body` against the single-memory budgets (GAP-SG-05/06).
302///
303/// Used by `ingest --dry-run` to report chunk and token counts and how many
304/// sub-memories an auto-split would create.
305pub fn assess_body_budget(body: &str) -> BodyBudget {
306    let partition_count = split_body_by_sections(body).len();
307    BodyBudget {
308        bytes: body.len(),
309        approx_tokens: crate::tokenizer::count_tokens(body),
310        chunk_count: estimate_chunk_count(body),
311        partition_count,
312        exceeds_limits: partition_count > 1,
313    }
314}
315
316/// Splits a large `body` into sub-memory partitions at Markdown section
317/// boundaries (ATX headers), keeping each partition below the byte, chunk and
318/// token budgets (GAP-SG-04/07).
319///
320/// A body that already fits a single memory is returned unchanged as a single
321/// element. Otherwise the body is cut at ATX header lines, sections are greedily
322/// packed into partitions under [`crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES`],
323/// and any partition still over budget (e.g. one huge headerless section) is
324/// hard-sliced on char boundaries. Concatenating the returned partitions
325/// reproduces `body` exactly (lossless).
326pub fn split_body_by_sections(body: &str) -> Vec<String> {
327    if fits_single_partition(body) {
328        return vec![body.to_string()];
329    }
330
331    let max_bytes = crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES;
332    let sections = split_markdown_sections(body);
333
334    let mut packed: Vec<String> = Vec::new();
335    let mut current = String::new();
336    for section in sections {
337        if !current.is_empty() && current.len() + section.len() > max_bytes {
338            packed.push(std::mem::take(&mut current));
339        }
340        current.push_str(&section);
341    }
342    if !current.is_empty() {
343        packed.push(current);
344    }
345
346    let mut result = Vec::with_capacity(packed.len());
347    for partition in packed {
348        if fits_single_partition(&partition) {
349            result.push(partition);
350        } else {
351            result.extend(hard_slice_to_budget(&partition));
352        }
353    }
354
355    if result.is_empty() {
356        vec![body.to_string()]
357    } else {
358        result
359    }
360}
361
362/// Cuts `body` into sections, each starting at an ATX Markdown header line and
363/// running until the next header. Leading content before the first header is the
364/// first section. Sections retain their original text (trailing newlines
365/// included) so concatenation is lossless.
366fn split_markdown_sections(body: &str) -> Vec<String> {
367    let mut sections: Vec<String> = Vec::new();
368    let mut current = String::new();
369    for line in body.split_inclusive('\n') {
370        if is_atx_header(line) && !current.is_empty() {
371            sections.push(std::mem::take(&mut current));
372        }
373        current.push_str(line);
374    }
375    if !current.is_empty() {
376        sections.push(current);
377    }
378    if sections.is_empty() {
379        sections.push(body.to_string());
380    }
381    sections
382}
383
/// Returns `true` when `line` is an ATX Markdown header: 1..=6 leading `#`
/// followed by a space, a tab, or the end of the line (`\n`, `\r\n`, or end of
/// input).
///
/// Up to three leading spaces are tolerated. A line indented by four or more
/// spaces is an indented code block in CommonMark and is not a header.
fn is_atx_header(line: &str) -> bool {
    let trimmed = line.trim_start_matches(' ');
    if line.len() - trimmed.len() > 3 {
        return false;
    }

    // `#` is ASCII, so the byte count doubles as a valid slice index below.
    let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
    if !(1..=6).contains(&hashes) {
        return false;
    }

    matches!(
        trimmed[hashes..].chars().next(),
        None | Some(' ' | '\t' | '\n' | '\r')
    )
}
395
396/// Hard-slices `body` on char boundaries into pieces no larger than the
397/// partition byte budget. The fallback for a single Markdown section that alone
398/// exceeds the budget; an 80 KiB piece is always under the chunk and token
399/// ceilings (see [`crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES`]).
400fn hard_slice_to_budget(body: &str) -> Vec<String> {
401    let max_bytes = crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES;
402    let mut pieces: Vec<String> = Vec::new();
403    let mut start = 0usize;
404    while start < body.len() {
405        let mut end = previous_char_boundary(body, (start + max_bytes).min(body.len()));
406        if end <= start {
407            end = next_char_boundary(body, start + 1);
408        }
409        pieces.push(body[start..end].to_string());
410        start = end;
411    }
412    pieces
413}
414
/// Unit tests for the chunking, embedding aggregation, and auto-split helpers.
#[cfg(test)]
mod tests {
    use super::*;

    /// A body under the size limit is returned as one chunk covering all of it.
    #[test]
    fn test_short_body_no_chunking() {
        let body = "short text";
        assert!(!needs_chunking(body));
        let chunks = split_into_chunks(body);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunk_text(body, &chunks[0]), body);
    }

    /// A body over the size limit is split into several non-empty chunks.
    #[test]
    fn test_long_body_produces_multiple_chunks() {
        let body = "word ".repeat(1000);
        assert!(needs_chunking(&body));
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
    }

    /// Token-offset chunking respects the token limit and the configured overlap.
    #[test]
    fn split_by_token_offsets_respeita_limite_e_overlap() {
        let body = "ab".repeat(460);
        let offsets: Vec<(usize, usize)> = (0..460)
            .map(|i| {
                let start = i * 2;
                (start, start + 2)
            })
            .collect();

        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
        assert_eq!(chunks[1].token_count_approx, 110);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(
            chunks[1].start_offset,
            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
        );
    }

    /// Inputs with few tokens come back as a single chunk spanning the body.
    #[test]
    fn split_by_token_offsets_returns_one_chunk_when_fits() {
        let body = "texto curto";
        let offsets = vec![(0, 5), (6, 11)];
        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(chunks[0].end_offset, body.len());
        assert_eq!(chunks[0].token_count_approx, 2);
    }

    /// Multibyte text is split on char boundaries and the splitter keeps advancing.
    #[test]
    fn test_multibyte_body_preserves_progress_and_boundaries() {
        // Multibyte body intentionally includes 2-byte UTF-8 sequences (Latin-1 supplement)
        // expressed as Unicode escapes so this source file remains ASCII-only per the
        // language policy. The original PT-BR phrase "a\u{e7}\u{e3}o \u{fa}til " is preserved
        // since the test exercises UTF-8 char-boundary handling.
        let body = "a\u{e7}\u{e3}o \u{fa}til ".repeat(1000);
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(!chunk_text(&body, chunk).is_empty());
            assert!(body.is_char_boundary(chunk.start_offset));
            assert!(body.is_char_boundary(chunk.end_offset));
            assert!(chunk.end_offset > chunk.start_offset);
        }
        for pair in chunks.windows(2) {
            assert!(pair[1].start_offset >= pair[0].start_offset);
            assert!(pair[1].end_offset > pair[0].start_offset);
        }
    }

    /// The aggregate of several embeddings has unit L2 norm.
    #[test]
    fn test_aggregate_embeddings_normalizes() {
        let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
        let agg = aggregate_embeddings(&embs);
        let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }

    /// Test-only variant of `split_into_chunks_hierarchical` with a caller-chosen
    /// character budget and no overlap, so small fixtures can force splits.
    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        if body.is_empty() {
            return Vec::new();
        }
        let config = ChunkConfig::new(size)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap must be smaller than size");
        let splitter = MarkdownSplitter::new(config);
        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
        if items.is_empty() {
            return vec![Chunk {
                start_offset: 0,
                end_offset: body.len(),
                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            }];
        }
        items
            .into_iter()
            .map(|(start, text)| {
                let end = start + text.len();
                Chunk {
                    start_offset: start,
                    end_offset: end,
                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
                }
            })
            .collect()
    }

    /// An empty body produces no chunks, both through the production splitter and
    /// through the test helper.
    #[test]
    fn test_hierarchical_empty_body_returns_empty() {
        assert!(split_into_chunks_hierarchical("").is_empty());
        assert!(split_hier_chars("", 100).is_empty());
    }

    /// Two H1 sections that do not fit one chunk end up in at least two chunks.
    #[test]
    fn test_markdown_h1_boundary_yields_two_chunks() {
        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
        let chunks = split_hier_chars(body, 30);
        assert!(
            chunks.len() >= 2,
            "expected >=2 chunks, got {}",
            chunks.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    /// Nested headers yield valid, non-empty, in-range chunks.
    #[test]
    fn test_markdown_h2_nested_respects_boundaries() {
        let body = "# H1\n\n## H2a\n\nParagraph A with enough text to force a split.\n\n## H2b\n\nParagraph B with enough text to force a split as well.";
        let chunks = split_hier_chars(body, 40);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(c.end_offset <= body.len());
        }
    }

    /// Plain paragraphs separated by blank lines are split into several chunks.
    #[test]
    fn test_markdown_paragraph_soft_boundary() {
        let para = "Plain text sentence used to fill the paragraph. ";
        let body = format!(
            "{}\n\n{}\n\n{}",
            para.repeat(3),
            para.repeat(3),
            para.repeat(3)
        );
        let chunks = split_hier_chars(&body, 80);
        assert!(
            chunks.len() >= 2,
            "expected >=2 chunks with a body of {} chars",
            body.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    /// A body above 50 KB splits into valid, non-empty chunks.
    #[test]
    fn test_markdown_60kb_valid_offsets() {
        let block = "# Section\n\nBlock content text. ".repeat(1700);
        assert!(
            block.len() > 50_000,
            "body must be >50KB, has {} bytes",
            block.len()
        );
        let chunks = split_hier_chars(&block, 256);
        assert!(chunks.len() > 1);
        for c in &chunks {
            assert!(block.is_char_boundary(c.start_offset));
            assert!(block.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(!chunk_text(&block, c).is_empty());
        }
    }

    /// Text without any Markdown markers still splits on valid boundaries.
    #[test]
    fn test_fallback_plain_text_without_markers() {
        let body = "a ".repeat(1000);
        let chunks = split_hier_chars(&body, 100);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    // ── GAP-SG-04/05/06/07: budget assessment and section auto-split ──

    /// Asserts that `p` satisfies the byte, chunk, and token budgets of one memory.
    fn assert_partition_within_limits(p: &str) {
        assert!(
            p.len() <= crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES,
            "partition {} bytes exceeds byte budget",
            p.len()
        );
        assert!(
            estimate_chunk_count(p) <= crate::constants::REMEMBER_MAX_SAFE_MULTI_CHUNKS,
            "partition exceeds chunk budget"
        );
        assert!(
            crate::tokenizer::count_tokens(p) <= crate::constants::EMBEDDING_REQUEST_MAX_TOKENS,
            "partition exceeds token budget"
        );
    }

    /// A small body is reported as a single partition within limits.
    #[test]
    fn assess_body_budget_small_body_fits() {
        let budget = assess_body_budget("# Title\n\nshort body");
        assert_eq!(budget.partition_count, 1);
        assert!(!budget.exceeds_limits);
        assert!(budget.chunk_count >= 1);
        assert!(budget.approx_tokens >= 1);
    }

    /// A small body is returned unchanged as the only partition.
    #[test]
    fn split_body_by_sections_returns_single_for_small_body() {
        let body = "# H\n\nsmall";
        let parts = split_body_by_sections(body);
        assert_eq!(parts.len(), 1);
        assert_eq!(parts[0], body);
    }

    /// A large sectioned body is partitioned losslessly, each part within budget.
    #[test]
    fn split_body_by_sections_partitions_large_markdown_below_limits() {
        // ~400 sections of ~520 bytes each => ~208 KB, above the 80 KiB budget.
        let mut body = String::new();
        for i in 0..400 {
            body.push_str(&format!("# Section {i}\n\n{}\n\n", "body text ".repeat(50)));
        }
        assert!(body.len() > crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES);

        let parts = split_body_by_sections(&body);
        assert!(
            parts.len() > 1,
            "expected multiple partitions, got {}",
            parts.len()
        );
        for p in &parts {
            assert_partition_within_limits(p);
        }
        // Lossless: concatenation reproduces the original body exactly.
        assert_eq!(parts.concat(), body);
    }

    /// A headerless oversized body falls back to lossless hard slicing.
    #[test]
    fn split_body_by_sections_hard_slices_headerless_body() {
        // No ATX headers and far above the byte budget => hard-slice fallback.
        let body = "x".repeat(crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES * 3 + 17);
        let parts = split_body_by_sections(&body);
        assert!(parts.len() > 1);
        for p in &parts {
            assert_partition_within_limits(p);
        }
        assert_eq!(parts.concat(), body);
    }

    /// ATX header detection accepts 1..=6 hashes followed by whitespace only.
    #[test]
    fn is_atx_header_recognizes_headers() {
        assert!(is_atx_header("# Title"));
        assert!(is_atx_header("### Sub\n"));
        assert!(is_atx_header("  ## Indented"));
        assert!(!is_atx_header("####### too many"));
        assert!(!is_atx_header("#nospace"));
        assert!(!is_atx_header("plain text"));
    }
}
sqlite_graphrag/chunking.rs

sqlite_graphrag/
chunking.rs