Skip to main content

code_chunk/
lib.rs

1//! # code-chunk
2//!
3//! Split source code into RAG-friendly chunks that respect function and
4//! class boundaries — without parsing the language.
5//!
6//! Strategy: split on top-level "block boundaries":
7//!
8//! - A brace-language line ending in `{` opens a block; the matching
9//!   `}` at the same indent closes it.
10//! - A Python/YAML-style top-level line followed by an indented block
11//!   forms an indent block.
12//!
13//! When a function/class is larger than `max_chars`, it is kept whole
14//! as a single oversized chunk rather than split inside the block, so
15//! context is preserved.
16//!
17//! Best for second-stage chunking after `snipsplit`-style sentence
18//! chunking has handled prose.
19//!
20//! ## Example
21//!
22//! ```
23//! use code_chunk::chunk;
24//! let src = "fn a() {\n    1\n}\nfn b() {\n    2\n}\n";
25//! // Small cap forces a split at the function boundary.
26//! let chunks = chunk(src, 10);
27//! assert_eq!(chunks.len(), 2);
28//! assert!(chunks[0].contains("fn a()"));
29//! assert!(chunks[1].contains("fn b()"));
30//! ```
31
32#![deny(missing_docs)]
33
34/// Split `src` into chunks no larger than `max_chars`.
35///
36/// `max_chars` is a soft cap: a single function larger than the cap is
37/// returned whole as one chunk (preserving local context is more
38/// important than honoring the cap).
39pub fn chunk(src: &str, max_chars: usize) -> Vec<String> {
40    let blocks = split_top_level_blocks(src);
41    let mut out = Vec::new();
42    let mut buf = String::new();
43    for block in blocks {
44        if buf.len() + block.len() <= max_chars {
45            buf.push_str(&block);
46        } else {
47            if !buf.is_empty() {
48                out.push(std::mem::take(&mut buf));
49            }
50            // Block alone is bigger than the cap → emit whole.
51            out.push(block);
52        }
53    }
54    if !buf.is_empty() {
55        out.push(buf);
56    }
57    out
58}
59
/// Partition `src` into top-level blocks by tracking brace depth.
///
/// Lines are accumulated until one ends with `}` while the running brace
/// depth is back at zero; that line closes the current block. Blank lines
/// that immediately follow a closed block are appended to that block, so
/// the next block starts on its first non-blank line. Whatever remains
/// after the last closing line (brace-free text or an unterminated block)
/// is returned as a final block.
///
/// Braces are counted textually, so `{` / `}` inside string literals or
/// comments also move the depth. Unmatched closing braces are ignored
/// rather than driving the depth negative. Concatenating the returned
/// blocks in order reproduces `src` exactly.
fn split_top_level_blocks(src: &str) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    let mut buf = String::new();
    let mut depth: usize = 0;

    for line in src.split_inclusive('\n') {
        let trimmed = line.trim();

        // `buf` is only empty at the very start or right after a block was
        // emitted. In the latter case a blank line is a separator that
        // belongs to the block that just closed, not to the next one.
        if trimmed.is_empty() && buf.is_empty() {
            if let Some(prev) = out.last_mut() {
                prev.push_str(line);
                continue;
            }
        }

        buf.push_str(line);
        for c in trimmed.chars() {
            match c {
                '{' => depth += 1,
                // Saturate so a stray `}` cannot push the depth below zero.
                '}' => depth = depth.saturating_sub(1),
                _ => {}
            }
        }

        // A line ending in `}` with all braces balanced closes a block.
        if depth == 0 && trimmed.ends_with('}') {
            out.push(std::mem::take(&mut buf));
        }
    }

    if !buf.is_empty() {
        // Whatever's left: a trailing non-block or an indent-based block.
        out.push(buf);
    }
    out
}