code_chunk/lib.rs
1//! # code-chunk
2//!
3//! Split source code into RAG-friendly chunks that respect function and
4//! class boundaries — without parsing the language.
5//!
6//! Strategy: split on top-level "block boundaries":
7//!
8//! - A brace-language line ending in `{` opens a block; the matching
9//! `}` at the same indent closes it.
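//! - Python/YAML-style indent blocks are not detected individually:
//!   brace-free text stays attached to the brace block that follows it,
//!   or is emitted as one trailing block when no brace block follows.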
12//!
//! When a function/class is larger than `max_chars`, it is emitted
//! whole as its own chunk rather than being split inside its body, so
//! local context is preserved.
16//!
17//! Best for second-stage chunking after `snipsplit`-style sentence
18//! chunking has handled prose.
19//!
20//! ## Example
21//!
22//! ```
23//! use code_chunk::chunk;
24//! let src = "fn a() {\n 1\n}\nfn b() {\n 2\n}\n";
25//! // Small cap forces a split at the function boundary.
26//! let chunks = chunk(src, 10);
27//! assert_eq!(chunks.len(), 2);
28//! assert!(chunks[0].contains("fn a()"));
29//! assert!(chunks[1].contains("fn b()"));
30//! ```
31
32#![deny(missing_docs)]
33
34/// Split `src` into chunks no larger than `max_chars`.
35///
36/// `max_chars` is a soft cap: a single function larger than the cap is
37/// returned whole as one chunk (preserving local context is more
38/// important than honoring the cap).
39pub fn chunk(src: &str, max_chars: usize) -> Vec<String> {
40 let blocks = split_top_level_blocks(src);
41 let mut out = Vec::new();
42 let mut buf = String::new();
43 for block in blocks {
44 if buf.len() + block.len() <= max_chars {
45 buf.push_str(&block);
46 } else {
47 if !buf.is_empty() {
48 out.push(std::mem::take(&mut buf));
49 }
50 // Block alone is bigger than the cap → emit whole.
51 out.push(block);
52 }
53 }
54 if !buf.is_empty() {
55 out.push(buf);
56 }
57 out
58}
59
/// Cut `src` into top-level blocks by tracking brace depth line by line.
///
/// A block is emitted whenever a line ends with `}` (ignoring surrounding
/// whitespace) and the brace depth is zero after that line. Blank lines
/// that directly follow an emitted block are appended to that block, so
/// the next block starts at its first non-blank line. Any text left over
/// at the end (brace-free source, or an unclosed block) is emitted as one
/// final block.
///
/// Braces are counted purely textually; braces inside string literals or
/// comments are counted too. Concatenating the returned blocks reproduces
/// `src` exactly.
fn split_top_level_blocks(src: &str) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    let mut buf = String::new();
    let mut depth: usize = 0;

    for line in src.split_inclusive('\n') {
        let trimmed = line.trim();

        // `buf` is empty only at the very start or right after a block was
        // emitted; in the latter case a blank line is trailing whitespace
        // of the previous block rather than the start of the next one.
        if trimmed.is_empty() && buf.is_empty() {
            if let Some(prev) = out.last_mut() {
                prev.push_str(line);
                continue;
            }
        }

        buf.push_str(line);
        for c in trimmed.chars() {
            match c {
                '{' => depth += 1,
                // A stray `}` at depth zero is ignored instead of going negative.
                '}' => depth = depth.saturating_sub(1),
                _ => {}
            }
        }

        if depth == 0 && trimmed.ends_with('}') {
            out.push(std::mem::take(&mut buf));
        }
    }

    if !buf.is_empty() {
        // Whatever's left: a trailing non-block or an indent-based block.
        out.push(buf);
    }
    out
}