ripvec_core/encoder/ripvec/chunking.rs
1//! Tree-sitter AST-merge chunker (semble flavor).
2//!
3//! Port of `~/src/semble/src/semble/chunking/{core,chunking}.py`.
4//! Greedily walks AST children, packing siblings up to a 1500-char
5//! target before recursing into oversized nodes. Falls back to
6//! line-merge chunking for unsupported languages.
7//!
8//! The algorithm differs from ripvec's query-based chunker in
9//! [`crate::chunk`]: it does not extract specific definitions, it merges
10//! AST subtrees until they fill the size budget. This produces fewer,
11//! larger, more contextually-coherent chunks for semantic embedding.
12//!
13//! ## Boundary shape
14//!
15//! [`ChunkBoundary`] reports byte offsets (matching tree-sitter's
16//! native units) and 1-based line numbers (matching CodeChunk's
17//! convention). Callers usually pass [`ChunkBoundary::content`] +
18//! line numbers into [`CodeChunk`] construction.
19
20use tree_sitter::{Language, Node, Parser, TreeCursor};
21
22/// Default desired chunk length used by [`chunk_source`] when callers
23/// don't specify one (mirrors `_DESIRED_CHUNK_LENGTH_CHARS` from
24/// `chunking.py:10`).
25pub const DEFAULT_DESIRED_CHUNK_CHARS: usize = 1500;
26
27/// Inclusive byte span and (1-based) line range for one chunk.
28#[derive(Debug, Clone, PartialEq, Eq)]
29pub struct ChunkBoundary {
30 /// First byte of the chunk (UTF-8 byte offset).
31 pub start_byte: usize,
32 /// One-past-last byte of the chunk.
33 pub end_byte: usize,
34 /// 1-based start line.
35 pub start_line: usize,
36 /// 1-based end line.
37 pub end_line: usize,
38}
39
40impl ChunkBoundary {
41 /// Slice the chunk's content out of the source string.
42 ///
43 /// Caller guarantees `start_byte..end_byte` lies on UTF-8 boundaries
44 /// (tree-sitter node positions always do).
45 #[must_use]
46 pub fn content<'a>(&self, source: &'a str) -> &'a str {
47 // Clamp end_byte to source length defensively (in case a node's
48 // end position lies one past the source end, e.g., for trailing
49 // EOF tokens). Also walk back to the previous UTF-8 char
50 // boundary at each end so the slice never panics on
51 // multi-byte glyphs (tree-sitter node positions are byte-
52 // accurate but our adjacent-chunk merges occasionally widen the
53 // span; defence in depth).
54 let end = clamp_to_char_boundary(source, self.end_byte.min(source.len()), true);
55 let start = clamp_to_char_boundary(source, self.start_byte.min(end), false);
56 &source[start..end]
57 }
58}
59
60/// Walk `byte` to the nearest UTF-8 character boundary. `forward = true`
61/// rounds up (to the byte right after a multi-byte glyph); `false`
62/// rounds down (to the start of one). `&str::is_char_boundary` is the
63/// gate; the loop is bounded by a multi-byte glyph max width (4).
64fn clamp_to_char_boundary(source: &str, mut byte: usize, forward: bool) -> usize {
65 byte = byte.min(source.len());
66 for _ in 0..4 {
67 if source.is_char_boundary(byte) {
68 return byte;
69 }
70 if forward {
71 byte = (byte + 1).min(source.len());
72 } else {
73 byte = byte.saturating_sub(1);
74 }
75 }
76 byte
77}
78
79// ---------------------------------------------------------------------------
80// Public API.
81// ---------------------------------------------------------------------------
82
83/// Chunk source text via tree-sitter AST-merge or line-merge fallback.
84///
85/// `language = Some(lang)` parses with tree-sitter and runs the AST-merge
86/// algorithm. `language = None` runs the line-merge fallback.
87///
88/// Empty / whitespace-only input returns an empty `Vec`.
89#[must_use]
90pub fn chunk_source(
91 source: &str,
92 language: Option<&Language>,
93 desired_length: usize,
94) -> Vec<ChunkBoundary> {
95 if source.trim().is_empty() {
96 return Vec::new();
97 }
98 if let Some(lang) = language
99 && let Some(tree) = parse_source(source, lang)
100 {
101 return chunk_tree(source, tree.root_node(), desired_length);
102 }
103 chunk_lines(source, desired_length)
104}
105
106/// Chunk source by lines, merging adjacent lines up to `desired_length`.
107///
108/// Mirrors `chunk_lines` from `core.py:112`. The fallback chunker for
109/// files without a tree-sitter grammar (or whose parse fails).
110#[must_use]
111pub fn chunk_lines(source: &str, desired_length: usize) -> Vec<ChunkBoundary> {
112 if source.trim().is_empty() {
113 return Vec::new();
114 }
115 // Build per-line boundaries (start..end_inclusive of '\n').
116 let mut line_boundaries: Vec<ChunkBoundary> = Vec::new();
117 let mut start = 0;
118 let mut line_no = 1;
119 for line in source.split_inclusive('\n') {
120 let end = start + line.len();
121 // Lines from split_inclusive may end in '\n'; tree-sitter convention is
122 // 1-based start; the line itself spans [start, end) bytes.
123 let line_count = line.bytes().filter(|&b| b == b'\n').count().max(1);
124 line_boundaries.push(ChunkBoundary {
125 start_byte: start,
126 end_byte: end,
127 start_line: line_no,
128 end_line: line_no + line_count - 1,
129 });
130 start = end;
131 line_no += line_count;
132 }
133 merge_adjacent_chunks(line_boundaries, desired_length, source)
134}
135
136// ---------------------------------------------------------------------------
137// Tree-sitter path.
138// ---------------------------------------------------------------------------
139
140fn parse_source(source: &str, language: &Language) -> Option<tree_sitter::Tree> {
141 let mut parser = Parser::new();
142 parser.set_language(language).ok()?;
143 parser.parse(source, None)
144}
145
146fn chunk_tree(source: &str, root: Node<'_>, desired_length: usize) -> Vec<ChunkBoundary> {
147 let raw = merge_node_inner(root, desired_length);
148 let with_lines = raw
149 .into_iter()
150 .map(|(start, end)| ChunkBoundary {
151 start_byte: start,
152 end_byte: end,
153 start_line: line_at_byte(source, start),
154 end_line: line_at_byte(source, end.saturating_sub(1).max(start)),
155 })
156 .collect();
157 merge_adjacent_chunks(with_lines, desired_length, source)
158}
159
160fn line_at_byte(source: &str, byte: usize) -> usize {
161 // 1-based line number: count newlines in the source up to `byte`.
162 // Slice on `as_bytes()` rather than the &str so we can land on any
163 // byte position — `&source[..n]` panics when `n` is mid-character
164 // (e.g., for a 3-byte glyph like '─' (U+2500) at bytes [k, k+1, k+2],
165 // `source[..k+2]` is a char-boundary violation). The callers compute
166 // `end.saturating_sub(1)` for end-line which often lands inside a
167 // multi-byte glyph; byte-level slicing is the right primitive.
168 let clamped = byte.min(source.len());
169 1 + bytecount::count(&source.as_bytes()[..clamped], b'\n')
170}
171
172/// Walk a node's children depth-first, packing siblings into spans up
173/// to `desired_length` bytes. Mirrors `_merge_node_inner` from
174/// `core.py:65`. Returns byte spans `(start, end)`.
175fn merge_node_inner(node: Node<'_>, desired_length: usize) -> Vec<(usize, usize)> {
176 let mut cursor = node.walk();
177 if !cursor.goto_first_child() {
178 // Leaf node — single span.
179 return vec![(node.start_byte(), node.end_byte())];
180 }
181
182 let mut groups: Vec<(usize, usize)> = Vec::new();
183 loop {
184 let child = cursor.node();
185 let start = child.start_byte();
186 let mut end = child.end_byte();
187 let mut length = end - start;
188
189 if length > desired_length {
190 // Recurse into oversized children.
191 groups.extend(merge_node_inner(child, desired_length));
192 if !cursor.goto_next_sibling() {
193 break;
194 }
195 continue;
196 }
197
198 // Extend the current group with subsequent siblings that fit.
199 while let Some(next) = peek_next_sibling(&cursor) {
200 let next_length = next.end_byte() - next.start_byte();
201 if length + next_length > desired_length {
202 break;
203 }
204 cursor.goto_next_sibling();
205 end = cursor.node().end_byte();
206 length += next_length;
207 }
208
209 groups.push((start, end));
210 if !cursor.goto_next_sibling() {
211 break;
212 }
213 }
214 groups
215}
216
217/// Peek at the next sibling without advancing the cursor.
218///
219/// `TreeCursor` has no native peek; clone the cursor, advance, and read
220/// the node. Tree-sitter cursors are cheap to clone.
221fn peek_next_sibling<'a>(cursor: &TreeCursor<'a>) -> Option<Node<'a>> {
222 let mut probe = cursor.clone();
223 if probe.goto_next_sibling() {
224 Some(probe.node())
225 } else {
226 None
227 }
228}
229
230/// Merge adjacent chunks up to `desired_length`. Mirrors
231/// `_merge_adjacent_chunks` from `core.py:35`. Operates on byte spans.
232fn merge_adjacent_chunks(
233 chunks: Vec<ChunkBoundary>,
234 desired_length: usize,
235 source: &str,
236) -> Vec<ChunkBoundary> {
237 if chunks.is_empty() {
238 return chunks;
239 }
240 let mut merged: Vec<ChunkBoundary> = Vec::new();
241 let mut current = chunks[0].clone();
242 let mut current_length = current.end_byte - current.start_byte;
243
244 for next in chunks.into_iter().skip(1) {
245 let nlen = next.end_byte - next.start_byte;
246 if current_length + nlen > desired_length {
247 merged.push(current);
248 current = next;
249 current_length = nlen;
250 } else {
251 current.end_byte = next.end_byte;
252 current.end_line = next.end_line;
253 current_length += nlen;
254 }
255 }
256 merged.push(current);
257 // Recompute end_line at boundaries to be safe (the byte-span recipe
258 // can leave end_line stale when merging across multi-line blocks).
259 for b in &mut merged {
260 b.end_line = line_at_byte(source, b.end_byte.saturating_sub(1).max(b.start_byte));
261 }
262 merged
263}
264
265#[cfg(test)]
266mod tests {
267 use super::*;
268
269 /// `test:chunker-line-fallback` — line-merge fallback packs lines up
270 /// to the budget; lines longer than the budget get their own chunk.
271 #[test]
272 fn chunker_line_fallback() {
273 let source = "aaa\nbbbbb\nccc\nddd\n";
274 // budget=8 should merge "aaa\n" (4) + "ccc\n" (4)? No — adjacency.
275 // Actually: line1=4, line2=6 (bbbbb\n), line3=4, line4=4.
276 // current=line1 (len 4). next line2 (6). 4+6=10 > 8, flush. current=line2.
277 // next line3 (4). 6+4=10 > 8, flush. current=line3.
278 // next line4 (4). 4+4=8 <= 8, merge. current=line3+line4 (8).
279 let chunks = chunk_lines(source, 8);
280 assert_eq!(chunks.len(), 3);
281 assert_eq!(chunks[0].content(source), "aaa\n");
282 assert_eq!(chunks[1].content(source), "bbbbb\n");
283 assert_eq!(chunks[2].content(source), "ccc\nddd\n");
284 }
285
286 /// Empty input returns empty.
287 #[test]
288 fn empty_input_returns_empty() {
289 assert_eq!(chunk_lines("", 100), Vec::<ChunkBoundary>::new());
290 assert_eq!(chunk_lines(" \n ", 100), Vec::<ChunkBoundary>::new());
291 assert_eq!(chunk_source("", None, 100), Vec::<ChunkBoundary>::new());
292 }
293
294 /// `test:chunker-merge-adjacent` — adjacent small AST nodes pack
295 /// together up to the byte budget. Uses a Python source that has
296 /// short adjacent function definitions.
297 #[test]
298 fn chunker_merge_adjacent() {
299 let lang: Language = tree_sitter_python::LANGUAGE.into();
300 // Three small functions; combined ~80 chars. With budget 200 they should
301 // merge into one chunk. With budget 50, each stands alone.
302 let source = "def a():\n return 1\n\ndef b():\n return 2\n\ndef c():\n return 3\n";
303 let combined = chunk_source(source, Some(&lang), 200);
304 let split = chunk_source(source, Some(&lang), 50);
305 assert!(
306 combined.len() <= split.len(),
307 "larger budget should produce fewer-or-equal chunks; got combined={} split={}",
308 combined.len(),
309 split.len()
310 );
311 // The content cover the whole source modulo tree-sitter trailing-whitespace
312 // semantics.
313 let combined_chars: usize = combined.iter().map(|c| c.end_byte - c.start_byte).sum();
314 assert!(
315 combined_chars >= source.len() - 4,
316 "chunks should cover ~all of source ({combined_chars} of {})",
317 source.len()
318 );
319 }
320
321 /// `test:chunker-recurses-oversized-nodes` — a single oversized
322 /// node splits into multiple smaller chunks.
323 #[test]
324 fn chunker_recurses_oversized_nodes() {
325 let lang: Language = tree_sitter_python::LANGUAGE.into();
326 // Build a function with many statements so its body exceeds the budget;
327 // tree-sitter should let merge_node_inner recurse into the function body
328 // and split it.
329 let mut body = String::new();
330 for i in 0..40 {
331 use std::fmt::Write;
332 let _ = writeln!(&mut body, " x{i} = {i}");
333 }
334 let source = format!("def big_function():\n{body}");
335 let chunks = chunk_source(&source, Some(&lang), 100);
336 assert!(
337 chunks.len() > 1,
338 "oversized function should split into multiple chunks; got {}",
339 chunks.len()
340 );
341 }
342
343 /// `property:chunker-parity-python` — chunks cover the full source
344 /// without overlap and without leaving gaps (modulo whitespace).
345 #[test]
346 fn property_chunker_parity_python() {
347 let lang: Language = tree_sitter_python::LANGUAGE.into();
348 let source = "def a():\n pass\n\ndef b():\n pass\n\nclass C:\n def m(self):\n pass\n";
349 let chunks = chunk_source(source, Some(&lang), 1500);
350 // Coverage: chunks should be non-overlapping when sorted by start_byte.
351 let mut prev_end = 0;
352 for c in &chunks {
353 assert!(
354 c.start_byte >= prev_end,
355 "chunk overlap: prev_end={prev_end} c.start_byte={}",
356 c.start_byte
357 );
358 prev_end = c.end_byte;
359 }
360 assert!(prev_end <= source.len(), "chunks extend past source");
361 }
362}