Skip to main content

ripvec_core/encoder/ripvec/
chunking.rs

1//! Tree-sitter AST-merge chunker (semble flavor).
2//!
3//! Port of `~/src/semble/src/semble/chunking/{core,chunking}.py`.
4//! Greedily walks AST children, packing siblings up to a 1500-char
5//! target before recursing into oversized nodes. Falls back to
6//! line-merge chunking for unsupported languages.
7//!
8//! The algorithm differs from ripvec's query-based chunker in
9//! [`crate::chunk`]: it does not extract specific definitions, it merges
10//! AST subtrees until they fill the size budget. This produces fewer,
11//! larger, more contextually-coherent chunks for semantic embedding.
12//!
13//! ## Boundary shape
14//!
15//! [`ChunkBoundary`] reports byte offsets (matching tree-sitter's
16//! native units) and 1-based line numbers (matching CodeChunk's
17//! convention). Callers usually pass [`ChunkBoundary::content`] +
18//! line numbers into [`CodeChunk`] construction.
19
20use tree_sitter::{Language, Node, Parser, TreeCursor};
21
22/// Default desired chunk length used by [`chunk_source`] when callers
23/// don't specify one (mirrors `_DESIRED_CHUNK_LENGTH_CHARS` from
24/// `chunking.py:10`).
25pub const DEFAULT_DESIRED_CHUNK_CHARS: usize = 1500;
26
27/// Inclusive byte span and (1-based) line range for one chunk.
28#[derive(Debug, Clone, PartialEq, Eq)]
29pub struct ChunkBoundary {
30    /// First byte of the chunk (UTF-8 byte offset).
31    pub start_byte: usize,
32    /// One-past-last byte of the chunk.
33    pub end_byte: usize,
34    /// 1-based start line.
35    pub start_line: usize,
36    /// 1-based end line.
37    pub end_line: usize,
38}
39
40impl ChunkBoundary {
41    /// Slice the chunk's content out of the source string.
42    ///
43    /// Caller guarantees `start_byte..end_byte` lies on UTF-8 boundaries
44    /// (tree-sitter node positions always do).
45    #[must_use]
46    pub fn content<'a>(&self, source: &'a str) -> &'a str {
47        // Clamp end_byte to source length defensively (in case a node's
48        // end position lies one past the source end, e.g., for trailing
49        // EOF tokens). Also walk back to the previous UTF-8 char
50        // boundary at each end so the slice never panics on
51        // multi-byte glyphs (tree-sitter node positions are byte-
52        // accurate but our adjacent-chunk merges occasionally widen the
53        // span; defence in depth).
54        let end = clamp_to_char_boundary(source, self.end_byte.min(source.len()), true);
55        let start = clamp_to_char_boundary(source, self.start_byte.min(end), false);
56        &source[start..end]
57    }
58}
59
60/// Walk `byte` to the nearest UTF-8 character boundary. `forward = true`
61/// rounds up (to the byte right after a multi-byte glyph); `false`
62/// rounds down (to the start of one). `&str::is_char_boundary` is the
63/// gate; the loop is bounded by a multi-byte glyph max width (4).
64fn clamp_to_char_boundary(source: &str, mut byte: usize, forward: bool) -> usize {
65    byte = byte.min(source.len());
66    for _ in 0..4 {
67        if source.is_char_boundary(byte) {
68            return byte;
69        }
70        if forward {
71            byte = (byte + 1).min(source.len());
72        } else {
73            byte = byte.saturating_sub(1);
74        }
75    }
76    byte
77}
78
79// ---------------------------------------------------------------------------
80// Public API.
81// ---------------------------------------------------------------------------
82
83/// Chunk source text via tree-sitter AST-merge or line-merge fallback.
84///
85/// `language = Some(lang)` parses with tree-sitter and runs the AST-merge
86/// algorithm. `language = None` runs the line-merge fallback.
87///
88/// Empty / whitespace-only input returns an empty `Vec`.
89#[must_use]
90pub fn chunk_source(
91    source: &str,
92    language: Option<&Language>,
93    desired_length: usize,
94) -> Vec<ChunkBoundary> {
95    if source.trim().is_empty() {
96        return Vec::new();
97    }
98    if let Some(lang) = language
99        && let Some(tree) = parse_source(source, lang)
100    {
101        return chunk_tree(source, tree.root_node(), desired_length);
102    }
103    chunk_lines(source, desired_length)
104}
105
106/// Chunk source by lines, merging adjacent lines up to `desired_length`.
107///
108/// Mirrors `chunk_lines` from `core.py:112`. The fallback chunker for
109/// files without a tree-sitter grammar (or whose parse fails).
110#[must_use]
111pub fn chunk_lines(source: &str, desired_length: usize) -> Vec<ChunkBoundary> {
112    if source.trim().is_empty() {
113        return Vec::new();
114    }
115    // Build per-line boundaries (start..end_inclusive of '\n').
116    let mut line_boundaries: Vec<ChunkBoundary> = Vec::new();
117    let mut start = 0;
118    let mut line_no = 1;
119    for line in source.split_inclusive('\n') {
120        let end = start + line.len();
121        // Lines from split_inclusive may end in '\n'; tree-sitter convention is
122        // 1-based start; the line itself spans [start, end) bytes.
123        let line_count = line.bytes().filter(|&b| b == b'\n').count().max(1);
124        line_boundaries.push(ChunkBoundary {
125            start_byte: start,
126            end_byte: end,
127            start_line: line_no,
128            end_line: line_no + line_count - 1,
129        });
130        start = end;
131        line_no += line_count;
132    }
133    merge_adjacent_chunks(line_boundaries, desired_length, source)
134}
135
136// ---------------------------------------------------------------------------
137// Tree-sitter path.
138// ---------------------------------------------------------------------------
139
140fn parse_source(source: &str, language: &Language) -> Option<tree_sitter::Tree> {
141    let mut parser = Parser::new();
142    parser.set_language(language).ok()?;
143    parser.parse(source, None)
144}
145
146fn chunk_tree(source: &str, root: Node<'_>, desired_length: usize) -> Vec<ChunkBoundary> {
147    let raw = merge_node_inner(root, desired_length);
148    let with_lines = raw
149        .into_iter()
150        .map(|(start, end)| ChunkBoundary {
151            start_byte: start,
152            end_byte: end,
153            start_line: line_at_byte(source, start),
154            end_line: line_at_byte(source, end.saturating_sub(1).max(start)),
155        })
156        .collect();
157    merge_adjacent_chunks(with_lines, desired_length, source)
158}
159
160fn line_at_byte(source: &str, byte: usize) -> usize {
161    // 1-based line number: count newlines in the source up to `byte`.
162    // Slice on `as_bytes()` rather than the &str so we can land on any
163    // byte position — `&source[..n]` panics when `n` is mid-character
164    // (e.g., for a 3-byte glyph like '─' (U+2500) at bytes [k, k+1, k+2],
165    // `source[..k+2]` is a char-boundary violation). The callers compute
166    // `end.saturating_sub(1)` for end-line which often lands inside a
167    // multi-byte glyph; byte-level slicing is the right primitive.
168    let clamped = byte.min(source.len());
169    1 + bytecount::count(&source.as_bytes()[..clamped], b'\n')
170}
171
172/// Walk a node's children depth-first, packing siblings into spans up
173/// to `desired_length` bytes. Mirrors `_merge_node_inner` from
174/// `core.py:65`. Returns byte spans `(start, end)`.
175fn merge_node_inner(node: Node<'_>, desired_length: usize) -> Vec<(usize, usize)> {
176    let mut cursor = node.walk();
177    if !cursor.goto_first_child() {
178        // Leaf node — single span.
179        return vec![(node.start_byte(), node.end_byte())];
180    }
181
182    let mut groups: Vec<(usize, usize)> = Vec::new();
183    loop {
184        let child = cursor.node();
185        let start = child.start_byte();
186        let mut end = child.end_byte();
187        let mut length = end - start;
188
189        if length > desired_length {
190            // Recurse into oversized children.
191            groups.extend(merge_node_inner(child, desired_length));
192            if !cursor.goto_next_sibling() {
193                break;
194            }
195            continue;
196        }
197
198        // Extend the current group with subsequent siblings that fit.
199        while let Some(next) = peek_next_sibling(&cursor) {
200            let next_length = next.end_byte() - next.start_byte();
201            if length + next_length > desired_length {
202                break;
203            }
204            cursor.goto_next_sibling();
205            end = cursor.node().end_byte();
206            length += next_length;
207        }
208
209        groups.push((start, end));
210        if !cursor.goto_next_sibling() {
211            break;
212        }
213    }
214    groups
215}
216
217/// Peek at the next sibling without advancing the cursor.
218///
219/// `TreeCursor` has no native peek; clone the cursor, advance, and read
220/// the node. Tree-sitter cursors are cheap to clone.
221fn peek_next_sibling<'a>(cursor: &TreeCursor<'a>) -> Option<Node<'a>> {
222    let mut probe = cursor.clone();
223    if probe.goto_next_sibling() {
224        Some(probe.node())
225    } else {
226        None
227    }
228}
229
230/// Merge adjacent chunks up to `desired_length`. Mirrors
231/// `_merge_adjacent_chunks` from `core.py:35`. Operates on byte spans.
232fn merge_adjacent_chunks(
233    chunks: Vec<ChunkBoundary>,
234    desired_length: usize,
235    source: &str,
236) -> Vec<ChunkBoundary> {
237    if chunks.is_empty() {
238        return chunks;
239    }
240    let mut merged: Vec<ChunkBoundary> = Vec::new();
241    let mut current = chunks[0].clone();
242    let mut current_length = current.end_byte - current.start_byte;
243
244    for next in chunks.into_iter().skip(1) {
245        let nlen = next.end_byte - next.start_byte;
246        if current_length + nlen > desired_length {
247            merged.push(current);
248            current = next;
249            current_length = nlen;
250        } else {
251            current.end_byte = next.end_byte;
252            current.end_line = next.end_line;
253            current_length += nlen;
254        }
255    }
256    merged.push(current);
257    // Recompute end_line at boundaries to be safe (the byte-span recipe
258    // can leave end_line stale when merging across multi-line blocks).
259    for b in &mut merged {
260        b.end_line = line_at_byte(source, b.end_byte.saturating_sub(1).max(b.start_byte));
261    }
262    merged
263}
264
265#[cfg(test)]
266mod tests {
267    use super::*;
268
269    /// `test:chunker-line-fallback` — line-merge fallback packs lines up
270    /// to the budget; lines longer than the budget get their own chunk.
271    #[test]
272    fn chunker_line_fallback() {
273        let source = "aaa\nbbbbb\nccc\nddd\n";
274        // budget=8 should merge "aaa\n" (4) + "ccc\n" (4)? No — adjacency.
275        // Actually: line1=4, line2=6 (bbbbb\n), line3=4, line4=4.
276        // current=line1 (len 4). next line2 (6). 4+6=10 > 8, flush. current=line2.
277        // next line3 (4). 6+4=10 > 8, flush. current=line3.
278        // next line4 (4). 4+4=8 <= 8, merge. current=line3+line4 (8).
279        let chunks = chunk_lines(source, 8);
280        assert_eq!(chunks.len(), 3);
281        assert_eq!(chunks[0].content(source), "aaa\n");
282        assert_eq!(chunks[1].content(source), "bbbbb\n");
283        assert_eq!(chunks[2].content(source), "ccc\nddd\n");
284    }
285
286    /// Empty input returns empty.
287    #[test]
288    fn empty_input_returns_empty() {
289        assert_eq!(chunk_lines("", 100), Vec::<ChunkBoundary>::new());
290        assert_eq!(chunk_lines("   \n  ", 100), Vec::<ChunkBoundary>::new());
291        assert_eq!(chunk_source("", None, 100), Vec::<ChunkBoundary>::new());
292    }
293
294    /// `test:chunker-merge-adjacent` — adjacent small AST nodes pack
295    /// together up to the byte budget. Uses a Python source that has
296    /// short adjacent function definitions.
297    #[test]
298    fn chunker_merge_adjacent() {
299        let lang: Language = tree_sitter_python::LANGUAGE.into();
300        // Three small functions; combined ~80 chars. With budget 200 they should
301        // merge into one chunk. With budget 50, each stands alone.
302        let source = "def a():\n    return 1\n\ndef b():\n    return 2\n\ndef c():\n    return 3\n";
303        let combined = chunk_source(source, Some(&lang), 200);
304        let split = chunk_source(source, Some(&lang), 50);
305        assert!(
306            combined.len() <= split.len(),
307            "larger budget should produce fewer-or-equal chunks; got combined={} split={}",
308            combined.len(),
309            split.len()
310        );
311        // The content cover the whole source modulo tree-sitter trailing-whitespace
312        // semantics.
313        let combined_chars: usize = combined.iter().map(|c| c.end_byte - c.start_byte).sum();
314        assert!(
315            combined_chars >= source.len() - 4,
316            "chunks should cover ~all of source ({combined_chars} of {})",
317            source.len()
318        );
319    }
320
321    /// `test:chunker-recurses-oversized-nodes` — a single oversized
322    /// node splits into multiple smaller chunks.
323    #[test]
324    fn chunker_recurses_oversized_nodes() {
325        let lang: Language = tree_sitter_python::LANGUAGE.into();
326        // Build a function with many statements so its body exceeds the budget;
327        // tree-sitter should let merge_node_inner recurse into the function body
328        // and split it.
329        let mut body = String::new();
330        for i in 0..40 {
331            use std::fmt::Write;
332            let _ = writeln!(&mut body, "    x{i} = {i}");
333        }
334        let source = format!("def big_function():\n{body}");
335        let chunks = chunk_source(&source, Some(&lang), 100);
336        assert!(
337            chunks.len() > 1,
338            "oversized function should split into multiple chunks; got {}",
339            chunks.len()
340        );
341    }
342
343    /// `property:chunker-parity-python` — chunks cover the full source
344    /// without overlap and without leaving gaps (modulo whitespace).
345    #[test]
346    fn property_chunker_parity_python() {
347        let lang: Language = tree_sitter_python::LANGUAGE.into();
348        let source = "def a():\n    pass\n\ndef b():\n    pass\n\nclass C:\n    def m(self):\n        pass\n";
349        let chunks = chunk_source(source, Some(&lang), 1500);
350        // Coverage: chunks should be non-overlapping when sorted by start_byte.
351        let mut prev_end = 0;
352        for c in &chunks {
353            assert!(
354                c.start_byte >= prev_end,
355                "chunk overlap: prev_end={prev_end} c.start_byte={}",
356                c.start_byte
357            );
358            prev_end = c.end_byte;
359        }
360        assert!(prev_end <= source.len(), "chunks extend past source");
361    }
362}