argyph_parse/
chunker.rs

1use crate::error::Result;
2use crate::types::{ByteRange, Chunk, ChunkId, ChunkKind};
3use argyph_fs::Language;
4use camino::Utf8PathBuf;
5use tree_sitter::Node;
6
7/// Build AST-aware chunks around named definition nodes.
8///
9/// Named constructs (functions, classes, structs, etc.) become their own
10/// chunks when they fit within `max_chunk_size`. Text between constructs
11/// becomes `TopLevel` chunks. Any node exceeding `max_chunk_size` is split
12/// into `Fallback` character-based chunks.
13pub fn ast_chunks<F, G>(
14    path: &Utf8PathBuf,
15    root: &Node,
16    source: &str,
17    language: Language,
18    max_chunk_size: usize,
19    kind_for_node: F,
20    is_boundary: G,
21) -> Result<Vec<Chunk>>
22where
23    F: Fn(&str) -> ChunkKind,
24    G: Fn(&str) -> bool,
25{
26    let source_len = source.len();
27    if source_len == 0 {
28        return Ok(Vec::new());
29    }
30
31    let mut boundaries: Vec<(usize, usize)> = Vec::new();
32    collect_boundaries(*root, &is_boundary, &mut boundaries);
33    boundaries.sort_by_key(|b| b.0);
34
35    let mut chunks = Vec::new();
36    let mut cursor: usize = 0;
37
38    for &(start, end) in &boundaries {
39        if start > cursor {
40            let gap_text = &source[cursor..start];
41            if !gap_text.trim().is_empty() {
42                for chunk in char_split(path, gap_text, cursor, language, max_chunk_size) {
43                    chunks.push(chunk);
44                }
45            }
46        }
47
48        let node_text = &source[start..end];
49        if node_text.len() <= max_chunk_size {
50            let node = find_node_at(*root, start, end);
51            let kind = node
52                .map(|n| kind_for_node(n.kind()))
53                .unwrap_or(ChunkKind::TopLevel);
54            let id = ChunkId::from_text(node_text);
55            chunks.push(Chunk {
56                id,
57                file: path.clone(),
58                range: ByteRange::new(start, end),
59                text: node_text.to_string(),
60                kind,
61                language,
62            });
63        } else {
64            for chunk in char_split(path, node_text, start, language, max_chunk_size) {
65                chunks.push(chunk);
66            }
67        }
68
69        cursor = end;
70    }
71
72    if cursor < source_len {
73        let remaining = &source[cursor..];
74        if !remaining.trim().is_empty() {
75            for chunk in char_split(path, remaining, cursor, language, max_chunk_size) {
76                chunks.push(chunk);
77            }
78        }
79    }
80
81    Ok(chunks)
82}
83
84fn collect_boundaries<F>(node: Node, is_boundary: &F, out: &mut Vec<(usize, usize)>)
85where
86    F: Fn(&str) -> bool,
87{
88    if is_boundary(node.kind()) {
89        let start = node.start_byte();
90        let end = node.end_byte();
91        if !out.iter().any(|&(s, e)| s <= start && e >= end) {
92            out.push((start, end));
93            return;
94        }
95    }
96    for i in 0..node.child_count() {
97        if let Some(child) = node.child(i as u32) {
98            collect_boundaries(child, is_boundary, out);
99        }
100    }
101}
102
103fn find_node_at<'a>(root: Node<'a>, start: usize, end: usize) -> Option<Node<'a>> {
104    if root.start_byte() == start && root.end_byte() == end {
105        return Some(root);
106    }
107    for i in 0..root.child_count() {
108        if let Some(child) = root.child(i as u32) {
109            if child.start_byte() <= start && child.end_byte() >= end {
110                if let Some(found) = find_node_at(child, start, end) {
111                    return Some(found);
112                }
113            }
114        }
115    }
116    None
117}
118
119/// Character-based fallback chunks for text that has no AST structure or
120/// for oversized nodes.
121pub fn char_split(
122    path: &Utf8PathBuf,
123    text: &str,
124    offset: usize,
125    language: Language,
126    max_size: usize,
127) -> Vec<Chunk> {
128    let mut chunks = Vec::new();
129    let mut pos = 0;
130
131    while pos < text.len() {
132        let mut end = (pos + max_size).min(text.len());
133        // Walk `end` back to the nearest char boundary so we never
134        // slice through a multi-byte UTF-8 codepoint. Files with
135        // non-ASCII content (e.g. CSVs with smart quotes or
136        // identifiers in Cyrillic, CJK, etc.) used to panic here.
137        while end > pos && !text.is_char_boundary(end) {
138            end -= 1;
139        }
140
141        let slice_end = if end < text.len() {
142            find_good_split(&text[pos..end]).unwrap_or(end - pos) + pos
143        } else {
144            end
145        };
146
147        let slice = &text[pos..slice_end];
148        let id = ChunkId::from_text(slice);
149        chunks.push(Chunk {
150            id,
151            file: path.clone(),
152            range: ByteRange::new(offset + pos, offset + slice_end),
153            text: slice.to_string(),
154            kind: ChunkKind::Fallback,
155            language,
156        });
157
158        pos = slice_end;
159    }
160
161    chunks
162}
163
/// Pick a natural cut point inside `window`.
///
/// Returns the byte index just past the last newline located strictly
/// beyond the window's midpoint; failing that, just past the last space
/// beyond the midpoint; otherwise `None`.
fn find_good_split(window: &str) -> Option<usize> {
    let midpoint = window.len() / 2;
    // Newlines take priority over spaces. Only the last occurrence of each
    // separator can qualify, since any earlier one lies further left.
    ['\n', ' '].iter().find_map(|&separator| {
        window
            .rfind(separator)
            .filter(|&idx| idx > midpoint)
            .map(|idx| idx + 1)
    })
}
177
178/// Build fallback-only chunks for files with no recognized language.
179pub fn fallback_chunks(
180    path: &Utf8PathBuf,
181    source: &str,
182    max_size: usize,
183    language: Language,
184) -> Vec<Chunk> {
185    char_split(path, source, 0, language, max_size)
186}
187
#[cfg(test)]
mod tests {
    use super::*;

    /// Concatenate chunk texts in order; `char_split` output must rebuild
    /// its input exactly.
    fn reassemble(chunks: &[Chunk]) -> String {
        chunks.iter().map(|c| c.text.as_str()).collect()
    }

    #[test]
    fn char_split_produces_multiple_chunks() {
        let path = Utf8PathBuf::from("test.txt");
        let text = "a".repeat(5000);
        let chunks = char_split(&path, &text, 0, Language::Markdown, 1024);
        assert!(chunks.len() >= 5);
        for c in &chunks {
            // No chunk may exceed the requested window.
            assert!(c.text.len() <= 1024);
            assert_eq!(c.kind, ChunkKind::Fallback);
        }
        assert_eq!(reassemble(&chunks), text);
    }

    #[test]
    fn char_split_empty_input() {
        let path = Utf8PathBuf::from("empty.txt");
        let chunks = char_split(&path, "", 0, Language::Markdown, 1024);
        assert!(chunks.is_empty());
    }

    #[test]
    fn char_split_splits_at_newline() {
        let path = Utf8PathBuf::from("test.txt");
        let text = "line one\nline two\nline three\nline four\n";
        let chunks = char_split(&path, text, 0, Language::Markdown, 20);
        assert!(chunks.len() >= 2);
        // The first 20-byte window has a newline past its midpoint, so the
        // cut must land right after it.
        assert_eq!(chunks[0].text, "line one\nline two\n");
        assert_eq!(reassemble(&chunks), text);
    }

    #[test]
    fn chunk_id_deterministic() {
        let a = ChunkId::from_text("hello world");
        let b = ChunkId::from_text("hello world");
        assert_eq!(a, b);
    }

    #[test]
    fn chunk_id_different_for_different_text() {
        let a = ChunkId::from_text("hello world");
        let b = ChunkId::from_text("goodbye world");
        assert_ne!(a, b);
    }

    #[test]
    fn char_split_handles_multibyte_utf8_at_window_edge() {
        // Repro for the panic where `end` would land in the middle of
        // a multi-byte UTF-8 character (e.g. Cyrillic letters used in
        // ripgrep's benchsuite data files). The chunker must not
        // panic — it must walk back to a char boundary.
        let path = Utf8PathBuf::from("test.txt");
        // 1024-byte target window deliberately straddled by 'т' (2 bytes).
        let prefix = "a".repeat(1023);
        let text = format!("{prefix}тbcdefgh");
        let chunks = char_split(&path, &text, 0, Language::Markdown, 1024);
        assert!(!chunks.is_empty());
        // The cut must fall before the straddling character, never
        // inside it, and no bytes may be lost or duplicated.
        assert_eq!(chunks[0].text, prefix);
        assert_eq!(reassemble(&chunks), text);
    }

    #[test]
    fn chunk_id_whitespace_normalized() {
        let a = ChunkId::from_text("hello   world");
        let b = ChunkId::from_text("hello world");
        assert_eq!(a, b);
    }
}
argyph_parse/chunker.rs

argyph_parse/
chunker.rs