1use crate::error::Result;
2use crate::types::{ByteRange, Chunk, ChunkId, ChunkKind};
3use argyph_fs::Language;
4use camino::Utf8PathBuf;
5use tree_sitter::Node;
6
7pub fn ast_chunks<F, G>(
14 path: &Utf8PathBuf,
15 root: &Node,
16 source: &str,
17 language: Language,
18 max_chunk_size: usize,
19 kind_for_node: F,
20 is_boundary: G,
21) -> Result<Vec<Chunk>>
22where
23 F: Fn(&str) -> ChunkKind,
24 G: Fn(&str) -> bool,
25{
26 let source_len = source.len();
27 if source_len == 0 {
28 return Ok(Vec::new());
29 }
30
31 let mut boundaries: Vec<(usize, usize)> = Vec::new();
32 collect_boundaries(*root, &is_boundary, &mut boundaries);
33 boundaries.sort_by_key(|b| b.0);
34
35 let mut chunks = Vec::new();
36 let mut cursor: usize = 0;
37
38 for &(start, end) in &boundaries {
39 if start > cursor {
40 let gap_text = &source[cursor..start];
41 if !gap_text.trim().is_empty() {
42 for chunk in char_split(path, gap_text, cursor, language, max_chunk_size) {
43 chunks.push(chunk);
44 }
45 }
46 }
47
48 let node_text = &source[start..end];
49 if node_text.len() <= max_chunk_size {
50 let node = find_node_at(*root, start, end);
51 let kind = node
52 .map(|n| kind_for_node(n.kind()))
53 .unwrap_or(ChunkKind::TopLevel);
54 let id = ChunkId::from_text(node_text);
55 chunks.push(Chunk {
56 id,
57 file: path.clone(),
58 range: ByteRange::new(start, end),
59 text: node_text.to_string(),
60 kind,
61 language,
62 });
63 } else {
64 for chunk in char_split(path, node_text, start, language, max_chunk_size) {
65 chunks.push(chunk);
66 }
67 }
68
69 cursor = end;
70 }
71
72 if cursor < source_len {
73 let remaining = &source[cursor..];
74 if !remaining.trim().is_empty() {
75 for chunk in char_split(path, remaining, cursor, language, max_chunk_size) {
76 chunks.push(chunk);
77 }
78 }
79 }
80
81 Ok(chunks)
82}
83
84fn collect_boundaries<F>(node: Node, is_boundary: &F, out: &mut Vec<(usize, usize)>)
85where
86 F: Fn(&str) -> bool,
87{
88 if is_boundary(node.kind()) {
89 let start = node.start_byte();
90 let end = node.end_byte();
91 if !out.iter().any(|&(s, e)| s <= start && e >= end) {
92 out.push((start, end));
93 return;
94 }
95 }
96 for i in 0..node.child_count() {
97 if let Some(child) = node.child(i as u32) {
98 collect_boundaries(child, is_boundary, out);
99 }
100 }
101}
102
103fn find_node_at<'a>(root: Node<'a>, start: usize, end: usize) -> Option<Node<'a>> {
104 if root.start_byte() == start && root.end_byte() == end {
105 return Some(root);
106 }
107 for i in 0..root.child_count() {
108 if let Some(child) = root.child(i as u32) {
109 if child.start_byte() <= start && child.end_byte() >= end {
110 if let Some(found) = find_node_at(child, start, end) {
111 return Some(found);
112 }
113 }
114 }
115 }
116 None
117}
118
119pub fn char_split(
122 path: &Utf8PathBuf,
123 text: &str,
124 offset: usize,
125 language: Language,
126 max_size: usize,
127) -> Vec<Chunk> {
128 let mut chunks = Vec::new();
129 let mut pos = 0;
130
131 while pos < text.len() {
132 let mut end = (pos + max_size).min(text.len());
133 while end > pos && !text.is_char_boundary(end) {
138 end -= 1;
139 }
140
141 let slice_end = if end < text.len() {
142 find_good_split(&text[pos..end]).unwrap_or(end - pos) + pos
143 } else {
144 end
145 };
146
147 let slice = &text[pos..slice_end];
148 let id = ChunkId::from_text(slice);
149 chunks.push(Chunk {
150 id,
151 file: path.clone(),
152 range: ByteRange::new(offset + pos, offset + slice_end),
153 text: slice.to_string(),
154 kind: ChunkKind::Fallback,
155 language,
156 });
157
158 pos = slice_end;
159 }
160
161 chunks
162}
163
/// Picks a byte offset at which `window` can be cut cleanly.
///
/// Returns the offset just past the last newline when that newline lies
/// strictly beyond the midpoint (`window.len() / 2`) of the window. Otherwise
/// the same rule is tried with the last space. Returns `None` when neither
/// separator qualifies.
fn find_good_split(window: &str) -> Option<usize> {
    let midpoint = window.len() / 2;
    // Only the last occurrence matters: any earlier one has a smaller index.
    let cut_after_last = |separator: char| {
        window
            .rfind(separator)
            .filter(|&idx| idx > midpoint)
            .map(|idx| idx + 1)
    };

    cut_after_last('\n').or_else(|| cut_after_last(' '))
}
177
178pub fn fallback_chunks(
180 path: &Utf8PathBuf,
181 source: &str,
182 max_size: usize,
183 language: Language,
184) -> Vec<Chunk> {
185 char_split(path, source, 0, language, max_size)
186}
187
#[cfg(test)]
mod tests {
    use super::*;

    /// Concatenates the chunk texts in order, to check that a split is lossless.
    fn reassemble(chunks: &[Chunk]) -> String {
        chunks.iter().map(|c| c.text.as_str()).collect()
    }

    /// Text without any separator is cut purely by size, and nothing is lost.
    #[test]
    fn char_split_produces_multiple_chunks() {
        let path = Utf8PathBuf::from("test.txt");
        let text = "a".repeat(5000);
        let chunks = char_split(&path, &text, 0, Language::Markdown, 1024);
        assert!(chunks.len() >= 5);
        for c in &chunks {
            assert!(c.text.len() <= 1024);
            assert_eq!(c.kind, ChunkKind::Fallback);
        }
        assert_eq!(reassemble(&chunks), text);
    }

    /// Empty input yields no chunks at all.
    #[test]
    fn char_split_empty_input() {
        let path = Utf8PathBuf::from("empty.txt");
        let chunks = char_split(&path, "", 0, Language::Markdown, 1024);
        assert!(chunks.is_empty());
    }

    /// The first 20-byte window contains a newline in its second half, so the
    /// first chunk must end right after that newline.
    #[test]
    fn char_split_splits_at_newline() {
        let path = Utf8PathBuf::from("test.txt");
        let text = "line one\nline two\nline three\nline four\n";
        let chunks = char_split(&path, text, 0, Language::Markdown, 20);
        assert!(chunks.len() >= 2);
        assert_eq!(chunks[0].text, "line one\nline two\n");
        assert_eq!(reassemble(&chunks), text);
    }

    /// Hashing the same text twice gives the same id.
    #[test]
    fn chunk_id_deterministic() {
        let a = ChunkId::from_text("hello world");
        let b = ChunkId::from_text("hello world");
        assert_eq!(a, b);
    }

    /// Different texts give different ids.
    #[test]
    fn chunk_id_different_for_different_text() {
        let a = ChunkId::from_text("hello world");
        let b = ChunkId::from_text("goodbye world");
        assert_ne!(a, b);
    }

    /// A multi-byte character straddling the 1024-byte window edge must be
    /// pushed whole into the next chunk instead of being cut in half.
    #[test]
    fn char_split_handles_multibyte_utf8_at_window_edge() {
        let path = Utf8PathBuf::from("test.txt");
        let prefix = "a".repeat(1023);
        let text = format!("{prefix}тbcdefgh");
        let chunks = char_split(&path, &text, 0, Language::Markdown, 1024);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, prefix);
        for c in &chunks {
            assert!(c.text.len() <= 1024);
        }
        assert_eq!(reassemble(&chunks), text);
    }

    // NOTE(review): both inputs below are identical as written, so this only
    // re-checks determinism. Confirm whether the two literals were meant to
    // differ in whitespace.
    #[test]
    fn chunk_id_whitespace_normalized() {
        let a = ChunkId::from_text("hello world");
        let b = ChunkId::from_text("hello world");
        assert_eq!(a, b);
    }
}