1use std::path::Path;
7use std::sync::atomic::{AtomicU64, Ordering};
8
9use tree_sitter::Node;
10
11use crate::error::ParserError;
12use crate::parser::treesitter::SupportedLanguage;
13use crate::parser::{ChunkKind, CodeChunk, ParseResult};
14
/// Process-wide source of unique chunk ids; starts at 1 so 0 can mean "unset".
static CHUNK_ID_COUNTER: AtomicU64 = AtomicU64::new(1);

/// Returns the next unique chunk id.
///
/// `Relaxed` ordering is sufficient: ids only need to be unique, and
/// `fetch_add` guarantees uniqueness regardless of memory ordering.
fn next_chunk_id() -> u64 {
    CHUNK_ID_COUNTER.fetch_add(1, Ordering::Relaxed)
}

/// Chunks spanning fewer lines than this are dropped as noise.
const MIN_CHUNK_LINES: usize = 2;

/// Number of lines per chunk when falling back to fixed-size line chunking.
const FALLBACK_CHUNK_SIZE: usize = 50;
28
29pub fn chunk_file(
31 path: &Path,
32 source: &str,
33 lang: SupportedLanguage,
34) -> Result<ParseResult, ParserError> {
35 let tree = crate::parser::treesitter::parse_source(source, lang)?;
36 let root = tree.root_node();
37
38 let chunk_kinds = lang.chunk_node_kinds();
39
40 let mut chunks = Vec::new();
41
42 if chunk_kinds.is_empty() {
43 if source.lines().count() <= FALLBACK_CHUNK_SIZE * 2 {
46 chunks.push(CodeChunk {
47 id: next_chunk_id(),
48 file_path: path.to_path_buf(),
49 language: lang.name().to_string(),
50 kind: ChunkKind::Block,
51 name: path.file_name().and_then(|f| f.to_str()).map(String::from),
52 signature: None,
53 doc_comment: None,
54 body: source.to_string(),
55 byte_range: 0..source.len(),
56 line_range: 0..source.lines().count(),
57 });
58 }
59 } else {
60 extract_chunks_recursive(&root, source, path, lang, chunk_kinds, &mut chunks);
62
63 if chunks.is_empty() {
65 chunks = fallback_line_chunks(path, source, lang);
66 }
67 }
68
69 Ok(ParseResult {
70 chunks,
71 language: lang.name().to_string(),
72 })
73}
74
75fn extract_chunks_recursive(
77 node: &Node,
78 source: &str,
79 file_path: &Path,
80 lang: SupportedLanguage,
81 chunk_kinds: &[&str],
82 chunks: &mut Vec<CodeChunk>,
83) {
84 let kind = node.kind();
85
86 if chunk_kinds.contains(&kind) {
87 if let Some(chunk) = node_to_chunk(node, source, file_path, lang) {
88 let line_count = chunk.line_range.end - chunk.line_range.start;
90 if line_count >= MIN_CHUNK_LINES {
91 chunks.push(chunk);
92 }
93 }
94 if should_recurse_into(kind) {
98 let mut cursor = node.walk();
99 if cursor.goto_first_child() {
100 loop {
101 let child = cursor.node();
102 extract_chunks_recursive(&child, source, file_path, lang, chunk_kinds, chunks);
103 if !cursor.goto_next_sibling() {
104 break;
105 }
106 }
107 }
108 }
109 } else {
110 let mut cursor = node.walk();
112 if cursor.goto_first_child() {
113 loop {
114 let child = cursor.node();
115 extract_chunks_recursive(&child, source, file_path, lang, chunk_kinds, chunks);
116 if !cursor.goto_next_sibling() {
117 break;
118 }
119 }
120 }
121 }
122}
123
/// Reports whether `kind` is a container node (class, impl, module, ...)
/// whose children may hold further chunkable definitions.
fn should_recurse_into(kind: &str) -> bool {
    // Container kinds across the supported grammars.
    const CONTAINER_KINDS: &[&str] = &[
        "impl_item",
        "class_declaration",
        "class_definition",
        "class_specifier",
        "interface_declaration",
        "namespace_definition",
        "module",
        "mod_item",
        "export_statement",
        "decorated_definition",
    ];
    CONTAINER_KINDS.contains(&kind)
}
140
141fn node_to_chunk(
143 node: &Node,
144 source: &str,
145 file_path: &Path,
146 lang: SupportedLanguage,
147) -> Option<CodeChunk> {
148 let start_byte = node.start_byte();
149 let end_byte = node.end_byte();
150
151 if end_byte <= start_byte || end_byte > source.len() {
152 return None;
153 }
154
155 let body = source[start_byte..end_byte].to_string();
156 let start_line = node.start_position().row;
157 let end_line = node.end_position().row + 1; let kind = classify_node_kind(node.kind(), lang);
160 let name = extract_node_name(node, source);
161 let signature = extract_signature(node, source, lang);
162 let doc_comment = extract_doc_comment(node, source, start_line);
163
164 Some(CodeChunk {
165 id: next_chunk_id(),
166 file_path: file_path.to_path_buf(),
167 language: lang.name().to_string(),
168 kind,
169 name,
170 signature,
171 doc_comment,
172 body,
173 byte_range: start_byte..end_byte,
174 line_range: start_line..end_line,
175 })
176}
177
178fn classify_node_kind(ts_kind: &str, _lang: SupportedLanguage) -> ChunkKind {
180 match ts_kind {
181 "function_item" | "function_definition" | "function_declaration" | "arrow_function" => {
183 ChunkKind::Function
184 }
185 "method_definition"
187 | "method_declaration"
188 | "method"
189 | "singleton_method"
190 | "constructor_declaration" => ChunkKind::Method,
191 "class_declaration" | "class_definition" | "class_specifier" => ChunkKind::Class,
193 "struct_item" | "struct_specifier" => ChunkKind::Struct,
195 "enum_item" | "enum_declaration" | "enum_specifier" => ChunkKind::Enum,
197 "interface_declaration" | "trait_item" => ChunkKind::Interface,
199 "mod_item" | "namespace_definition" | "module" => ChunkKind::Module,
201 "impl_item" => ChunkKind::Module,
203 _ => ChunkKind::Block,
205 }
206}
207
208fn extract_node_name(node: &Node, source: &str) -> Option<String> {
210 for field_name in &["name", "declarator"] {
212 if let Some(name_node) = node.child_by_field_name(field_name) {
213 let name = &source[name_node.start_byte()..name_node.end_byte()];
214 return Some(name.to_string());
215 }
216 }
217
218 let mut cursor = node.walk();
220 if cursor.goto_first_child() {
221 loop {
222 let child = cursor.node();
223 if child.kind() == "identifier" || child.kind() == "type_identifier" {
224 let name = &source[child.start_byte()..child.end_byte()];
225 return Some(name.to_string());
226 }
227 if !cursor.goto_next_sibling() {
228 break;
229 }
230 }
231 }
232
233 None
234}
235
236fn extract_signature(node: &Node, source: &str, _lang: SupportedLanguage) -> Option<String> {
238 let body = &source[node.start_byte()..node.end_byte()];
239
240 if let Some(pos) = body.find('{') {
242 let sig = body[..pos].trim();
243 if !sig.is_empty() {
244 return Some(sig.to_string());
245 }
246 }
247
248 if let Some(pos) = body.find(':') {
250 let before_colon = &body[..pos];
252 if before_colon.contains("def ") || before_colon.contains("class ") {
253 let sig = body[..=pos].trim();
254 if !sig.is_empty() {
255 return Some(sig.to_string());
256 }
257 }
258 }
259
260 let first_line = body.lines().next().map(|l| l.trim().to_string());
262 first_line.filter(|l| !l.is_empty())
263}
264
265fn extract_doc_comment(node: &Node, source: &str, _node_start_line: usize) -> Option<String> {
267 let mut prev = node.prev_sibling();
269 let mut comments = Vec::new();
270
271 while let Some(sibling) = prev {
272 let kind = sibling.kind();
273 if kind == "line_comment" || kind == "comment" || kind == "block_comment" {
274 let text = &source[sibling.start_byte()..sibling.end_byte()];
275 comments.push(text.to_string());
276 prev = sibling.prev_sibling();
277 } else {
278 break;
279 }
280 }
281
282 if comments.is_empty() {
283 return None;
284 }
285
286 comments.reverse();
288 let combined = comments.join("\n");
289
290 let cleaned: String = combined
292 .lines()
293 .map(|line| {
294 let trimmed = line.trim();
295 if let Some(stripped) = trimmed.strip_prefix("///") {
296 stripped.trim().to_string()
297 } else if let Some(stripped) = trimmed.strip_prefix("//!") {
298 stripped.trim().to_string()
299 } else if let Some(stripped) = trimmed.strip_prefix("//") {
300 stripped.trim().to_string()
301 } else if let Some(stripped) = trimmed.strip_prefix('#') {
302 stripped.trim().to_string()
303 } else {
304 trimmed.to_string()
305 }
306 })
307 .collect::<Vec<_>>()
308 .join("\n");
309
310 if cleaned.trim().is_empty() {
311 None
312 } else {
313 Some(cleaned)
314 }
315}
316
317fn fallback_line_chunks(file_path: &Path, source: &str, lang: SupportedLanguage) -> Vec<CodeChunk> {
319 let lines: Vec<&str> = source.lines().collect();
320 let total_lines = lines.len();
321
322 if total_lines == 0 {
323 return Vec::new();
324 }
325
326 let mut chunks = Vec::new();
327 let mut offset = 0;
328
329 for chunk_start in (0..total_lines).step_by(FALLBACK_CHUNK_SIZE) {
330 let chunk_end = (chunk_start + FALLBACK_CHUNK_SIZE).min(total_lines);
331 let chunk_lines = &lines[chunk_start..chunk_end];
332 let body = chunk_lines.join("\n");
333 let byte_start = offset;
334 let byte_end = offset + body.len();
335 offset = byte_end + 1; chunks.push(CodeChunk {
338 id: next_chunk_id(),
339 file_path: file_path.to_path_buf(),
340 language: lang.name().to_string(),
341 kind: ChunkKind::Block,
342 name: Some(format!(
343 "{}:L{}-L{}",
344 file_path.file_name().unwrap_or_default().to_string_lossy(),
345 chunk_start + 1,
346 chunk_end
347 )),
348 signature: None,
349 doc_comment: None,
350 body,
351 byte_range: byte_start..byte_end,
352 line_range: chunk_start..chunk_end,
353 });
354 }
355
356 chunks
357}
358
359pub fn chunk_file_from_path(path: &Path) -> Result<Option<ParseResult>, ParserError> {
361 let lang = match SupportedLanguage::from_path(path) {
362 Some(l) => l,
363 None => return Ok(None), };
365
366 let source = std::fs::read_to_string(path).map_err(ParserError::Io)?;
367
368 if crate::scanner::filter::is_binary_content(source.as_bytes()) {
370 return Ok(None);
371 }
372
373 let result = chunk_file(path, &source, lang)?;
374 Ok(Some(result))
375}
376
#[cfg(test)]
mod tests {
    use super::*;

    // Rust end-to-end: chunking should surface the free function and carry
    // the doc comment + signature extracted around it.
    #[test]
    fn test_chunk_rust_file() {
        let source = r#"
/// A greeting function.
fn greet(name: &str) -> String {
    format!("Hello, {}!", name)
}

/// A struct.
struct User {
    name: String,
    age: u32,
}

impl User {
    fn new(name: String, age: u32) -> Self {
        Self { name, age }
    }

    fn display(&self) -> String {
        format!("{} ({})", self.name, self.age)
    }
}
"#;
        let result = chunk_file(Path::new("test.rs"), source, SupportedLanguage::Rust).unwrap();

        assert_eq!(result.language, "rust");
        assert!(
            !result.chunks.is_empty(),
            "Should find at least some chunks"
        );

        // The free function should be discoverable by name...
        let greet = result
            .chunks
            .iter()
            .find(|c| c.name.as_deref() == Some("greet"));
        assert!(greet.is_some(), "Should find greet function");

        // ...and classified with its extracted metadata attached.
        let greet = greet.unwrap();
        assert_eq!(greet.kind, ChunkKind::Function);
        assert!(greet.doc_comment.is_some(), "Should extract doc comment");
        assert!(greet.signature.is_some(), "Should extract signature");
    }

    // Python end-to-end: class and def nodes should produce chunks.
    #[test]
    fn test_chunk_python_file() {
        let source = r#"
class Calculator:
    """A simple calculator."""

    def add(self, a: int, b: int) -> int:
        """Add two numbers."""
        return a + b

    def subtract(self, a: int, b: int) -> int:
        return a - b

def standalone_function(x: str) -> bool:
    return len(x) > 0
"#;
        let result = chunk_file(Path::new("calc.py"), source, SupportedLanguage::Python).unwrap();

        assert_eq!(result.language, "python");
        assert!(!result.chunks.is_empty());
    }

    // JavaScript end-to-end: functions and class methods should chunk.
    #[test]
    fn test_chunk_javascript_file() {
        let source = r#"
function fetchData(url) {
    return fetch(url).then(r => r.json());
}

class EventEmitter {
    constructor() {
        this.listeners = {};
    }

    on(event, callback) {
        this.listeners[event] = callback;
    }
}
"#;
        let result =
            chunk_file(Path::new("app.js"), source, SupportedLanguage::JavaScript).unwrap();

        assert_eq!(result.language, "javascript");
        assert!(!result.chunks.is_empty());
    }

    // 90-line shell script: exercises the non-syntax chunking paths
    // (presumably Bash declares no chunk node kinds — TODO confirm against
    // SupportedLanguage::chunk_node_kinds); either way some chunks must
    // come back.
    #[test]
    fn test_fallback_chunking() {
        let source = "#!/bin/bash\necho 'hello'\necho 'world'\n".repeat(30);
        let result = chunk_file(Path::new("script.sh"), &source, SupportedLanguage::Bash).unwrap();

        assert!(!result.chunks.is_empty());
    }
}