1use std::path::Path;
7use std::sync::atomic::{AtomicU64, Ordering};
8
9use tree_sitter::Node;
10
11use crate::error::ParserError;
12use crate::parser::treesitter::SupportedLanguage;
13use crate::parser::{ChunkKind, CodeChunk, ParseResult};
14
/// Process-wide counter backing [`next_chunk_id`]; starts at 1 so id 0 is never issued.
static CHUNK_ID_COUNTER: AtomicU64 = AtomicU64::new(1);

/// Hands out a fresh, unique chunk id.
///
/// `Relaxed` ordering is sufficient here: the ids only need to be unique,
/// not to establish any happens-before relationship with other memory.
fn next_chunk_id() -> u64 {
    CHUNK_ID_COUNTER.fetch_add(1, Ordering::Relaxed)
}
22
/// Chunks spanning fewer than this many lines are discarded as noise
/// (one-line stubs, forward declarations, etc.).
const MIN_CHUNK_LINES: usize = 2;

/// Number of lines per chunk when falling back to fixed-size line chunking.
const FALLBACK_CHUNK_SIZE: usize = 50;
28
29pub fn chunk_file(
31 path: &Path,
32 source: &str,
33 lang: SupportedLanguage,
34) -> Result<ParseResult, ParserError> {
35 let tree = crate::parser::treesitter::parse_source(source, lang)?;
36 let root = tree.root_node();
37
38 let chunk_kinds = lang.chunk_node_kinds();
39
40 let mut chunks = Vec::new();
41
42 if chunk_kinds.is_empty() {
43 if source.lines().count() <= FALLBACK_CHUNK_SIZE * 2 {
46 chunks.push(CodeChunk {
47 id: next_chunk_id(),
48 file_path: path.to_path_buf(),
49 language: lang.name().to_string(),
50 kind: ChunkKind::Block,
51 name: path.file_name().and_then(|f| f.to_str()).map(String::from),
52 signature: None,
53 doc_comment: None,
54 body: source.to_string(),
55 byte_range: 0..source.len(),
56 line_range: 0..source.lines().count(),
57 });
58 }
59 } else {
60 extract_chunks_recursive(
62 &root,
63 source,
64 path,
65 lang,
66 chunk_kinds,
67 &mut chunks,
68 );
69
70 if chunks.is_empty() {
72 chunks = fallback_line_chunks(path, source, lang);
73 }
74 }
75
76 Ok(ParseResult {
77 chunks,
78 language: lang.name().to_string(),
79 })
80}
81
82fn extract_chunks_recursive(
84 node: &Node,
85 source: &str,
86 file_path: &Path,
87 lang: SupportedLanguage,
88 chunk_kinds: &[&str],
89 chunks: &mut Vec<CodeChunk>,
90) {
91 let kind = node.kind();
92
93 if chunk_kinds.contains(&kind) {
94 if let Some(chunk) = node_to_chunk(node, source, file_path, lang) {
95 let line_count = chunk.line_range.end - chunk.line_range.start;
97 if line_count >= MIN_CHUNK_LINES {
98 chunks.push(chunk);
99 }
100 }
101 if should_recurse_into(kind) {
105 let mut cursor = node.walk();
106 if cursor.goto_first_child() {
107 loop {
108 let child = cursor.node();
109 extract_chunks_recursive(
110 &child, source, file_path, lang, chunk_kinds, chunks,
111 );
112 if !cursor.goto_next_sibling() {
113 break;
114 }
115 }
116 }
117 }
118 } else {
119 let mut cursor = node.walk();
121 if cursor.goto_first_child() {
122 loop {
123 let child = cursor.node();
124 extract_chunks_recursive(
125 &child, source, file_path, lang, chunk_kinds, chunks,
126 );
127 if !cursor.goto_next_sibling() {
128 break;
129 }
130 }
131 }
132 }
133}
134
/// Returns `true` for container node kinds whose children may themselves be
/// chunkable (e.g. methods inside an `impl` block or a class body).
fn should_recurse_into(kind: &str) -> bool {
    /// Node kinds, across all supported grammars, that act as containers.
    const CONTAINER_KINDS: &[&str] = &[
        "impl_item",
        "class_declaration",
        "class_definition",
        "class_specifier",
        "interface_declaration",
        "namespace_definition",
        "module",
        "mod_item",
        "export_statement",
        "decorated_definition",
    ];
    CONTAINER_KINDS.contains(&kind)
}
151
152fn node_to_chunk(
154 node: &Node,
155 source: &str,
156 file_path: &Path,
157 lang: SupportedLanguage,
158) -> Option<CodeChunk> {
159 let start_byte = node.start_byte();
160 let end_byte = node.end_byte();
161
162 if end_byte <= start_byte || end_byte > source.len() {
163 return None;
164 }
165
166 let body = source[start_byte..end_byte].to_string();
167 let start_line = node.start_position().row;
168 let end_line = node.end_position().row + 1; let kind = classify_node_kind(node.kind(), lang);
171 let name = extract_node_name(node, source);
172 let signature = extract_signature(node, source, lang);
173 let doc_comment = extract_doc_comment(node, source, start_line);
174
175 Some(CodeChunk {
176 id: next_chunk_id(),
177 file_path: file_path.to_path_buf(),
178 language: lang.name().to_string(),
179 kind,
180 name,
181 signature,
182 doc_comment,
183 body,
184 byte_range: start_byte..end_byte,
185 line_range: start_line..end_line,
186 })
187}
188
189fn classify_node_kind(ts_kind: &str, _lang: SupportedLanguage) -> ChunkKind {
191 match ts_kind {
192 "function_item" | "function_definition" | "function_declaration" | "arrow_function" => {
194 ChunkKind::Function
195 }
196 "method_definition" | "method_declaration" | "method" | "singleton_method"
198 | "constructor_declaration" => ChunkKind::Method,
199 "class_declaration" | "class_definition" | "class_specifier" => ChunkKind::Class,
201 "struct_item" | "struct_specifier" => ChunkKind::Struct,
203 "enum_item" | "enum_declaration" | "enum_specifier" => ChunkKind::Enum,
205 "interface_declaration" | "trait_item" => ChunkKind::Interface,
207 "mod_item" | "namespace_definition" | "module" => ChunkKind::Module,
209 "impl_item" => ChunkKind::Module,
211 _ => ChunkKind::Block,
213 }
214}
215
216fn extract_node_name(node: &Node, source: &str) -> Option<String> {
218 for field_name in &["name", "declarator"] {
220 if let Some(name_node) = node.child_by_field_name(field_name) {
221 let name = &source[name_node.start_byte()..name_node.end_byte()];
222 return Some(name.to_string());
223 }
224 }
225
226 let mut cursor = node.walk();
228 if cursor.goto_first_child() {
229 loop {
230 let child = cursor.node();
231 if child.kind() == "identifier" || child.kind() == "type_identifier" {
232 let name = &source[child.start_byte()..child.end_byte()];
233 return Some(name.to_string());
234 }
235 if !cursor.goto_next_sibling() {
236 break;
237 }
238 }
239 }
240
241 None
242}
243
244fn extract_signature(node: &Node, source: &str, _lang: SupportedLanguage) -> Option<String> {
246 let body = &source[node.start_byte()..node.end_byte()];
247
248 if let Some(pos) = body.find('{') {
250 let sig = body[..pos].trim();
251 if !sig.is_empty() {
252 return Some(sig.to_string());
253 }
254 }
255
256 if let Some(pos) = body.find(':') {
258 let before_colon = &body[..pos];
260 if before_colon.contains("def ") || before_colon.contains("class ") {
261 let sig = body[..=pos].trim();
262 if !sig.is_empty() {
263 return Some(sig.to_string());
264 }
265 }
266 }
267
268 let first_line = body.lines().next().map(|l| l.trim().to_string());
270 first_line.filter(|l| !l.is_empty())
271}
272
273fn extract_doc_comment(
275 node: &Node,
276 source: &str,
277 _node_start_line: usize,
278) -> Option<String> {
279 let mut prev = node.prev_sibling();
281 let mut comments = Vec::new();
282
283 while let Some(sibling) = prev {
284 let kind = sibling.kind();
285 if kind == "line_comment" || kind == "comment" || kind == "block_comment" {
286 let text = &source[sibling.start_byte()..sibling.end_byte()];
287 comments.push(text.to_string());
288 prev = sibling.prev_sibling();
289 } else {
290 break;
291 }
292 }
293
294 if comments.is_empty() {
295 return None;
296 }
297
298 comments.reverse();
300 let combined = comments.join("\n");
301
302 let cleaned: String = combined
304 .lines()
305 .map(|line| {
306 let trimmed = line.trim();
307 if let Some(stripped) = trimmed.strip_prefix("///") {
308 stripped.trim().to_string()
309 } else if let Some(stripped) = trimmed.strip_prefix("//!") {
310 stripped.trim().to_string()
311 } else if let Some(stripped) = trimmed.strip_prefix("//") {
312 stripped.trim().to_string()
313 } else if let Some(stripped) = trimmed.strip_prefix('#') {
314 stripped.trim().to_string()
315 } else {
316 trimmed.to_string()
317 }
318 })
319 .collect::<Vec<_>>()
320 .join("\n");
321
322 if cleaned.trim().is_empty() {
323 None
324 } else {
325 Some(cleaned)
326 }
327}
328
329fn fallback_line_chunks(
331 file_path: &Path,
332 source: &str,
333 lang: SupportedLanguage,
334) -> Vec<CodeChunk> {
335 let lines: Vec<&str> = source.lines().collect();
336 let total_lines = lines.len();
337
338 if total_lines == 0 {
339 return Vec::new();
340 }
341
342 let mut chunks = Vec::new();
343 let mut offset = 0;
344
345 for chunk_start in (0..total_lines).step_by(FALLBACK_CHUNK_SIZE) {
346 let chunk_end = (chunk_start + FALLBACK_CHUNK_SIZE).min(total_lines);
347 let chunk_lines = &lines[chunk_start..chunk_end];
348 let body = chunk_lines.join("\n");
349 let byte_start = offset;
350 let byte_end = offset + body.len();
351 offset = byte_end + 1; chunks.push(CodeChunk {
354 id: next_chunk_id(),
355 file_path: file_path.to_path_buf(),
356 language: lang.name().to_string(),
357 kind: ChunkKind::Block,
358 name: Some(format!(
359 "{}:L{}-L{}",
360 file_path.file_name().unwrap_or_default().to_string_lossy(),
361 chunk_start + 1,
362 chunk_end
363 )),
364 signature: None,
365 doc_comment: None,
366 body,
367 byte_range: byte_start..byte_end,
368 line_range: chunk_start..chunk_end,
369 });
370 }
371
372 chunks
373}
374
375pub fn chunk_file_from_path(path: &Path) -> Result<Option<ParseResult>, ParserError> {
377 let lang = match SupportedLanguage::from_path(path) {
378 Some(l) => l,
379 None => return Ok(None), };
381
382 let source = std::fs::read_to_string(path).map_err(ParserError::Io)?;
383
384 if crate::scanner::filter::is_binary_content(source.as_bytes()) {
386 return Ok(None);
387 }
388
389 let result = chunk_file(path, &source, lang)?;
390 Ok(Some(result))
391}
392
#[cfg(test)]
mod tests {
    use super::*;

    // Structured chunking: a Rust file should yield function/struct/impl
    // chunks with name, signature and doc-comment metadata attached.
    #[test]
    fn test_chunk_rust_file() {
        let source = r#"
/// A greeting function.
fn greet(name: &str) -> String {
    format!("Hello, {}!", name)
}

/// A struct.
struct User {
    name: String,
    age: u32,
}

impl User {
    fn new(name: String, age: u32) -> Self {
        Self { name, age }
    }

    fn display(&self) -> String {
        format!("{} ({})", self.name, self.age)
    }
}
"#;
        let result = chunk_file(
            Path::new("test.rs"),
            source,
            SupportedLanguage::Rust,
        )
        .unwrap();

        assert_eq!(result.language, "rust");
        assert!(!result.chunks.is_empty(), "Should find at least some chunks");

        // The free function should be discoverable by name...
        let greet = result.chunks.iter().find(|c| c.name.as_deref() == Some("greet"));
        assert!(greet.is_some(), "Should find greet function");

        // ...and carry kind, doc-comment and signature metadata.
        let greet = greet.unwrap();
        assert_eq!(greet.kind, ChunkKind::Function);
        assert!(greet.doc_comment.is_some(), "Should extract doc comment");
        assert!(greet.signature.is_some(), "Should extract signature");
    }

    // Python: classes and defs should produce at least one chunk.
    #[test]
    fn test_chunk_python_file() {
        let source = r#"
class Calculator:
    """A simple calculator."""

    def add(self, a: int, b: int) -> int:
        """Add two numbers."""
        return a + b

    def subtract(self, a: int, b: int) -> int:
        return a - b

def standalone_function(x: str) -> bool:
    return len(x) > 0
"#;
        let result = chunk_file(
            Path::new("calc.py"),
            source,
            SupportedLanguage::Python,
        )
        .unwrap();

        assert_eq!(result.language, "python");
        assert!(!result.chunks.is_empty());
    }

    // JavaScript: top-level functions and classes should produce chunks.
    #[test]
    fn test_chunk_javascript_file() {
        let source = r#"
function fetchData(url) {
    return fetch(url).then(r => r.json());
}

class EventEmitter {
    constructor() {
        this.listeners = {};
    }

    on(event, callback) {
        this.listeners[event] = callback;
    }
}
"#;
        let result = chunk_file(
            Path::new("app.js"),
            source,
            SupportedLanguage::JavaScript,
        )
        .unwrap();

        assert_eq!(result.language, "javascript");
        assert!(!result.chunks.is_empty());
    }

    // A bash script with no matching structural nodes must still yield
    // chunks via the fallback path rather than an empty result.
    #[test]
    fn test_fallback_chunking() {
        let source = "#!/bin/bash\necho 'hello'\necho 'world'\n".repeat(30);
        let result = chunk_file(
            Path::new("script.sh"),
            &source,
            SupportedLanguage::Bash,
        )
        .unwrap();

        assert!(!result.chunks.is_empty());
    }
}