Skip to main content

seekr_code/parser/
chunker.rs

1//! Semantic code chunker.
2//!
3//! Traverses AST nodes to extract semantic code chunks (functions, classes,
4//! methods, structs, etc.) from parsed source files.
5
6use std::path::Path;
7use std::sync::atomic::{AtomicU64, Ordering};
8
9use tree_sitter::Node;
10
11use crate::error::ParserError;
12use crate::parser::treesitter::SupportedLanguage;
13use crate::parser::{ChunkKind, CodeChunk, ParseResult};
14
/// Process-wide monotonically increasing chunk ID source.
static CHUNK_ID_COUNTER: AtomicU64 = AtomicU64::new(1);

/// Hand out the next unused chunk ID.
///
/// `Relaxed` ordering is sufficient here: callers only need uniqueness,
/// not any ordering relationship with other memory operations.
fn next_chunk_id() -> u64 {
    CHUNK_ID_COUNTER.fetch_add(1, Ordering::Relaxed)
}

/// Chunks spanning fewer lines than this are discarded as noise.
const MIN_CHUNK_LINES: usize = 2;

/// Lines per chunk when falling back to plain line-based splitting.
const FALLBACK_CHUNK_SIZE: usize = 50;
28
29/// Parse a source file and extract code chunks.
30pub fn chunk_file(
31    path: &Path,
32    source: &str,
33    lang: SupportedLanguage,
34) -> Result<ParseResult, ParserError> {
35    let tree = crate::parser::treesitter::parse_source(source, lang)?;
36    let root = tree.root_node();
37
38    let chunk_kinds = lang.chunk_node_kinds();
39
40    let mut chunks = Vec::new();
41
42    if chunk_kinds.is_empty() {
43        // For non-code languages (JSON, TOML, etc.), create a single chunk
44        // for the entire file if it's not too large
45        if source.lines().count() <= FALLBACK_CHUNK_SIZE * 2 {
46            chunks.push(CodeChunk {
47                id: next_chunk_id(),
48                file_path: path.to_path_buf(),
49                language: lang.name().to_string(),
50                kind: ChunkKind::Block,
51                name: path.file_name().and_then(|f| f.to_str()).map(String::from),
52                signature: None,
53                doc_comment: None,
54                body: source.to_string(),
55                byte_range: 0..source.len(),
56                line_range: 0..source.lines().count(),
57            });
58        }
59    } else {
60        // Walk the AST and extract chunks for matching node kinds
61        extract_chunks_recursive(
62            &root,
63            source,
64            path,
65            lang,
66            chunk_kinds,
67            &mut chunks,
68        );
69
70        // If no chunks were found via AST, fall back to line-based chunking
71        if chunks.is_empty() {
72            chunks = fallback_line_chunks(path, source, lang);
73        }
74    }
75
76    Ok(ParseResult {
77        chunks,
78        language: lang.name().to_string(),
79    })
80}
81
82/// Recursively walk the AST and extract chunks for matching node kinds.
83fn extract_chunks_recursive(
84    node: &Node,
85    source: &str,
86    file_path: &Path,
87    lang: SupportedLanguage,
88    chunk_kinds: &[&str],
89    chunks: &mut Vec<CodeChunk>,
90) {
91    let kind = node.kind();
92
93    if chunk_kinds.contains(&kind) {
94        if let Some(chunk) = node_to_chunk(node, source, file_path, lang) {
95            // Only add chunks that are meaningful (not too small)
96            let line_count = chunk.line_range.end - chunk.line_range.start;
97            if line_count >= MIN_CHUNK_LINES {
98                chunks.push(chunk);
99            }
100        }
101        // Don't recurse into matched nodes to avoid nested duplicates
102        // (e.g., methods inside a class that's already extracted)
103        // We DO want nested chunks for impl/class blocks though
104        if should_recurse_into(kind) {
105            let mut cursor = node.walk();
106            if cursor.goto_first_child() {
107                loop {
108                    let child = cursor.node();
109                    extract_chunks_recursive(
110                        &child, source, file_path, lang, chunk_kinds, chunks,
111                    );
112                    if !cursor.goto_next_sibling() {
113                        break;
114                    }
115                }
116            }
117        }
118    } else {
119        // Continue searching in children
120        let mut cursor = node.walk();
121        if cursor.goto_first_child() {
122            loop {
123                let child = cursor.node();
124                extract_chunks_recursive(
125                    &child, source, file_path, lang, chunk_kinds, chunks,
126                );
127                if !cursor.goto_next_sibling() {
128                    break;
129                }
130            }
131        }
132    }
133}
134
/// Report whether a matched node is a container whose children may hold
/// further chunkable definitions (e.g. methods inside a class or impl).
fn should_recurse_into(kind: &str) -> bool {
    /// Node kinds treated as containers across all supported grammars.
    const CONTAINER_KINDS: &[&str] = &[
        "impl_item",
        "class_declaration",
        "class_definition",
        "class_specifier",
        "interface_declaration",
        "namespace_definition",
        "module",
        "mod_item",
        "export_statement",
        "decorated_definition",
    ];
    CONTAINER_KINDS.contains(&kind)
}
151
152/// Convert a tree-sitter Node to a CodeChunk.
153fn node_to_chunk(
154    node: &Node,
155    source: &str,
156    file_path: &Path,
157    lang: SupportedLanguage,
158) -> Option<CodeChunk> {
159    let start_byte = node.start_byte();
160    let end_byte = node.end_byte();
161
162    if end_byte <= start_byte || end_byte > source.len() {
163        return None;
164    }
165
166    let body = source[start_byte..end_byte].to_string();
167    let start_line = node.start_position().row;
168    let end_line = node.end_position().row + 1; // exclusive
169
170    let kind = classify_node_kind(node.kind(), lang);
171    let name = extract_node_name(node, source);
172    let signature = extract_signature(node, source, lang);
173    let doc_comment = extract_doc_comment(node, source, start_line);
174
175    Some(CodeChunk {
176        id: next_chunk_id(),
177        file_path: file_path.to_path_buf(),
178        language: lang.name().to_string(),
179        kind,
180        name,
181        signature,
182        doc_comment,
183        body,
184        byte_range: start_byte..end_byte,
185        line_range: start_line..end_line,
186    })
187}
188
189/// Classify a tree-sitter node kind into a ChunkKind.
190fn classify_node_kind(ts_kind: &str, _lang: SupportedLanguage) -> ChunkKind {
191    match ts_kind {
192        // Functions
193        "function_item" | "function_definition" | "function_declaration" | "arrow_function" => {
194            ChunkKind::Function
195        }
196        // Methods
197        "method_definition" | "method_declaration" | "method" | "singleton_method"
198        | "constructor_declaration" => ChunkKind::Method,
199        // Classes
200        "class_declaration" | "class_definition" | "class_specifier" => ChunkKind::Class,
201        // Structs
202        "struct_item" | "struct_specifier" => ChunkKind::Struct,
203        // Enums
204        "enum_item" | "enum_declaration" | "enum_specifier" => ChunkKind::Enum,
205        // Interfaces / Traits
206        "interface_declaration" | "trait_item" => ChunkKind::Interface,
207        // Modules / Namespaces
208        "mod_item" | "namespace_definition" | "module" => ChunkKind::Module,
209        // Impl blocks (Rust) → treat as Module-level grouping
210        "impl_item" => ChunkKind::Module,
211        // Everything else
212        _ => ChunkKind::Block,
213    }
214}
215
216/// Extract the name of a node (e.g., function name, class name).
217fn extract_node_name(node: &Node, source: &str) -> Option<String> {
218    // Try common field names for the "name" of a construct
219    for field_name in &["name", "declarator"] {
220        if let Some(name_node) = node.child_by_field_name(field_name) {
221            let name = &source[name_node.start_byte()..name_node.end_byte()];
222            return Some(name.to_string());
223        }
224    }
225
226    // For some languages, look at the first named child of specific type
227    let mut cursor = node.walk();
228    if cursor.goto_first_child() {
229        loop {
230            let child = cursor.node();
231            if child.kind() == "identifier" || child.kind() == "type_identifier" {
232                let name = &source[child.start_byte()..child.end_byte()];
233                return Some(name.to_string());
234            }
235            if !cursor.goto_next_sibling() {
236                break;
237            }
238        }
239    }
240
241    None
242}
243
244/// Extract a function/method signature (first line or up to the body).
245fn extract_signature(node: &Node, source: &str, _lang: SupportedLanguage) -> Option<String> {
246    let body = &source[node.start_byte()..node.end_byte()];
247
248    // Find the first `{` or `:` (Python) to extract just the signature
249    if let Some(pos) = body.find('{') {
250        let sig = body[..pos].trim();
251        if !sig.is_empty() {
252            return Some(sig.to_string());
253        }
254    }
255
256    // For Python-style (colon-based blocks)
257    if let Some(pos) = body.find(':') {
258        // Make sure it's a function/class colon, not a type annotation colon
259        let before_colon = &body[..pos];
260        if before_colon.contains("def ") || before_colon.contains("class ") {
261            let sig = body[..=pos].trim();
262            if !sig.is_empty() {
263                return Some(sig.to_string());
264            }
265        }
266    }
267
268    // Fallback: first line
269    let first_line = body.lines().next().map(|l| l.trim().to_string());
270    first_line.filter(|l| !l.is_empty())
271}
272
273/// Extract documentation comment immediately before a node.
274fn extract_doc_comment(
275    node: &Node,
276    source: &str,
277    _node_start_line: usize,
278) -> Option<String> {
279    // Look at previous siblings for comment nodes
280    let mut prev = node.prev_sibling();
281    let mut comments = Vec::new();
282
283    while let Some(sibling) = prev {
284        let kind = sibling.kind();
285        if kind == "line_comment" || kind == "comment" || kind == "block_comment" {
286            let text = &source[sibling.start_byte()..sibling.end_byte()];
287            comments.push(text.to_string());
288            prev = sibling.prev_sibling();
289        } else {
290            break;
291        }
292    }
293
294    if comments.is_empty() {
295        return None;
296    }
297
298    // Reverse since we collected them backwards
299    comments.reverse();
300    let combined = comments.join("\n");
301
302    // Clean up common comment prefixes
303    let cleaned: String = combined
304        .lines()
305        .map(|line| {
306            let trimmed = line.trim();
307            if let Some(stripped) = trimmed.strip_prefix("///") {
308                stripped.trim().to_string()
309            } else if let Some(stripped) = trimmed.strip_prefix("//!") {
310                stripped.trim().to_string()
311            } else if let Some(stripped) = trimmed.strip_prefix("//") {
312                stripped.trim().to_string()
313            } else if let Some(stripped) = trimmed.strip_prefix('#') {
314                stripped.trim().to_string()
315            } else {
316                trimmed.to_string()
317            }
318        })
319        .collect::<Vec<_>>()
320        .join("\n");
321
322    if cleaned.trim().is_empty() {
323        None
324    } else {
325        Some(cleaned)
326    }
327}
328
329/// Fallback: split source into line-based chunks when AST chunking yields nothing.
330fn fallback_line_chunks(
331    file_path: &Path,
332    source: &str,
333    lang: SupportedLanguage,
334) -> Vec<CodeChunk> {
335    let lines: Vec<&str> = source.lines().collect();
336    let total_lines = lines.len();
337
338    if total_lines == 0 {
339        return Vec::new();
340    }
341
342    let mut chunks = Vec::new();
343    let mut offset = 0;
344
345    for chunk_start in (0..total_lines).step_by(FALLBACK_CHUNK_SIZE) {
346        let chunk_end = (chunk_start + FALLBACK_CHUNK_SIZE).min(total_lines);
347        let chunk_lines = &lines[chunk_start..chunk_end];
348        let body = chunk_lines.join("\n");
349        let byte_start = offset;
350        let byte_end = offset + body.len();
351        offset = byte_end + 1; // +1 for the newline between chunks
352
353        chunks.push(CodeChunk {
354            id: next_chunk_id(),
355            file_path: file_path.to_path_buf(),
356            language: lang.name().to_string(),
357            kind: ChunkKind::Block,
358            name: Some(format!(
359                "{}:L{}-L{}",
360                file_path.file_name().unwrap_or_default().to_string_lossy(),
361                chunk_start + 1,
362                chunk_end
363            )),
364            signature: None,
365            doc_comment: None,
366            body,
367            byte_range: byte_start..byte_end,
368            line_range: chunk_start..chunk_end,
369        });
370    }
371
372    chunks
373}
374
375/// Parse a file from disk: read it, detect language, and chunk it.
376pub fn chunk_file_from_path(path: &Path) -> Result<Option<ParseResult>, ParserError> {
377    let lang = match SupportedLanguage::from_path(path) {
378        Some(l) => l,
379        None => return Ok(None), // Unsupported language, skip
380    };
381
382    let source = std::fs::read_to_string(path).map_err(ParserError::Io)?;
383
384    // Skip binary files
385    if crate::scanner::filter::is_binary_content(source.as_bytes()) {
386        return Ok(None);
387    }
388
389    let result = chunk_file(path, &source, lang)?;
390    Ok(Some(result))
391}
392
#[cfg(test)]
mod tests {
    use super::*;

    // End-to-end AST chunking on Rust source: expects a chunk per top-level
    // definition, with the doc comment and signature attached to `greet`.
    #[test]
    fn test_chunk_rust_file() {
        let source = r#"
/// A greeting function.
fn greet(name: &str) -> String {
    format!("Hello, {}!", name)
}

/// A struct.
struct User {
    name: String,
    age: u32,
}

impl User {
    fn new(name: String, age: u32) -> Self {
        Self { name, age }
    }

    fn display(&self) -> String {
        format!("{} ({})", self.name, self.age)
    }
}
"#;
        let result = chunk_file(
            Path::new("test.rs"),
            source,
            SupportedLanguage::Rust,
        )
        .unwrap();

        assert_eq!(result.language, "rust");
        assert!(!result.chunks.is_empty(), "Should find at least some chunks");

        // Should find the greet function
        let greet = result.chunks.iter().find(|c| c.name.as_deref() == Some("greet"));
        assert!(greet.is_some(), "Should find greet function");

        let greet = greet.unwrap();
        assert_eq!(greet.kind, ChunkKind::Function);
        assert!(greet.doc_comment.is_some(), "Should extract doc comment");
        assert!(greet.signature.is_some(), "Should extract signature");
    }

    // Smoke test for Python: colon-delimited classes and defs should yield
    // chunks; only presence is asserted, not exact shape.
    #[test]
    fn test_chunk_python_file() {
        let source = r#"
class Calculator:
    """A simple calculator."""

    def add(self, a: int, b: int) -> int:
        """Add two numbers."""
        return a + b

    def subtract(self, a: int, b: int) -> int:
        return a - b

def standalone_function(x: str) -> bool:
    return len(x) > 0
"#;
        let result = chunk_file(
            Path::new("calc.py"),
            source,
            SupportedLanguage::Python,
        )
        .unwrap();

        assert_eq!(result.language, "python");
        assert!(!result.chunks.is_empty());
    }

    // Smoke test for JavaScript: functions and classes (including methods)
    // should produce at least one chunk.
    #[test]
    fn test_chunk_javascript_file() {
        let source = r#"
function fetchData(url) {
    return fetch(url).then(r => r.json());
}

class EventEmitter {
    constructor() {
        this.listeners = {};
    }

    on(event, callback) {
        this.listeners[event] = callback;
    }
}
"#;
        let result = chunk_file(
            Path::new("app.js"),
            source,
            SupportedLanguage::JavaScript,
        )
        .unwrap();

        assert_eq!(result.language, "javascript");
        assert!(!result.chunks.is_empty());
    }

    // Exercises the non-AST path: 90 generated lines, under the
    // FALLBACK_CHUNK_SIZE * 2 whole-file threshold.
    #[test]
    fn test_fallback_chunking() {
        // Create a file with no recognizable AST patterns
        let source = "#!/bin/bash\necho 'hello'\necho 'world'\n".repeat(30);
        let result = chunk_file(
            Path::new("script.sh"),
            &source,
            SupportedLanguage::Bash,
        )
        .unwrap();

        // Bash has empty chunk_node_kinds, so should get a single block chunk
        assert!(!result.chunks.is_empty());
    }
}