Skip to main content

seekr_code/parser/
chunker.rs

1//! Semantic code chunker.
2//!
3//! Traverses AST nodes to extract semantic code chunks (functions, classes,
4//! methods, structs, etc.) from parsed source files.
5
6use std::path::Path;
7use std::sync::atomic::{AtomicU64, Ordering};
8
9use tree_sitter::Node;
10
11use crate::error::ParserError;
12use crate::parser::treesitter::SupportedLanguage;
13use crate::parser::{ChunkKind, CodeChunk, ParseResult};
14
/// Global chunk ID counter (monotonically increasing, process-wide).
/// Starts at 1 so that 0 can never appear as a valid chunk ID.
static CHUNK_ID_COUNTER: AtomicU64 = AtomicU64::new(1);

/// Generate a new unique chunk ID.
///
/// `Relaxed` ordering is sufficient here: IDs only need to be unique,
/// not ordered relative to any other memory operation.
fn next_chunk_id() -> u64 {
    CHUNK_ID_COUNTER.fetch_add(1, Ordering::Relaxed)
}
22
/// Minimum number of lines for a chunk to be considered meaningful.
/// AST matches smaller than this (e.g. one-line stubs) are discarded.
const MIN_CHUNK_LINES: usize = 2;

/// Maximum number of lines per chunk for fallback line-based chunking.
/// Whole-file chunks for non-code languages are bounded by twice this value.
const FALLBACK_CHUNK_SIZE: usize = 50;
28
29/// Parse a source file and extract code chunks.
30pub fn chunk_file(
31    path: &Path,
32    source: &str,
33    lang: SupportedLanguage,
34) -> Result<ParseResult, ParserError> {
35    let tree = crate::parser::treesitter::parse_source(source, lang)?;
36    let root = tree.root_node();
37
38    let chunk_kinds = lang.chunk_node_kinds();
39
40    let mut chunks = Vec::new();
41
42    if chunk_kinds.is_empty() {
43        // For non-code languages (JSON, TOML, etc.), create a single chunk
44        // for the entire file if it's not too large
45        if source.lines().count() <= FALLBACK_CHUNK_SIZE * 2 {
46            chunks.push(CodeChunk {
47                id: next_chunk_id(),
48                file_path: path.to_path_buf(),
49                language: lang.name().to_string(),
50                kind: ChunkKind::Block,
51                name: path.file_name().and_then(|f| f.to_str()).map(String::from),
52                signature: None,
53                doc_comment: None,
54                body: source.to_string(),
55                byte_range: 0..source.len(),
56                line_range: 0..source.lines().count(),
57            });
58        }
59    } else {
60        // Walk the AST and extract chunks for matching node kinds
61        extract_chunks_recursive(&root, source, path, lang, chunk_kinds, &mut chunks);
62
63        // If no chunks were found via AST, fall back to line-based chunking
64        if chunks.is_empty() {
65            chunks = fallback_line_chunks(path, source, lang);
66        }
67    }
68
69    Ok(ParseResult {
70        chunks,
71        language: lang.name().to_string(),
72    })
73}
74
75/// Recursively walk the AST and extract chunks for matching node kinds.
76fn extract_chunks_recursive(
77    node: &Node,
78    source: &str,
79    file_path: &Path,
80    lang: SupportedLanguage,
81    chunk_kinds: &[&str],
82    chunks: &mut Vec<CodeChunk>,
83) {
84    let kind = node.kind();
85
86    if chunk_kinds.contains(&kind) {
87        if let Some(chunk) = node_to_chunk(node, source, file_path, lang) {
88            // Only add chunks that are meaningful (not too small)
89            let line_count = chunk.line_range.end - chunk.line_range.start;
90            if line_count >= MIN_CHUNK_LINES {
91                chunks.push(chunk);
92            }
93        }
94        // Don't recurse into matched nodes to avoid nested duplicates
95        // (e.g., methods inside a class that's already extracted)
96        // We DO want nested chunks for impl/class blocks though
97        if should_recurse_into(kind) {
98            let mut cursor = node.walk();
99            if cursor.goto_first_child() {
100                loop {
101                    let child = cursor.node();
102                    extract_chunks_recursive(&child, source, file_path, lang, chunk_kinds, chunks);
103                    if !cursor.goto_next_sibling() {
104                        break;
105                    }
106                }
107            }
108        }
109    } else {
110        // Continue searching in children
111        let mut cursor = node.walk();
112        if cursor.goto_first_child() {
113            loop {
114                let child = cursor.node();
115                extract_chunks_recursive(&child, source, file_path, lang, chunk_kinds, chunks);
116                if !cursor.goto_next_sibling() {
117                    break;
118                }
119            }
120        }
121    }
122}
123
/// Report whether a matched node is a container whose children may
/// themselves be chunkable (e.g. methods inside a class, functions
/// inside a Rust `impl` block or module).
fn should_recurse_into(kind: &str) -> bool {
    const CONTAINER_KINDS: [&str; 10] = [
        "impl_item",
        "class_declaration",
        "class_definition",
        "class_specifier",
        "interface_declaration",
        "namespace_definition",
        "module",
        "mod_item",
        "export_statement",
        "decorated_definition",
    ];
    CONTAINER_KINDS.contains(&kind)
}
140
141/// Convert a tree-sitter Node to a CodeChunk.
142fn node_to_chunk(
143    node: &Node,
144    source: &str,
145    file_path: &Path,
146    lang: SupportedLanguage,
147) -> Option<CodeChunk> {
148    let start_byte = node.start_byte();
149    let end_byte = node.end_byte();
150
151    if end_byte <= start_byte || end_byte > source.len() {
152        return None;
153    }
154
155    let body = source[start_byte..end_byte].to_string();
156    let start_line = node.start_position().row;
157    let end_line = node.end_position().row + 1; // exclusive
158
159    let kind = classify_node_kind(node.kind(), lang);
160    let name = extract_node_name(node, source);
161    let signature = extract_signature(node, source, lang);
162    let doc_comment = extract_doc_comment(node, source, start_line);
163
164    Some(CodeChunk {
165        id: next_chunk_id(),
166        file_path: file_path.to_path_buf(),
167        language: lang.name().to_string(),
168        kind,
169        name,
170        signature,
171        doc_comment,
172        body,
173        byte_range: start_byte..end_byte,
174        line_range: start_line..end_line,
175    })
176}
177
178/// Classify a tree-sitter node kind into a ChunkKind.
179fn classify_node_kind(ts_kind: &str, _lang: SupportedLanguage) -> ChunkKind {
180    match ts_kind {
181        // Functions
182        "function_item" | "function_definition" | "function_declaration" | "arrow_function" => {
183            ChunkKind::Function
184        }
185        // Methods
186        "method_definition"
187        | "method_declaration"
188        | "method"
189        | "singleton_method"
190        | "constructor_declaration" => ChunkKind::Method,
191        // Classes
192        "class_declaration" | "class_definition" | "class_specifier" => ChunkKind::Class,
193        // Structs
194        "struct_item" | "struct_specifier" => ChunkKind::Struct,
195        // Enums
196        "enum_item" | "enum_declaration" | "enum_specifier" => ChunkKind::Enum,
197        // Interfaces / Traits
198        "interface_declaration" | "trait_item" => ChunkKind::Interface,
199        // Modules / Namespaces
200        "mod_item" | "namespace_definition" | "module" => ChunkKind::Module,
201        // Impl blocks (Rust) → treat as Module-level grouping
202        "impl_item" => ChunkKind::Module,
203        // Everything else
204        _ => ChunkKind::Block,
205    }
206}
207
208/// Extract the name of a node (e.g., function name, class name).
209fn extract_node_name(node: &Node, source: &str) -> Option<String> {
210    // Try common field names for the "name" of a construct
211    for field_name in &["name", "declarator"] {
212        if let Some(name_node) = node.child_by_field_name(field_name) {
213            let name = &source[name_node.start_byte()..name_node.end_byte()];
214            return Some(name.to_string());
215        }
216    }
217
218    // For some languages, look at the first named child of specific type
219    let mut cursor = node.walk();
220    if cursor.goto_first_child() {
221        loop {
222            let child = cursor.node();
223            if child.kind() == "identifier" || child.kind() == "type_identifier" {
224                let name = &source[child.start_byte()..child.end_byte()];
225                return Some(name.to_string());
226            }
227            if !cursor.goto_next_sibling() {
228                break;
229            }
230        }
231    }
232
233    None
234}
235
236/// Extract a function/method signature (first line or up to the body).
237fn extract_signature(node: &Node, source: &str, _lang: SupportedLanguage) -> Option<String> {
238    let body = &source[node.start_byte()..node.end_byte()];
239
240    // Find the first `{` or `:` (Python) to extract just the signature
241    if let Some(pos) = body.find('{') {
242        let sig = body[..pos].trim();
243        if !sig.is_empty() {
244            return Some(sig.to_string());
245        }
246    }
247
248    // For Python-style (colon-based blocks)
249    if let Some(pos) = body.find(':') {
250        // Make sure it's a function/class colon, not a type annotation colon
251        let before_colon = &body[..pos];
252        if before_colon.contains("def ") || before_colon.contains("class ") {
253            let sig = body[..=pos].trim();
254            if !sig.is_empty() {
255                return Some(sig.to_string());
256            }
257        }
258    }
259
260    // Fallback: first line
261    let first_line = body.lines().next().map(|l| l.trim().to_string());
262    first_line.filter(|l| !l.is_empty())
263}
264
265/// Extract documentation comment immediately before a node.
266fn extract_doc_comment(node: &Node, source: &str, _node_start_line: usize) -> Option<String> {
267    // Look at previous siblings for comment nodes
268    let mut prev = node.prev_sibling();
269    let mut comments = Vec::new();
270
271    while let Some(sibling) = prev {
272        let kind = sibling.kind();
273        if kind == "line_comment" || kind == "comment" || kind == "block_comment" {
274            let text = &source[sibling.start_byte()..sibling.end_byte()];
275            comments.push(text.to_string());
276            prev = sibling.prev_sibling();
277        } else {
278            break;
279        }
280    }
281
282    if comments.is_empty() {
283        return None;
284    }
285
286    // Reverse since we collected them backwards
287    comments.reverse();
288    let combined = comments.join("\n");
289
290    // Clean up common comment prefixes
291    let cleaned: String = combined
292        .lines()
293        .map(|line| {
294            let trimmed = line.trim();
295            if let Some(stripped) = trimmed.strip_prefix("///") {
296                stripped.trim().to_string()
297            } else if let Some(stripped) = trimmed.strip_prefix("//!") {
298                stripped.trim().to_string()
299            } else if let Some(stripped) = trimmed.strip_prefix("//") {
300                stripped.trim().to_string()
301            } else if let Some(stripped) = trimmed.strip_prefix('#') {
302                stripped.trim().to_string()
303            } else {
304                trimmed.to_string()
305            }
306        })
307        .collect::<Vec<_>>()
308        .join("\n");
309
310    if cleaned.trim().is_empty() {
311        None
312    } else {
313        Some(cleaned)
314    }
315}
316
317/// Fallback: split source into line-based chunks when AST chunking yields nothing.
318fn fallback_line_chunks(file_path: &Path, source: &str, lang: SupportedLanguage) -> Vec<CodeChunk> {
319    let lines: Vec<&str> = source.lines().collect();
320    let total_lines = lines.len();
321
322    if total_lines == 0 {
323        return Vec::new();
324    }
325
326    let mut chunks = Vec::new();
327    let mut offset = 0;
328
329    for chunk_start in (0..total_lines).step_by(FALLBACK_CHUNK_SIZE) {
330        let chunk_end = (chunk_start + FALLBACK_CHUNK_SIZE).min(total_lines);
331        let chunk_lines = &lines[chunk_start..chunk_end];
332        let body = chunk_lines.join("\n");
333        let byte_start = offset;
334        let byte_end = offset + body.len();
335        offset = byte_end + 1; // +1 for the newline between chunks
336
337        chunks.push(CodeChunk {
338            id: next_chunk_id(),
339            file_path: file_path.to_path_buf(),
340            language: lang.name().to_string(),
341            kind: ChunkKind::Block,
342            name: Some(format!(
343                "{}:L{}-L{}",
344                file_path.file_name().unwrap_or_default().to_string_lossy(),
345                chunk_start + 1,
346                chunk_end
347            )),
348            signature: None,
349            doc_comment: None,
350            body,
351            byte_range: byte_start..byte_end,
352            line_range: chunk_start..chunk_end,
353        });
354    }
355
356    chunks
357}
358
359/// Parse a file from disk: read it, detect language, and chunk it.
360pub fn chunk_file_from_path(path: &Path) -> Result<Option<ParseResult>, ParserError> {
361    let lang = match SupportedLanguage::from_path(path) {
362        Some(l) => l,
363        None => return Ok(None), // Unsupported language, skip
364    };
365
366    let source = std::fs::read_to_string(path).map_err(ParserError::Io)?;
367
368    // Skip binary files
369    if crate::scanner::filter::is_binary_content(source.as_bytes()) {
370        return Ok(None);
371    }
372
373    let result = chunk_file(path, &source, lang)?;
374    Ok(Some(result))
375}
376
#[cfg(test)]
mod tests {
    use super::*;

    // AST chunking on Rust source: free functions, structs, and methods
    // nested inside an impl block should all be discovered, and the doc
    // comment / signature of `greet` should be extracted.
    #[test]
    fn test_chunk_rust_file() {
        let source = r#"
/// A greeting function.
fn greet(name: &str) -> String {
    format!("Hello, {}!", name)
}

/// A struct.
struct User {
    name: String,
    age: u32,
}

impl User {
    fn new(name: String, age: u32) -> Self {
        Self { name, age }
    }

    fn display(&self) -> String {
        format!("{} ({})", self.name, self.age)
    }
}
"#;
        let result = chunk_file(Path::new("test.rs"), source, SupportedLanguage::Rust).unwrap();

        assert_eq!(result.language, "rust");
        assert!(
            !result.chunks.is_empty(),
            "Should find at least some chunks"
        );

        // Should find the greet function
        let greet = result
            .chunks
            .iter()
            .find(|c| c.name.as_deref() == Some("greet"));
        assert!(greet.is_some(), "Should find greet function");

        let greet = greet.unwrap();
        assert_eq!(greet.kind, ChunkKind::Function);
        assert!(greet.doc_comment.is_some(), "Should extract doc comment");
        assert!(greet.signature.is_some(), "Should extract signature");
    }

    // Python uses colon-terminated blocks rather than braces; chunking
    // should still find the class, its methods, and the top-level function.
    #[test]
    fn test_chunk_python_file() {
        let source = r#"
class Calculator:
    """A simple calculator."""

    def add(self, a: int, b: int) -> int:
        """Add two numbers."""
        return a + b

    def subtract(self, a: int, b: int) -> int:
        return a - b

def standalone_function(x: str) -> bool:
    return len(x) > 0
"#;
        let result = chunk_file(Path::new("calc.py"), source, SupportedLanguage::Python).unwrap();

        assert_eq!(result.language, "python");
        assert!(!result.chunks.is_empty());
    }

    // JavaScript: function declarations and class bodies (constructor +
    // method) should produce chunks.
    #[test]
    fn test_chunk_javascript_file() {
        let source = r#"
function fetchData(url) {
    return fetch(url).then(r => r.json());
}

class EventEmitter {
    constructor() {
        this.listeners = {};
    }

    on(event, callback) {
        this.listeners[event] = callback;
    }
}
"#;
        let result =
            chunk_file(Path::new("app.js"), source, SupportedLanguage::JavaScript).unwrap();

        assert_eq!(result.language, "javascript");
        assert!(!result.chunks.is_empty());
    }

    // Languages with no chunkable AST kinds take the whole-file path in
    // chunk_file; the 90-line script is under the 2x FALLBACK_CHUNK_SIZE cap.
    #[test]
    fn test_fallback_chunking() {
        // Create a file with no recognizable AST patterns
        let source = "#!/bin/bash\necho 'hello'\necho 'world'\n".repeat(30);
        let result = chunk_file(Path::new("script.sh"), &source, SupportedLanguage::Bash).unwrap();

        // Bash has empty chunk_node_kinds, so should get a single block chunk
        assert!(!result.chunks.is_empty());
    }
}