scribe_selection/
ast_parser.rs

1//! Tree-sitter based AST parsing for accurate code analysis
2//!
3//! This module replaces regex-based parsing with proper syntax-aware analysis
4//! using tree-sitter parsers for multiple programming languages.
5
6use scribe_core::tokenization::{utils as token_utils, TokenCounter};
7use scribe_core::{Result, ScribeError};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use tree_sitter::{Language, Node, Parser, Query, QueryCursor, Tree};
11
/// Supported programming languages for AST parsing.
///
/// Each variant has a matching tree-sitter grammar (see
/// `AstLanguage::tree_sitter_language`) and a set of recognised file
/// extensions (see `AstLanguage::from_extension`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum AstLanguage {
    /// Python sources (`py`, `pyi`, `pyw`).
    Python,
    /// JavaScript sources (`js`, `mjs`, `cjs`).
    JavaScript,
    /// TypeScript sources (`ts`, `mts`, `cts`).
    TypeScript,
    /// Go sources (`go`).
    Go,
    /// Rust sources (`rs`).
    Rust,
}
21
22impl AstLanguage {
23    /// Get the tree-sitter language for this language
24    pub fn tree_sitter_language(&self) -> Language {
25        match self {
26            AstLanguage::Python => tree_sitter_python::language(),
27            AstLanguage::JavaScript => tree_sitter_javascript::language(),
28            AstLanguage::TypeScript => tree_sitter_typescript::language_typescript(),
29            AstLanguage::Go => tree_sitter_go::language(),
30            AstLanguage::Rust => tree_sitter_rust::language(),
31        }
32    }
33
34    /// Detect language from file extension
35    pub fn from_extension(ext: &str) -> Option<Self> {
36        match ext.to_lowercase().as_str() {
37            "py" | "pyi" | "pyw" => Some(AstLanguage::Python),
38            "js" | "mjs" | "cjs" => Some(AstLanguage::JavaScript),
39            "ts" | "mts" | "cts" => Some(AstLanguage::TypeScript),
40            "go" => Some(AstLanguage::Go),
41            "rs" => Some(AstLanguage::Rust),
42            _ => None,
43        }
44    }
45}
46
/// Import information extracted from the AST.
///
/// One value describes a single import/use statement found in a source file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AstImport {
    /// The module being imported
    pub module: String,
    /// Optional alias for the import
    pub alias: Option<String>,
    /// Specific items being imported (for from-imports)
    pub items: Vec<String>,
    /// Line number where the import appears
    pub line_number: usize,
    /// Whether this is a relative import
    pub is_relative: bool,
}
61
/// A parsed code chunk with semantic information.
///
/// Chunks are built from tree-sitter nodes captured by per-language queries;
/// the byte and line ranges refer to the source text that was parsed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AstChunk {
    /// The text content of this chunk
    pub content: String,
    /// Type of the chunk (function, class, import, etc.); this is the name of
    /// the query capture that produced the chunk
    pub chunk_type: String,
    /// Start line (1-indexed)
    pub start_line: usize,
    /// End line (1-indexed)
    pub end_line: usize,
    /// Start byte offset
    pub start_byte: usize,
    /// End byte offset
    pub end_byte: usize,
    /// Semantic importance score (0.0-1.0)
    pub importance_score: f64,
    /// Estimated token count
    pub estimated_tokens: usize,
    /// Dependencies (other chunks this depends on)
    pub dependencies: Vec<String>,
    /// Name/identifier of this chunk (if applicable)
    pub name: Option<String>,
    /// Whether this is publicly visible
    pub is_public: bool,
    /// Whether this has documentation
    pub has_documentation: bool,
}
90
/// Extracted signature information.
///
/// Describes one declaration (or import-like statement) matched by a
/// per-language signature query.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AstSignature {
    /// The signature text
    pub signature: String,
    /// Type of signature (function, class, interface, etc.)
    pub signature_type: String,
    /// Name/identifier
    pub name: String,
    /// Parameters (for functions/methods)
    pub parameters: Vec<String>,
    /// Return type (if available)
    pub return_type: Option<String>,
    /// Whether this is public/exported
    pub is_public: bool,
    /// Line number
    pub line: usize,
}
109
/// Tree-sitter based AST parser and analyzer.
///
/// Holds one pre-configured tree-sitter `Parser` per supported language so
/// that repeated parses do not need to set the grammar again.
pub struct AstParser {
    /// Configured parsers, keyed by the language they were set up for.
    parsers: HashMap<AstLanguage, Parser>,
}
114
115impl AstParser {
116    /// Create a new AST parser with support for all languages
117    pub fn new() -> Result<Self> {
118        let mut parsers = HashMap::new();
119
120        for language in [
121            AstLanguage::Python,
122            AstLanguage::JavaScript,
123            AstLanguage::TypeScript,
124            AstLanguage::Go,
125            AstLanguage::Rust,
126        ] {
127            let mut parser = Parser::new();
128            parser
129                .set_language(language.tree_sitter_language())
130                .map_err(|e| {
131                    ScribeError::parse(format!("Failed to set tree-sitter language: {}", e))
132                })?;
133            parsers.insert(language, parser);
134        }
135
136        Ok(Self { parsers })
137    }
138
139    /// Parse code into chunks using tree-sitter AST
140    pub fn parse_chunks(&mut self, content: &str, file_path: &str) -> Result<Vec<AstChunk>> {
141        let language = self.detect_language(file_path)?;
142        let parser = self
143            .parsers
144            .get_mut(&language)
145            .ok_or_else(|| ScribeError::parse(format!("No parser for language: {:?}", language)))?;
146
147        let tree = parser
148            .parse(content, None)
149            .ok_or_else(|| ScribeError::parse("Failed to parse source code".to_string()))?;
150
151        let chunks = match language {
152            AstLanguage::Python => self.parse_python_chunks(content, &tree)?,
153            AstLanguage::JavaScript => self.parse_javascript_chunks(content, &tree)?,
154            AstLanguage::TypeScript => self.parse_typescript_chunks(content, &tree)?,
155            AstLanguage::Go => self.parse_go_chunks(content, &tree)?,
156            AstLanguage::Rust => self.parse_rust_chunks(content, &tree)?,
157        };
158
159        Ok(chunks)
160    }
161
162    /// Extract signatures using tree-sitter AST
163    /// Extract imports from the given content using optimized tree-sitter traversal
164    pub fn extract_imports(&self, content: &str, language: AstLanguage) -> Result<Vec<AstImport>> {
165        // Create a fresh parser for this operation to avoid mutable borrow issues
166        let mut parser = Parser::new();
167        parser
168            .set_language(language.tree_sitter_language())
169            .map_err(|e| ScribeError::parse(format!("Failed to set language: {}", e)))?;
170
171        let tree = parser
172            .parse(content, None)
173            .ok_or_else(|| ScribeError::parse("Failed to parse content"))?;
174
175        let mut imports = Vec::new();
176
177        // Use TreeCursor for efficient traversal
178        let mut cursor = tree.walk();
179        self.extract_imports_with_cursor(&mut cursor, content, language, &mut imports)?;
180
181        Ok(imports)
182    }
183
184    /// Extract imports using TreeCursor for optimal performance
185    fn extract_imports_with_cursor(
186        &self,
187        cursor: &mut tree_sitter::TreeCursor,
188        content: &str,
189        language: AstLanguage,
190        imports: &mut Vec<AstImport>,
191    ) -> Result<()> {
192        let node = cursor.node();
193
194        // Fast filter: skip nodes that can't contain imports
195        if !self.node_can_contain_imports(node.kind()) {
196            return Ok(());
197        }
198
199        // Process current node if it's an import
200        if self.is_import_node(node.kind()) {
201            self.extract_import_from_node(node, content, language, imports)?;
202        }
203
204        // Traverse children using cursor (much faster than child(i) loops)
205        if cursor.goto_first_child() {
206            loop {
207                self.extract_imports_with_cursor(cursor, content, language, imports)?;
208                if !cursor.goto_next_sibling() {
209                    break;
210                }
211            }
212            cursor.goto_parent();
213        }
214
215        Ok(())
216    }
217
218    /// Check if a node type can contain imports (fast filter)
219    fn node_can_contain_imports(&self, kind: &str) -> bool {
220        matches!(
221            kind,
222            "import_statement"
223                | "import_from_statement"
224                | "use_declaration"
225                | "import_declaration"
226                | "import_spec"
227                | "source_file"
228                | "module"
229                | "program"
230                | "translation_unit"
231                | "block"
232                | "statement_block"
233        ) || kind.contains("import")
234            || kind.contains("use")
235    }
236
237    /// Check if a node is an import statement
238    fn is_import_node(&self, kind: &str) -> bool {
239        matches!(
240            kind,
241            "import_statement"
242                | "import_from_statement"
243                | "use_declaration"
244                | "import_declaration"
245                | "import_spec"
246        )
247    }
248
249    /// Extract import from a specific node (no recursion needed)
250    fn extract_import_from_node(
251        &self,
252        node: Node,
253        content: &str,
254        language: AstLanguage,
255        imports: &mut Vec<AstImport>,
256    ) -> Result<()> {
257        match language {
258            AstLanguage::Python => {
259                self.extract_python_import_node(node, content, imports)?;
260            }
261            AstLanguage::JavaScript | AstLanguage::TypeScript => {
262                self.extract_js_ts_import_node(node, content, imports)?;
263            }
264            AstLanguage::Go => {
265                self.extract_go_import_node(node, content, imports)?;
266            }
267            AstLanguage::Rust => {
268                self.extract_rust_import_node(node, content, imports)?;
269            }
270        }
271        Ok(())
272    }
273
274    pub fn extract_signatures(
275        &mut self,
276        content: &str,
277        file_path: &str,
278    ) -> Result<Vec<AstSignature>> {
279        let language = self.detect_language(file_path)?;
280        let parser = self
281            .parsers
282            .get_mut(&language)
283            .ok_or_else(|| ScribeError::parse(format!("No parser for language: {:?}", language)))?;
284
285        let tree = parser
286            .parse(content, None)
287            .ok_or_else(|| ScribeError::parse("Failed to parse source code".to_string()))?;
288
289        let signatures = match language {
290            AstLanguage::Python => self.extract_python_signatures(content, &tree)?,
291            AstLanguage::JavaScript => self.extract_javascript_signatures(content, &tree)?,
292            AstLanguage::TypeScript => self.extract_typescript_signatures(content, &tree)?,
293            AstLanguage::Go => self.extract_go_signatures(content, &tree)?,
294            AstLanguage::Rust => self.extract_rust_signatures(content, &tree)?,
295        };
296
297        Ok(signatures)
298    }
299
300    /// Detect language from file path
301    fn detect_language(&self, file_path: &str) -> Result<AstLanguage> {
302        let extension = std::path::Path::new(file_path)
303            .extension()
304            .and_then(|ext| ext.to_str())
305            .unwrap_or("");
306
307        AstLanguage::from_extension(extension)
308            .ok_or_else(|| ScribeError::parse(format!("Unsupported file extension: {}", extension)))
309    }
310
311    /// Parse Python code chunks using tree-sitter
312    fn parse_python_chunks(&self, content: &str, tree: &Tree) -> Result<Vec<AstChunk>> {
313        let mut chunks = Vec::new();
314        let root_node = tree.root_node();
315
316        // Query for Python constructs
317        let query_str = r#"
318            (import_statement) @import
319            (import_from_statement) @import_from
320            (function_definition) @function
321            (class_definition) @class
322            (assignment 
323                left: (identifier) @const_name
324                right: (_) @const_value
325                (#match? @const_name "^[A-Z_][A-Z0-9_]*$")
326            ) @constant
327        "#;
328
329        let query = Query::new(AstLanguage::Python.tree_sitter_language(), query_str)
330            .map_err(|e| ScribeError::parse(format!("Invalid Python query: {}", e)))?;
331
332        let mut cursor = QueryCursor::new();
333        let captures = cursor.matches(&query, root_node, content.as_bytes());
334
335        for match_ in captures {
336            for capture in match_.captures {
337                let node = capture.node;
338                let chunk_type = &query.capture_names()[capture.index as usize];
339
340                let chunk =
341                    self.create_chunk_from_node(content, node, chunk_type, &AstLanguage::Python)?;
342                chunks.push(chunk);
343            }
344        }
345
346        // Sort by start position
347        chunks.sort_by_key(|c| c.start_byte);
348        Ok(chunks)
349    }
350
351    /// Parse JavaScript code chunks using tree-sitter
352    fn parse_javascript_chunks(&self, content: &str, tree: &Tree) -> Result<Vec<AstChunk>> {
353        let mut chunks = Vec::new();
354        let root_node = tree.root_node();
355
356        let query_str = r#"
357            (import_statement) @import
358            (export_statement) @export
359            (function_declaration) @function
360            (arrow_function) @arrow_function
361            (class_declaration) @class
362            (interface_declaration) @interface
363            (type_alias_declaration) @type_alias
364            (variable_declaration
365                declarations: (variable_declarator
366                    name: (identifier) @const_name
367                    value: (_) @const_value
368                ) @const_declarator
369                (#match? @const_name "^[A-Z_][A-Z0-9_]*$")
370            ) @constant
371        "#;
372
373        let query = Query::new(AstLanguage::JavaScript.tree_sitter_language(), query_str)
374            .map_err(|e| ScribeError::parse(format!("Invalid JavaScript query: {}", e)))?;
375
376        let mut cursor = QueryCursor::new();
377        let captures = cursor.matches(&query, root_node, content.as_bytes());
378
379        for match_ in captures {
380            for capture in match_.captures {
381                let node = capture.node;
382                let chunk_type = &query.capture_names()[capture.index as usize];
383
384                let chunk = self.create_chunk_from_node(
385                    content,
386                    node,
387                    chunk_type,
388                    &AstLanguage::JavaScript,
389                )?;
390                chunks.push(chunk);
391            }
392        }
393
394        chunks.sort_by_key(|c| c.start_byte);
395        Ok(chunks)
396    }
397
398    /// Parse TypeScript code chunks using tree-sitter
399    fn parse_typescript_chunks(&self, content: &str, tree: &Tree) -> Result<Vec<AstChunk>> {
400        let mut chunks = Vec::new();
401        let root_node = tree.root_node();
402
403        let query_str = r#"
404            (import_statement) @import
405            (export_statement) @export
406            (function_declaration) @function
407            (arrow_function) @arrow_function
408            (class_declaration) @class
409            (interface_declaration) @interface
410            (type_alias_declaration) @type_alias
411            (enum_declaration) @enum
412            (module_declaration) @module
413            (variable_declaration
414                declarations: (variable_declarator
415                    name: (identifier) @const_name
416                    value: (_) @const_value
417                ) @const_declarator
418                (#match? @const_name "^[A-Z_][A-Z0-9_]*$")
419            ) @constant
420        "#;
421
422        let query = Query::new(AstLanguage::TypeScript.tree_sitter_language(), query_str)
423            .map_err(|e| ScribeError::parse(format!("Invalid TypeScript query: {}", e)))?;
424
425        let mut cursor = QueryCursor::new();
426        let captures = cursor.matches(&query, root_node, content.as_bytes());
427
428        for match_ in captures {
429            for capture in match_.captures {
430                let node = capture.node;
431                let chunk_type = &query.capture_names()[capture.index as usize];
432
433                let chunk = self.create_chunk_from_node(
434                    content,
435                    node,
436                    chunk_type,
437                    &AstLanguage::TypeScript,
438                )?;
439                chunks.push(chunk);
440            }
441        }
442
443        chunks.sort_by_key(|c| c.start_byte);
444        Ok(chunks)
445    }
446
447    /// Parse Go code chunks using tree-sitter
448    fn parse_go_chunks(&self, content: &str, tree: &Tree) -> Result<Vec<AstChunk>> {
449        let mut chunks = Vec::new();
450        let root_node = tree.root_node();
451
452        let query_str = r#"
453            (package_clause) @package
454            (import_declaration) @import
455            (function_declaration) @function
456            (method_declaration) @method
457            (type_declaration) @type
458            (const_declaration) @const
459            (var_declaration) @var
460        "#;
461
462        let query = Query::new(AstLanguage::Go.tree_sitter_language(), query_str)
463            .map_err(|e| ScribeError::parse(format!("Invalid Go query: {}", e)))?;
464
465        let mut cursor = QueryCursor::new();
466        let captures = cursor.matches(&query, root_node, content.as_bytes());
467
468        for match_ in captures {
469            for capture in match_.captures {
470                let node = capture.node;
471                let chunk_type = &query.capture_names()[capture.index as usize];
472
473                let chunk =
474                    self.create_chunk_from_node(content, node, chunk_type, &AstLanguage::Go)?;
475                chunks.push(chunk);
476            }
477        }
478
479        chunks.sort_by_key(|c| c.start_byte);
480        Ok(chunks)
481    }
482
483    /// Parse Rust code chunks using tree-sitter
484    fn parse_rust_chunks(&self, content: &str, tree: &Tree) -> Result<Vec<AstChunk>> {
485        let mut chunks = Vec::new();
486        let root_node = tree.root_node();
487
488        let query_str = r#"
489            (use_declaration) @use
490            (mod_item) @mod
491            (struct_item) @struct
492            (enum_item) @enum
493            (trait_item) @trait
494            (impl_item) @impl
495            (function_item) @function
496            (const_item) @const
497            (static_item) @static
498            (type_item) @type_alias
499        "#;
500
501        let query = Query::new(AstLanguage::Rust.tree_sitter_language(), query_str)
502            .map_err(|e| ScribeError::parse(format!("Invalid Rust query: {}", e)))?;
503
504        let mut cursor = QueryCursor::new();
505        let captures = cursor.matches(&query, root_node, content.as_bytes());
506
507        for match_ in captures {
508            for capture in match_.captures {
509                let node = capture.node;
510                let chunk_type = &query.capture_names()[capture.index as usize];
511
512                let chunk =
513                    self.create_chunk_from_node(content, node, chunk_type, &AstLanguage::Rust)?;
514                chunks.push(chunk);
515            }
516        }
517
518        chunks.sort_by_key(|c| c.start_byte);
519        Ok(chunks)
520    }
521
522    /// Create a chunk from a tree-sitter node
523    fn create_chunk_from_node(
524        &self,
525        content: &str,
526        node: Node,
527        chunk_type: &str,
528        language: &AstLanguage,
529    ) -> Result<AstChunk> {
530        let start_byte = node.start_byte();
531        let end_byte = node.end_byte();
532        let start_position = node.start_position();
533        let end_position = node.end_position();
534
535        let chunk_content = &content[start_byte..end_byte];
536        let estimated_tokens = TokenCounter::global()
537            .count_tokens(chunk_content)
538            .unwrap_or_else(|_| token_utils::estimate_tokens_legacy(chunk_content));
539
540        // Calculate importance score based on chunk type and language
541        let importance_score = self.calculate_importance_score(chunk_type, language, node, content);
542
543        // Extract name if available
544        let name = self.extract_name_from_node(node, content);
545
546        // Check if public/exported
547        let is_public = self.is_node_public(node, content);
548
549        // Check for documentation
550        let has_documentation = self.has_documentation(node, content);
551
552        // Extract dependencies (simplified for now)
553        let dependencies = self.extract_dependencies(node, content);
554
555        Ok(AstChunk {
556            content: chunk_content.to_string(),
557            chunk_type: chunk_type.to_string(),
558            start_line: start_position.row + 1,
559            end_line: end_position.row + 1,
560            start_byte,
561            end_byte,
562            importance_score,
563            estimated_tokens,
564            dependencies,
565            name,
566            is_public,
567            has_documentation,
568        })
569    }
570
571    /// Calculate importance score based on AST analysis
572    fn calculate_importance_score(
573        &self,
574        chunk_type: &str,
575        language: &AstLanguage,
576        node: Node,
577        content: &str,
578    ) -> f64 {
579        let mut score: f64 = match chunk_type {
580            "import" | "import_from" | "use" => 0.9, // Imports are crucial
581            "package" => 0.95,                       // Package declarations are essential
582            "class" | "struct_item" | "trait_item" => 0.85, // Type definitions
583            "interface" | "type_alias" | "enum" => 0.8, // Type definitions
584            "function" | "method" => 0.75,           // Functions
585            "const" | "constant" | "static" => 0.6,  // Constants
586            "export" => 0.7,                         // Exports
587            "mod" | "module" => 0.65,                // Modules
588            _ => 0.5,                                // Default
589        };
590
591        // Boost score for public/exported items
592        if self.is_node_public(node, content) {
593            score += 0.1;
594        }
595
596        // Boost score for documented items
597        if self.has_documentation(node, content) {
598            score += 0.05;
599        }
600
601        // Language-specific adjustments
602        match language {
603            AstLanguage::Rust => {
604                // Rust impl blocks are very important
605                if chunk_type == "impl" {
606                    score = 0.85;
607                }
608            }
609            AstLanguage::TypeScript => {
610                // TypeScript interfaces are crucial
611                if chunk_type == "interface" {
612                    score = 0.9;
613                }
614            }
615            _ => {}
616        }
617
618        score.min(1.0)
619    }
620
621    /// Extract name/identifier from a node
622    fn extract_name_from_node(&self, node: Node, content: &str) -> Option<String> {
623        // Look for name field in node
624        for i in 0..node.child_count() {
625            if let Some(child) = node.child(i) {
626                if child.kind() == "identifier" || child.kind() == "type_identifier" {
627                    let name_bytes = &content.as_bytes()[child.start_byte()..child.end_byte()];
628                    if let Ok(name) = std::str::from_utf8(name_bytes) {
629                        return Some(name.to_string());
630                    }
631                }
632            }
633        }
634        None
635    }
636
637    /// Check if a node represents a public/exported item
638    fn is_node_public(&self, node: Node, content: &str) -> bool {
639        // Check for pub keyword in Rust
640        if let Some(parent) = node.parent() {
641            for i in 0..parent.child_count() {
642                if let Some(child) = parent.child(i) {
643                    if child.kind() == "visibility_modifier" {
644                        let vis_bytes = &content.as_bytes()[child.start_byte()..child.end_byte()];
645                        if let Ok(vis) = std::str::from_utf8(vis_bytes) {
646                            return vis.contains("pub");
647                        }
648                    }
649                }
650            }
651        }
652
653        // Check for export in JS/TS
654        let node_text = &content[node.start_byte()..node.end_byte()];
655        node_text.starts_with("export") || node_text.contains("export")
656    }
657
658    /// Check if a node has associated documentation
659    fn has_documentation(&self, node: Node, content: &str) -> bool {
660        // Look for comments before the node
661        if let Some(prev_sibling) = node.prev_sibling() {
662            if prev_sibling.kind() == "comment" {
663                return true;
664            }
665        }
666
667        // Look for docstrings in Python
668        if node.kind() == "function_definition" || node.kind() == "class_definition" {
669            for i in 0..node.child_count() {
670                if let Some(child) = node.child(i) {
671                    if child.kind() == "expression_statement" {
672                        if let Some(grandchild) = child.child(0) {
673                            if grandchild.kind() == "string" {
674                                let string_content =
675                                    &content[grandchild.start_byte()..grandchild.end_byte()];
676                                if string_content.starts_with("\"\"\"")
677                                    || string_content.starts_with("'''")
678                                {
679                                    return true;
680                                }
681                            }
682                        }
683                    }
684                }
685            }
686        }
687
688        false
689    }
690
691    /// Extract dependencies from a node (simplified implementation)
692    fn extract_dependencies(&self, node: Node, content: &str) -> Vec<String> {
693        let mut dependencies = Vec::new();
694
695        // For import nodes, extract the imported modules
696        if node.kind() == "import_statement"
697            || node.kind() == "import_from_statement"
698            || node.kind() == "use_declaration"
699        {
700            // This is a simplified implementation
701            // In a full implementation, we'd parse the specific import syntax
702            let import_text = &content[node.start_byte()..node.end_byte()];
703
704            // Extract quoted strings as module names
705            let mut in_quote = false;
706            let mut quote_char = '"';
707            let mut current_module = String::new();
708
709            for ch in import_text.chars() {
710                if ch == '"' || ch == '\'' {
711                    if !in_quote {
712                        in_quote = true;
713                        quote_char = ch;
714                    } else if ch == quote_char {
715                        in_quote = false;
716                        if !current_module.is_empty() {
717                            dependencies.push(current_module.clone());
718                            current_module.clear();
719                        }
720                    }
721                } else if in_quote {
722                    current_module.push(ch);
723                }
724            }
725        }
726
727        dependencies
728    }
729
730    /// Extract signatures for Python
731    fn extract_python_signatures(&self, content: &str, tree: &Tree) -> Result<Vec<AstSignature>> {
732        let mut signatures = Vec::new();
733        let root_node = tree.root_node();
734
735        let query_str = r#"
736            (function_definition 
737                name: (identifier) @func_name
738                parameters: (parameters) @func_params
739            ) @function
740            (class_definition 
741                name: (identifier) @class_name
742            ) @class
743            (import_statement) @import
744            (import_from_statement) @import_from
745        "#;
746
747        let query = Query::new(AstLanguage::Python.tree_sitter_language(), query_str)
748            .map_err(|e| ScribeError::parse(format!("Invalid Python signature query: {}", e)))?;
749
750        let mut cursor = QueryCursor::new();
751        let captures = cursor.matches(&query, root_node, content.as_bytes());
752
753        for match_ in captures {
754            let signature = self.extract_signature_from_match(content, &match_, &query)?;
755            signatures.push(signature);
756        }
757
758        Ok(signatures)
759    }
760
761    /// Extract signatures for other languages (similar pattern)
762    fn extract_javascript_signatures(
763        &self,
764        content: &str,
765        tree: &Tree,
766    ) -> Result<Vec<AstSignature>> {
767        let query_str = r#"
768            (function_declaration
769                name: (identifier) @name
770            ) @function
771
772            (arrow_function) @function
773
774            (class_declaration
775                name: (identifier) @name
776            ) @class
777
778            (import_statement) @import
779            (export_statement) @export
780        "#;
781
782        let query =
783            Query::new(AstLanguage::JavaScript.tree_sitter_language(), query_str).map_err(|e| {
784                ScribeError::parse(format!("Invalid JavaScript signature query: {}", e))
785            })?;
786
787        let root_node = tree.root_node();
788        let mut cursor = tree_sitter::QueryCursor::new();
789        let matches = cursor.matches(&query, root_node, content.as_bytes());
790
791        let mut signatures = Vec::new();
792        for match_ in matches {
793            let signature = self.extract_signature_from_match(content, &match_, &query)?;
794            signatures.push(signature);
795        }
796
797        Ok(signatures)
798    }
799
800    fn extract_typescript_signatures(
801        &self,
802        content: &str,
803        tree: &Tree,
804    ) -> Result<Vec<AstSignature>> {
805        let query_str = r#"
806            (function_declaration
807                name: (identifier) @name
808            ) @function
809
810            (interface_declaration
811                name: (type_identifier) @name
812            ) @interface
813
814            (type_alias_declaration
815                name: (type_identifier) @name
816            ) @type
817
818            (class_declaration
819                name: (identifier) @name
820            ) @class
821
822            (import_statement) @import
823            (export_statement) @export
824        "#;
825
826        let query =
827            Query::new(AstLanguage::TypeScript.tree_sitter_language(), query_str).map_err(|e| {
828                ScribeError::parse(format!("Invalid TypeScript signature query: {}", e))
829            })?;
830
831        let root_node = tree.root_node();
832        let mut cursor = tree_sitter::QueryCursor::new();
833        let matches = cursor.matches(&query, root_node, content.as_bytes());
834
835        let mut signatures = Vec::new();
836        for match_ in matches {
837            let signature = self.extract_signature_from_match(content, &match_, &query)?;
838            signatures.push(signature);
839        }
840
841        Ok(signatures)
842    }
843
844    fn extract_go_signatures(&self, content: &str, tree: &Tree) -> Result<Vec<AstSignature>> {
845        let query_str = r#"
846            (function_declaration
847                name: (identifier) @name
848            ) @function
849
850            (type_declaration
851                (type_spec
852                    name: (type_identifier) @name
853                )
854            ) @type
855
856            (import_declaration) @import
857            (package_clause) @package
858        "#;
859
860        let query = Query::new(AstLanguage::Go.tree_sitter_language(), query_str)
861            .map_err(|e| ScribeError::parse(format!("Invalid Go signature query: {}", e)))?;
862
863        let root_node = tree.root_node();
864        let mut cursor = tree_sitter::QueryCursor::new();
865        let matches = cursor.matches(&query, root_node, content.as_bytes());
866
867        let mut signatures = Vec::new();
868        for match_ in matches {
869            let signature = self.extract_signature_from_match(content, &match_, &query)?;
870            signatures.push(signature);
871        }
872
873        Ok(signatures)
874    }
875
876    fn extract_rust_signatures(&self, content: &str, tree: &Tree) -> Result<Vec<AstSignature>> {
877        let query_str = r#"
878            (function_item
879                name: (identifier) @name
880            ) @function
881
882            (impl_item
883                type: (type_identifier) @type_name
884            ) @impl
885
886            (struct_item
887                name: (type_identifier) @name
888            ) @struct
889
890            (enum_item
891                name: (type_identifier) @name
892            ) @enum
893
894            (trait_item
895                name: (type_identifier) @name
896            ) @trait
897
898            (mod_item
899                name: (identifier) @name
900            ) @module
901
902            (use_declaration) @use
903        "#;
904
905        let query = Query::new(AstLanguage::Rust.tree_sitter_language(), query_str)
906            .map_err(|e| ScribeError::parse(format!("Invalid Rust signature query: {}", e)))?;
907
908        let root_node = tree.root_node();
909        let mut cursor = tree_sitter::QueryCursor::new();
910        let matches = cursor.matches(&query, root_node, content.as_bytes());
911
912        let mut signatures = Vec::new();
913        for match_ in matches {
914            let signature = self.extract_signature_from_match(content, &match_, &query)?;
915            signatures.push(signature);
916        }
917
918        Ok(signatures)
919    }
920
921    /// Extract signature from a query match
922    fn extract_signature_from_match(
923        &self,
924        content: &str,
925        match_: &tree_sitter::QueryMatch,
926        query: &Query,
927    ) -> Result<AstSignature> {
928        let mut signature_text = String::new();
929        let mut signature_type = String::new();
930        let mut name = String::new();
931        let mut line = 0;
932
933        for capture in match_.captures {
934            let capture_name = &query.capture_names()[capture.index as usize];
935            let node = capture.node;
936            let node_text = &content[node.start_byte()..node.end_byte()];
937
938            match capture_name.as_str() {
939                "function" | "class" | "import" | "import_from" => {
940                    signature_text = node_text.lines().next().unwrap_or("").to_string();
941                    signature_type = capture_name.to_string();
942                    line = node.start_position().row + 1;
943                }
944                "func_name" | "class_name" => {
945                    name = node_text.to_string();
946                }
947                _ => {}
948            }
949        }
950
951        Ok(AstSignature {
952            signature: signature_text,
953            signature_type,
954            name,
955            parameters: Vec::new(), // Simplified
956            return_type: None,      // Simplified
957            is_public: false,       // Simplified
958            line,
959        })
960    }
961
962    /// Extract Python import from a single node (optimized, no recursion)
963    fn extract_python_import_node(
964        &self,
965        node: Node,
966        content: &str,
967        imports: &mut Vec<AstImport>,
968    ) -> Result<()> {
969        // Look for import_statement and import_from_statement nodes
970        if node.kind() == "import_statement" {
971            // Handle import statements like "import os" or "import sys as system"
972            for i in 0..node.child_count() {
973                if let Some(child) = node.child(i) {
974                    if child.kind() == "aliased_import" {
975                        // Handle "import sys as system"
976                        if let Some(name_node) = child.child_by_field_name("name") {
977                            let module = self.node_text(name_node, content);
978                            let alias = child
979                                .child_by_field_name("alias")
980                                .map(|alias_node| self.node_text(alias_node, content));
981                            let line_number = name_node.start_position().row + 1;
982
983                            imports.push(AstImport {
984                                module,
985                                alias,
986                                items: vec![],
987                                line_number,
988                                is_relative: false,
989                            });
990                        }
991                    } else if child.kind() == "dotted_as_name" {
992                        // Handle dotted imports with alias like "import package.module as mod"
993                        if let Some(name_node) = child.child_by_field_name("name") {
994                            let module = self.node_text(name_node, content);
995                            let alias = child
996                                .child_by_field_name("alias")
997                                .map(|alias_node| self.node_text(alias_node, content));
998                            let line_number = name_node.start_position().row + 1;
999
1000                            imports.push(AstImport {
1001                                module,
1002                                alias,
1003                                items: vec![],
1004                                line_number,
1005                                is_relative: false,
1006                            });
1007                        }
1008                    } else if child.kind() == "dotted_name" || child.kind() == "identifier" {
1009                        // Handle simple "import os"
1010                        let module = self.node_text(child, content);
1011                        let line_number = child.start_position().row + 1;
1012
1013                        imports.push(AstImport {
1014                            module,
1015                            alias: None,
1016                            items: vec![],
1017                            line_number,
1018                            is_relative: false,
1019                        });
1020                    }
1021                }
1022            }
1023        } else if node.kind() == "import_from_statement" {
1024            let mut module = String::new();
1025            let mut items = Vec::new();
1026            let mut is_relative = false;
1027
1028            if let Some(module_node) = node.child_by_field_name("module_name") {
1029                module = self.node_text(module_node, content);
1030                is_relative = module.starts_with('.');
1031            }
1032
1033            // Get imported items
1034            for i in 0..node.child_count() {
1035                if let Some(child) = node.child(i) {
1036                    if child.kind() == "import_list" {
1037                        for j in 0..child.child_count() {
1038                            if let Some(item) = child.child(j) {
1039                                if item.kind() == "dotted_name" || item.kind() == "identifier" {
1040                                    items.push(self.node_text(item, content));
1041                                }
1042                            }
1043                        }
1044                    }
1045                }
1046            }
1047
1048            let line_number = node.start_position().row + 1;
1049            imports.push(AstImport {
1050                module,
1051                alias: None,
1052                items,
1053                line_number,
1054                is_relative,
1055            });
1056        }
1057
1058        Ok(())
1059    }
1060
1061    /// Extract JavaScript/TypeScript import from a single node (optimized, no recursion)
1062    fn extract_js_ts_import_node(
1063        &self,
1064        node: Node,
1065        content: &str,
1066        imports: &mut Vec<AstImport>,
1067    ) -> Result<()> {
1068        if node.kind() == "import_statement" {
1069            let mut module = String::new();
1070            let items = Vec::new();
1071
1072            // Find the source
1073            for i in 0..node.child_count() {
1074                if let Some(child) = node.child(i) {
1075                    if child.kind() == "string" {
1076                        module = self.node_text(child, content);
1077                        // Remove quotes
1078                        module = module.trim_matches('"').trim_matches('\'').to_string();
1079                        break;
1080                    }
1081                }
1082            }
1083
1084            let line_number = node.start_position().row + 1;
1085            imports.push(AstImport {
1086                module,
1087                alias: None,
1088                items,
1089                line_number,
1090                is_relative: false,
1091            });
1092        }
1093        Ok(())
1094    }
1095
1096    /// Extract Go import from a single node (optimized, no recursion)
1097    fn extract_go_import_node(
1098        &self,
1099        node: Node,
1100        content: &str,
1101        imports: &mut Vec<AstImport>,
1102    ) -> Result<()> {
1103        if node.kind() == "import_spec" {
1104            for i in 0..node.child_count() {
1105                if let Some(child) = node.child(i) {
1106                    if child.kind() == "interpreted_string_literal" {
1107                        let module = self.node_text(child, content);
1108                        let module = module.trim_matches('"').to_string();
1109                        let line_number = child.start_position().row + 1;
1110
1111                        imports.push(AstImport {
1112                            module,
1113                            alias: None,
1114                            items: vec![],
1115                            line_number,
1116                            is_relative: false,
1117                        });
1118                    }
1119                }
1120            }
1121        }
1122        Ok(())
1123    }
1124
1125    /// Extract Rust import from a single node (optimized, no recursion)
1126    fn extract_rust_import_node(
1127        &self,
1128        node: Node,
1129        content: &str,
1130        imports: &mut Vec<AstImport>,
1131    ) -> Result<()> {
1132        if node.kind() == "use_declaration" {
1133            if let Some(use_tree) = node.child_by_field_name("argument") {
1134                let module = self.node_text(use_tree, content);
1135                let line_number = node.start_position().row + 1;
1136
1137                imports.push(AstImport {
1138                    module,
1139                    alias: None,
1140                    items: vec![],
1141                    line_number,
1142                    is_relative: false,
1143                });
1144            }
1145        }
1146        Ok(())
1147    }
1148
1149    /// Helper to extract text from a node
1150    fn node_text(&self, node: Node, content: &str) -> String {
1151        content[node.start_byte()..node.end_byte()].to_string()
1152    }
1153
1154    /// Search for entities (functions, classes, etc.) by name within parsed content
1155    ///
1156    /// Returns locations of all matching entities across the provided content.
1157    pub fn find_entities(
1158        &mut self,
1159        content: &str,
1160        file_path: &str,
1161        query: &EntityQuery,
1162    ) -> Result<Vec<EntityLocation>> {
1163        let chunks = self.parse_chunks(content, file_path)?;
1164        let mut locations = Vec::new();
1165
1166        for chunk in chunks {
1167            if self.matches_query(&chunk, query) {
1168                locations.push(EntityLocation {
1169                    file_path: file_path.to_string(),
1170                    entity_type: chunk.chunk_type.clone(),
1171                    entity_name: chunk.name.clone().unwrap_or_default(),
1172                    start_line: chunk.start_line,
1173                    end_line: chunk.end_line,
1174                    is_public: chunk.is_public,
1175                    content: chunk.content.clone(),
1176                });
1177            }
1178        }
1179
1180        Ok(locations)
1181    }
1182
1183    /// Check if a chunk matches the entity query
1184    fn matches_query(&self, chunk: &AstChunk, query: &EntityQuery) -> bool {
1185        // Match by entity type if specified
1186        if let Some(ref entity_type) = query.entity_type {
1187            if !self.chunk_type_matches(entity_type, &chunk.chunk_type) {
1188                return false;
1189            }
1190        }
1191
1192        // Match by name if specified
1193        if let Some(ref name_pattern) = query.name_pattern {
1194            let chunk_name = chunk.name.as_deref().unwrap_or("");
1195            if query.exact_match {
1196                if chunk_name != name_pattern {
1197                    return false;
1198                }
1199            } else {
1200                // Case-insensitive substring match
1201                if !chunk_name.to_lowercase().contains(&name_pattern.to_lowercase()) {
1202                    return false;
1203                }
1204            }
1205        }
1206
1207        // Match by visibility if specified
1208        if let Some(public_only) = query.public_only {
1209            if public_only && !chunk.is_public {
1210                return false;
1211            }
1212        }
1213
1214        true
1215    }
1216
1217    /// Check if chunk type matches the requested entity type
1218    fn chunk_type_matches(&self, requested: &EntityType, chunk_type: &str) -> bool {
1219        match requested {
1220            EntityType::Function => matches!(chunk_type, "function" | "method"),
1221            EntityType::Class => matches!(chunk_type, "class" | "struct_item" | "trait_item"),
1222            EntityType::Module => matches!(chunk_type, "mod" | "module" | "package"),
1223            EntityType::Interface => matches!(chunk_type, "interface" | "trait_item"),
1224            EntityType::Constant => matches!(chunk_type, "const" | "constant" | "static"),
1225            EntityType::Any => true,
1226        }
1227    }
1228}
1229
/// Entity type for search queries
///
/// Each variant selects a family of chunk type strings; the mapping is
/// implemented by `AstParser::chunk_type_matches`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum EntityType {
    /// Functions and methods.
    Function,
    /// Classes, plus struct- and trait-like definitions.
    Class,
    /// Modules and packages.
    Module,
    /// Interfaces and traits.
    Interface,
    /// Constants and statics.
    Constant,
    /// Matches every chunk type.
    Any,
}
1240
/// Query for finding entities
///
/// All criteria that are set must hold for a chunk to match; criteria left as
/// `None` are not checked.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityQuery {
    /// Type of entity to search for (None means any type)
    pub entity_type: Option<EntityType>,
    /// Name pattern to match (None means any name)
    pub name_pattern: Option<String>,
    /// Whether to match name exactly (vs substring).
    /// When `false`, the name is matched as a case-insensitive substring.
    pub exact_match: bool,
    /// Only return public/exported entities.
    /// Only `Some(true)` filters; `None` and `Some(false)` return entities of any visibility.
    pub public_only: Option<bool>,
}
1253
1254impl EntityQuery {
1255    /// Create a query for any entity with a specific name
1256    pub fn by_name(name: &str) -> Self {
1257        Self {
1258            entity_type: None,
1259            name_pattern: Some(name.to_string()),
1260            exact_match: false,
1261            public_only: None,
1262        }
1263    }
1264
1265    /// Create a query for a specific entity type
1266    pub fn by_type(entity_type: EntityType) -> Self {
1267        Self {
1268            entity_type: Some(entity_type),
1269            name_pattern: None,
1270            exact_match: false,
1271            public_only: None,
1272        }
1273    }
1274
1275    /// Create a query for a specific function by name
1276    pub fn function(name: &str) -> Self {
1277        Self {
1278            entity_type: Some(EntityType::Function),
1279            name_pattern: Some(name.to_string()),
1280            exact_match: false,
1281            public_only: None,
1282        }
1283    }
1284
1285    /// Create a query for a specific class/struct by name
1286    pub fn class(name: &str) -> Self {
1287        Self {
1288            entity_type: Some(EntityType::Class),
1289            name_pattern: Some(name.to_string()),
1290            exact_match: false,
1291            public_only: None,
1292        }
1293    }
1294
1295    /// Create a query for a specific module by path
1296    pub fn module(path: &str) -> Self {
1297        Self {
1298            entity_type: Some(EntityType::Module),
1299            name_pattern: Some(path.to_string()),
1300            exact_match: false,
1301            public_only: None,
1302        }
1303    }
1304
1305    /// Set whether to match exactly
1306    pub fn exact(mut self) -> Self {
1307        self.exact_match = true;
1308        self
1309    }
1310
1311    /// Only match public/exported entities
1312    pub fn public(mut self) -> Self {
1313        self.public_only = Some(true);
1314        self
1315    }
1316}
1317
/// Location of an entity in the codebase
///
/// Produced by `AstParser::find_entities`, one value per matching chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityLocation {
    /// File path containing the entity (as passed to `find_entities`)
    pub file_path: String,
    /// Type of entity (function, class, etc.); copied from the chunk's type string
    pub entity_type: String,
    /// Name of the entity; empty when the chunk has no name
    pub entity_name: String,
    /// Start line number (1-indexed)
    pub start_line: usize,
    /// End line number (1-indexed)
    pub end_line: usize,
    /// Whether this entity is public/exported
    pub is_public: bool,
    /// Full content of the entity
    pub content: String,
}
1336
1337impl EntityLocation {
1338    /// Get a unique identifier for this entity
1339    pub fn identifier(&self) -> String {
1340        format!("{}::{}", self.file_path, self.entity_name)
1341    }
1342}
1343
1344impl Default for AstParser {
1345    fn default() -> Self {
1346        Self::new().expect("Failed to create AstParser")
1347    }
1348}
1349
#[cfg(test)]
mod tests {
    use super::*;

    /// Constructing a parser must succeed.
    #[test]
    fn test_ast_parser_creation() {
        assert!(AstParser::new().is_ok());
    }

    /// Known extensions map to their language; unknown ones map to `None`.
    #[test]
    fn test_language_detection() {
        let expectations = [
            ("py", Some(AstLanguage::Python)),
            ("js", Some(AstLanguage::JavaScript)),
            ("ts", Some(AstLanguage::TypeScript)),
            ("go", Some(AstLanguage::Go)),
            ("rs", Some(AstLanguage::Rust)),
            ("unknown", None),
        ];

        for (extension, expected) in expectations.iter() {
            assert_eq!(AstLanguage::from_extension(extension), *expected);
        }
    }

    /// Python source yields import, function and class chunks.
    #[test]
    fn test_python_parsing() {
        let content = r#"
import os
import sys

def hello_world():
    """A simple function."""
    print("Hello, world!")

class Calculator:
    """A simple calculator."""
    
    def add(self, a, b):
        return a + b
"#;

        let mut parser = AstParser::new().unwrap();
        let chunks = parser.parse_chunks(content, "test.py").unwrap();
        assert!(!chunks.is_empty());

        for kind in ["import", "function", "class"].iter() {
            assert!(
                chunks.iter().any(|chunk| chunk.chunk_type.as_str() == *kind),
                "missing chunk type: {}",
                kind
            );
        }
    }

    /// Rust source yields use, struct and impl chunks.
    #[test]
    fn test_rust_parsing() {
        let content = r#"
use std::collections::HashMap;

pub struct DataProcessor {
    data: HashMap<String, i32>,
}

impl DataProcessor {
    pub fn new() -> Self {
        Self {
            data: HashMap::new(),
        }
    }
}
"#;

        let mut parser = AstParser::new().unwrap();
        let chunks = parser.parse_chunks(content, "test.rs").unwrap();
        assert!(!chunks.is_empty());

        for kind in ["use", "struct", "impl"].iter() {
            assert!(
                chunks.iter().any(|chunk| chunk.chunk_type.as_str() == *kind),
                "missing chunk type: {}",
                kind
            );
        }
    }

    /// Signature extraction on Python source returns at least one signature.
    #[test]
    fn test_signature_extraction() {
        let content = r#"
def calculate(a: int, b: int) -> int:
    return a + b

class Calculator:
    def multiply(self, x, y):
        return x * y
"#;

        let mut parser = AstParser::new().unwrap();
        let signatures = parser.extract_signatures(content, "test.py").unwrap();
        assert!(!signatures.is_empty());
    }
}