scribe_analysis/
ast_import_parser.rs

1//! Optimized AST-based import extraction for analysis module
2//!
3//! This module provides a high-performance AST parser specifically for extracting
4//! import statements from source code using TreeCursor for efficient traversal
5//! and parser reuse for better performance.
6
7use once_cell::sync::Lazy;
8use rayon::prelude::*;
9use scribe_core::Result;
10use std::collections::HashMap;
11use std::sync::{Arc, Mutex};
12use tree_sitter::{Language, Node, Parser, Tree, TreeCursor};
13
/// Simple import information
///
/// One record is produced per imported module found by the extractor.
#[derive(Debug, Clone)]
pub struct SimpleImport {
    /// The module being imported, as it appears in the source text.
    /// For JavaScript/TypeScript and Go the surrounding quotes are stripped;
    /// for Rust this is the full text of the `use` argument.
    pub module: String,
    /// Line number where the import appears (1-based: tree-sitter row + 1)
    pub line_number: usize,
}
22
/// Supported programming languages for import extraction
///
/// The type is `Copy` and `Hash` so it can be passed by value and used as a
/// map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ImportLanguage {
    /// Python source (`py`, `pyi`, `pyw` extensions).
    Python,
    /// JavaScript source (`js`, `mjs`, `cjs` extensions).
    JavaScript,
    /// TypeScript source (`ts`, `mts`, `cts` extensions).
    TypeScript,
    /// Go source (`go` extension).
    Go,
    /// Rust source (`rs` extension).
    Rust,
}
32
33impl ImportLanguage {
34    /// Get the tree-sitter language for this language
35    pub fn tree_sitter_language(&self) -> Language {
36        match self {
37            ImportLanguage::Python => tree_sitter_python::language(),
38            ImportLanguage::JavaScript => tree_sitter_javascript::language(),
39            ImportLanguage::TypeScript => tree_sitter_typescript::language_typescript(),
40            ImportLanguage::Go => tree_sitter_go::language(),
41            ImportLanguage::Rust => tree_sitter_rust::language(),
42        }
43    }
44
45    /// Detect language from file extension
46    pub fn from_extension(ext: &str) -> Option<Self> {
47        match ext.to_lowercase().as_str() {
48            "py" | "pyi" | "pyw" => Some(ImportLanguage::Python),
49            "js" | "mjs" | "cjs" => Some(ImportLanguage::JavaScript),
50            "ts" | "mts" | "cts" => Some(ImportLanguage::TypeScript),
51            "go" => Some(ImportLanguage::Go),
52            "rs" => Some(ImportLanguage::Rust),
53            _ => None,
54        }
55    }
56}
57
/// Thread-safe parser pool for reusing parsers
///
/// Maps each language to a stack of idle parsers already configured for that
/// language. Created lazily on first access; every access goes through the
/// mutex, so callers should hold the lock only long enough to pop or push.
static PARSER_POOL: Lazy<Arc<Mutex<HashMap<ImportLanguage, Vec<Parser>>>>> =
    Lazy::new(|| Arc::new(Mutex::new(HashMap::new())));
61
/// Node types that can contain imports - for fast filtering
///
/// Holds both the import node kinds themselves and the root node kinds
/// (`source_file`, `module`) that the traversal must descend through to
/// reach them.
const IMPORT_NODE_TYPES: &[&str] = &[
    "import_statement",
    "import_from_statement",
    "use_declaration",
    "import_declaration",
    "import_spec",
    "source_file",
    "module",
];
72
/// Import extractor built on tree-sitter that reuses parsers and walks the
/// syntax tree with a `TreeCursor`.
pub struct SimpleAstParser {
    // Deliberately field-less: parsers are taken from the pool rather than
    // stored on the instance.
}

impl std::fmt::Debug for SimpleAstParser {
    /// Renders the parser as a struct with one placeholder `parsers` entry.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut builder = f.debug_struct("SimpleAstParser");
        builder.field("parsers", &"[reusable pool]");
        builder.finish()
    }
}
85
86impl SimpleAstParser {
87    /// Create a new simple AST parser
88    pub fn new() -> Result<Self> {
89        // Initialize the parser pool on first creation
90        Self::ensure_parser_pool_initialized()?;
91        Ok(Self {})
92    }
93
94    /// Ensure the parser pool is initialized with all supported languages
95    fn ensure_parser_pool_initialized() -> Result<()> {
96        let mut pool = PARSER_POOL.lock().unwrap();
97
98        for language in [
99            ImportLanguage::Python,
100            ImportLanguage::JavaScript,
101            ImportLanguage::TypeScript,
102            ImportLanguage::Go,
103            ImportLanguage::Rust,
104        ] {
105            if !pool.contains_key(&language) {
106                let mut parser = Parser::new();
107                parser
108                    .set_language(language.tree_sitter_language())
109                    .map_err(|e| {
110                        scribe_core::ScribeError::parse(format!(
111                            "Failed to set tree-sitter language: {}",
112                            e
113                        ))
114                    })?;
115                pool.insert(language, vec![parser]);
116            }
117        }
118
119        Ok(())
120    }
121
122    /// Get a parser from the pool or create a new one
123    fn get_parser(&self, language: ImportLanguage) -> Result<Parser> {
124        let mut pool = PARSER_POOL.lock().unwrap();
125
126        if let Some(parsers) = pool.get_mut(&language) {
127            if let Some(parser) = parsers.pop() {
128                return Ok(parser);
129            }
130        }
131
132        // Create a new parser if pool is empty
133        let mut parser = Parser::new();
134        parser
135            .set_language(language.tree_sitter_language())
136            .map_err(|e| {
137                scribe_core::ScribeError::parse(format!(
138                    "Failed to set tree-sitter language: {}",
139                    e
140                ))
141            })?;
142        Ok(parser)
143    }
144
145    /// Return a parser to the pool
146    fn return_parser(&self, language: ImportLanguage, parser: Parser) {
147        let mut pool = PARSER_POOL.lock().unwrap();
148        pool.entry(language).or_insert_with(Vec::new).push(parser);
149    }
150
151    /// Extract imports from the given content using optimized tree-sitter traversal
152    pub fn extract_imports(
153        &self,
154        content: &str,
155        language: ImportLanguage,
156    ) -> Result<Vec<SimpleImport>> {
157        // Get parser from pool
158        let mut parser = self.get_parser(language)?;
159
160        let tree = parser
161            .parse(content, None)
162            .ok_or_else(|| scribe_core::ScribeError::parse("Failed to parse content"))?;
163
164        let mut imports = Vec::new();
165
166        // Use TreeCursor for efficient traversal
167        let mut cursor = tree.walk();
168        self.extract_imports_with_cursor(&mut cursor, content, language, &mut imports)?;
169
170        // Return parser to pool
171        self.return_parser(language, parser);
172
173        Ok(imports)
174    }
175
176    /// Extract imports using TreeCursor for optimal performance
177    fn extract_imports_with_cursor(
178        &self,
179        cursor: &mut TreeCursor,
180        content: &str,
181        language: ImportLanguage,
182        imports: &mut Vec<SimpleImport>,
183    ) -> Result<()> {
184        let node = cursor.node();
185
186        // Fast filter: skip nodes that can't contain imports
187        if !self.node_can_contain_imports(node.kind()) {
188            return Ok(());
189        }
190
191        // Process current node if it's an import
192        if self.is_import_node(node.kind()) {
193            self.extract_import_from_node(node, content, language, imports)?;
194        }
195
196        // Traverse children using cursor (much faster than child(i) loops)
197        if cursor.goto_first_child() {
198            loop {
199                self.extract_imports_with_cursor(cursor, content, language, imports)?;
200                if !cursor.goto_next_sibling() {
201                    break;
202                }
203            }
204            cursor.goto_parent();
205        }
206
207        Ok(())
208    }
209
210    /// Check if a node type can contain imports (fast filter)
211    fn node_can_contain_imports(&self, kind: &str) -> bool {
212        IMPORT_NODE_TYPES.contains(&kind)
213            || kind.contains("import")
214            || kind.contains("use")
215            || kind == "program"
216            || kind == "translation_unit"
217            || kind == "block"
218            || kind == "statement_block"
219    }
220
221    /// Check if a node is an import statement
222    fn is_import_node(&self, kind: &str) -> bool {
223        matches!(
224            kind,
225            "import_statement"
226                | "import_from_statement"
227                | "use_declaration"
228                | "import_declaration"
229                | "import_spec"
230        )
231    }
232
233    /// Extract import from a specific node (no recursion needed)
234    fn extract_import_from_node(
235        &self,
236        node: Node,
237        content: &str,
238        language: ImportLanguage,
239        imports: &mut Vec<SimpleImport>,
240    ) -> Result<()> {
241        match language {
242            ImportLanguage::Python => {
243                self.extract_python_import_node(node, content, imports)?;
244            }
245            ImportLanguage::JavaScript | ImportLanguage::TypeScript => {
246                self.extract_js_ts_import_node(node, content, imports)?;
247            }
248            ImportLanguage::Go => {
249                self.extract_go_import_node(node, content, imports)?;
250            }
251            ImportLanguage::Rust => {
252                self.extract_rust_import_node(node, content, imports)?;
253            }
254        }
255        Ok(())
256    }
257
258    /// Extract Python import from a single node (optimized, no recursion)
259    fn extract_python_import_node(
260        &self,
261        node: Node,
262        content: &str,
263        imports: &mut Vec<SimpleImport>,
264    ) -> Result<()> {
265        if node.kind() == "import_statement" {
266            // Handle import statements like "import os" or "import sys as system"
267            let mut cursor = node.walk();
268            if cursor.goto_first_child() {
269                loop {
270                    let child = cursor.node();
271                    if child.kind() == "dotted_name" || child.kind() == "identifier" {
272                        let module = self.node_text(child, content);
273                        let line_number = child.start_position().row + 1;
274
275                        imports.push(SimpleImport {
276                            module,
277                            line_number,
278                        });
279                    }
280                    if !cursor.goto_next_sibling() {
281                        break;
282                    }
283                }
284            }
285        } else if node.kind() == "import_from_statement" {
286            if let Some(module_node) = node.child_by_field_name("module_name") {
287                let module = self.node_text(module_node, content);
288                let line_number = node.start_position().row + 1;
289                imports.push(SimpleImport {
290                    module,
291                    line_number,
292                });
293            }
294        }
295        Ok(())
296    }
297
298    /// Extract JavaScript/TypeScript import from a single node (optimized, no recursion)
299    fn extract_js_ts_import_node(
300        &self,
301        node: Node,
302        content: &str,
303        imports: &mut Vec<SimpleImport>,
304    ) -> Result<()> {
305        if node.kind() == "import_statement" {
306            // Find the source
307            let mut cursor = node.walk();
308            if cursor.goto_first_child() {
309                loop {
310                    let child = cursor.node();
311                    if child.kind() == "string" {
312                        let mut module = self.node_text(child, content);
313                        // Remove quotes
314                        module = module.trim_matches('"').trim_matches('\'').to_string();
315                        let line_number = node.start_position().row + 1;
316                        imports.push(SimpleImport {
317                            module,
318                            line_number,
319                        });
320                        break;
321                    }
322                    if !cursor.goto_next_sibling() {
323                        break;
324                    }
325                }
326            }
327        }
328        Ok(())
329    }
330
331    /// Extract Go import from a single node (optimized, no recursion)
332    fn extract_go_import_node(
333        &self,
334        node: Node,
335        content: &str,
336        imports: &mut Vec<SimpleImport>,
337    ) -> Result<()> {
338        if node.kind() == "import_spec" {
339            let mut cursor = node.walk();
340            if cursor.goto_first_child() {
341                loop {
342                    let child = cursor.node();
343                    if child.kind() == "interpreted_string_literal" {
344                        let module = self.node_text(child, content);
345                        let module = module.trim_matches('"').to_string();
346                        let line_number = child.start_position().row + 1;
347
348                        imports.push(SimpleImport {
349                            module,
350                            line_number,
351                        });
352                    }
353                    if !cursor.goto_next_sibling() {
354                        break;
355                    }
356                }
357            }
358        }
359        Ok(())
360    }
361
362    /// Extract Rust import from a single node (optimized, no recursion)
363    fn extract_rust_import_node(
364        &self,
365        node: Node,
366        content: &str,
367        imports: &mut Vec<SimpleImport>,
368    ) -> Result<()> {
369        if node.kind() == "use_declaration" {
370            if let Some(use_tree) = node.child_by_field_name("argument") {
371                let module = self.node_text(use_tree, content);
372                let line_number = node.start_position().row + 1;
373
374                imports.push(SimpleImport {
375                    module,
376                    line_number,
377                });
378            }
379        }
380        Ok(())
381    }
382
383    /// Helper to extract text from a node
384    fn node_text(&self, node: Node, content: &str) -> String {
385        content[node.start_byte()..node.end_byte()].to_string()
386    }
387
388    /// Extract imports from multiple files in parallel for maximum performance
389    pub fn extract_imports_parallel(
390        &self,
391        files: &[(String, String, ImportLanguage)], // (path, content, language)
392    ) -> Result<Vec<(String, Vec<SimpleImport>)>> {
393        // Use rayon for parallel processing
394        files
395            .par_iter()
396            .map(|(path, content, language)| {
397                let imports = self.extract_imports(content, *language)?;
398                Ok((path.clone(), imports))
399            })
400            .collect()
401    }
402
403    /// Batch extract imports for multiple contents with the same language
404    pub fn extract_imports_batch(
405        &self,
406        contents: &[&str],
407        language: ImportLanguage,
408    ) -> Result<Vec<Vec<SimpleImport>>> {
409        contents
410            .par_iter()
411            .map(|content| self.extract_imports(content, language))
412            .collect()
413    }
414}
415
/// Builds a parser via [`SimpleAstParser::new`].
///
/// # Panics
/// Panics with "Failed to create SimpleAstParser" if `new` returns an error;
/// call `new` directly to handle that case.
impl Default for SimpleAstParser {
    fn default() -> Self {
        Self::new().expect("Failed to create SimpleAstParser")
    }
}