Skip to main content

oli_tui/tools/code/
parser.rs

1use anyhow::{Context, Result};
2use ignore::WalkBuilder;
3use lazy_static::lazy_static;
4use regex::Regex;
5use serde::{Deserialize, Serialize};
6use std::collections::HashMap;
7use std::fs;
8use std::path::{Path, PathBuf};
9use std::sync::{Arc, Mutex};
10use tree_sitter::{Language, Node, Parser, Query};
11
// Helper struct to reduce function argument count.
//
// Bundles everything `create_ast_node` needs to build one `CodeAST` child
// from a single regex match on a single source line.
struct AstNodeParams<'a> {
    // Language label copied into the resulting node's `language` field.
    language: &'a str,
    // Construct label (e.g. "struct", "function") copied into the node's `kind`.
    kind: &'a str,
    // Index of the matched line; used as both `start_row` and `end_row`.
    line_num: usize,
    // Full text of the matched line; a shortened preview becomes the node's `content`.
    line: &'a str,
    // Text of the captured identifier; becomes the node's `name`.
    capture: &'a str,
    // Start offset of the captured identifier within `line`; becomes `start_column`.
    match_start: usize,
    // End offset of the captured identifier within `line`; becomes `end_column`.
    match_end: usize,
}
22
/// A representation of code structure that will be sent to the LLM
///
/// Nodes form a tree: the parser in this file builds one `"file"` node per
/// source file and attaches the discovered constructs as `children`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeAST {
    /// Path of the source file. The parser fills this in on file-level nodes
    /// and leaves it empty on the child nodes it creates.
    pub path: String,
    /// Name of the language the node was parsed as (e.g. `"rust"`).
    pub language: String,
    /// Kind of construct this node describes (e.g. `"file"`, `"struct"`,
    /// or a tree-sitter node kind).
    pub kind: String,
    /// Name associated with the node, when one could be determined.
    pub name: Option<String>,
    /// Location of the node within the source file.
    pub range: Range,
    /// Nested nodes belonging to this node.
    pub children: Vec<CodeAST>,
    /// Optional short text attached to the node, such as a one-line preview.
    pub content: Option<String>,
}
34
/// Source span of a [`CodeAST`] node, expressed as start/end row and column.
///
/// The parser in this file fills rows from line indices or tree-sitter
/// positions, and frequently leaves the columns at `0` to keep output small.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Range {
    /// Row on which the node starts.
    pub start_row: usize,
    /// Column on which the node starts.
    pub start_column: usize,
    /// Row on which the node ends.
    pub end_row: usize,
    /// Column on which the node ends.
    pub end_column: usize,
}
42
43lazy_static! {
44    // Language queries for extracting important code structures
45    static ref RUST_QUERY: &'static str = r#"
46        (struct_item name: (identifier) @struct.name) @struct.def
47        (enum_item name: (identifier) @enum.name) @enum.def
48        (trait_item name: (identifier) @trait.name) @trait.def
49        (impl_item type: (type_identifier) @impl.type) @impl.def
50        (function_item name: (identifier) @function.name) @function.def
51        (mod_item name: (identifier) @module.name) @module.def
52    "#;
53
54    static ref JAVASCRIPT_QUERY: &'static str = r#"
55        (class_declaration name: (identifier) @class.name) @class.def
56        (function_declaration name: (identifier) @function.name) @function.def
57        (method_definition name: (property_identifier) @method.name) @method.def
58        (lexical_declaration 
59            (variable_declarator 
60                name: (identifier) @const.name 
61                value: (arrow_function) @const.value)) @const.def
62    "#;
63
64    static ref PYTHON_QUERY: &'static str = r#"
65        (class_definition name: (identifier) @class.name) @class.def
66        (function_definition name: (identifier) @function.name) @function.def
67    "#;
68
69    static ref GO_QUERY: &'static str = r#"
70        (type_declaration (type_spec name: (type_identifier) @type.name)) @type.def
71        (function_declaration name: (identifier) @function.name) @function.def
72        (method_declaration name: (field_identifier) @method.name) @method.def
73        (struct_type) @struct.def
74        (interface_type) @interface.def
75    "#;
76
77    // Simple regex fallbacks
78    static ref RUST_STRUCT_RE: Regex = Regex::new(r"struct\s+([A-Za-z0-9_]+)").unwrap();
79    static ref RUST_ENUM_RE: Regex = Regex::new(r"enum\s+([A-Za-z0-9_]+)").unwrap();
80    static ref RUST_IMPL_RE: Regex = Regex::new(r"impl(?:\s+<[^>]+>)?\s+([A-Za-z0-9_:]+)").unwrap();
81    static ref RUST_FN_RE: Regex = Regex::new(r"fn\s+([A-Za-z0-9_]+)").unwrap();
82    static ref RUST_TRAIT_RE: Regex = Regex::new(r"trait\s+([A-Za-z0-9_]+)").unwrap();
83    static ref RUST_MOD_RE: Regex = Regex::new(r"mod\s+([A-Za-z0-9_]+)").unwrap();
84
85    static ref JS_CLASS_RE: Regex = Regex::new(r"class\s+([A-Za-z0-9_]+)").unwrap();
86    static ref JS_FUNCTION_RE: Regex = Regex::new(r"function\s+([A-Za-z0-9_]+)").unwrap();
87    static ref JS_ARROW_FN_RE: Regex = Regex::new(r"const\s+([A-Za-z0-9_]+)\s*=\s*\([^)]*\)\s*=>").unwrap();
88    static ref JS_INTERFACE_RE: Regex = Regex::new(r"interface\s+([A-Za-z0-9_]+)").unwrap();
89    static ref JS_TYPE_RE: Regex = Regex::new(r"type\s+([A-Za-z0-9_]+)").unwrap();
90
91    static ref PY_CLASS_RE: Regex = Regex::new(r"class\s+([A-Za-z0-9_]+)").unwrap();
92    static ref PY_FUNCTION_RE: Regex = Regex::new(r"def\s+([A-Za-z0-9_]+)").unwrap();
93    static ref PY_ASYNC_FN_RE: Regex = Regex::new(r"async\s+def\s+([A-Za-z0-9_]+)").unwrap();
94
95    static ref GENERIC_BLOCK_RE: Regex = Regex::new(r"^\s*[{}]").unwrap();
96
97    // Cache parsers and languages
98    static ref LANGUAGE_CACHE: Arc<Mutex<HashMap<String, Language>>> = Arc::new(Mutex::new(HashMap::new()));
99    static ref QUERY_CACHE: Arc<Mutex<HashMap<String, String>>> = Arc::new(Mutex::new(HashMap::new()));
100}
101
/// Parser that turns source files into compact [`CodeAST`] summaries,
/// using tree-sitter where a grammar is available and regexes otherwise.
pub struct CodeParser {
    // Supported language name -> file extensions (without the dot) that map to it.
    languages: HashMap<String, Vec<String>>,
    // Tree-sitter parser instance; its language is set per file in `parse_file`.
    parser: Parser,
}
106
107impl CodeParser {
108    pub fn new() -> Result<Self> {
109        let mut languages = HashMap::new();
110
111        // Define supported languages (extensible for future needs)
112        languages.insert("rust".to_string(), vec!["rs".to_string()]);
113        languages.insert(
114            "javascript".to_string(),
115            vec!["js".to_string(), "jsx".to_string()],
116        );
117        languages.insert(
118            "typescript".to_string(),
119            vec!["ts".to_string(), "tsx".to_string()],
120        );
121        languages.insert("python".to_string(), vec!["py".to_string()]);
122        languages.insert("go".to_string(), vec!["go".to_string()]);
123        languages.insert("c".to_string(), vec!["c".to_string(), "h".to_string()]);
124        languages.insert(
125            "cpp".to_string(),
126            vec![
127                "cpp".to_string(),
128                "cc".to_string(),
129                "cxx".to_string(),
130                "hpp".to_string(),
131                "hxx".to_string(),
132            ],
133        );
134        languages.insert("java".to_string(), vec!["java".to_string()]);
135
136        // Initialize parser
137        let parser = Parser::new();
138
139        // Initialize language cache with known languages
140        {
141            let mut cache = LANGUAGE_CACHE.lock().unwrap();
142            if cache.is_empty() {
143                // Load languages with the tree-sitter bindings
144                let rust_lang: Language = tree_sitter_rust::LANGUAGE.into();
145                cache.insert("rust".to_string(), rust_lang);
146
147                let js_lang: Language = tree_sitter_javascript::LANGUAGE.into();
148                cache.insert("javascript".to_string(), js_lang.clone());
149                cache.insert("typescript".to_string(), js_lang); // TypeScript uses JS grammar for basic parsing
150
151                let py_lang: Language = tree_sitter_python::LANGUAGE.into();
152                cache.insert("python".to_string(), py_lang);
153
154                let c_lang: Language = tree_sitter_c::LANGUAGE.into();
155                cache.insert("c".to_string(), c_lang);
156
157                let cpp_lang: Language = tree_sitter_cpp::LANGUAGE.into();
158                cache.insert("cpp".to_string(), cpp_lang);
159
160                let go_lang: Language = tree_sitter_go::LANGUAGE.into();
161                cache.insert("go".to_string(), go_lang);
162
163                let java_lang: Language = tree_sitter_java::LANGUAGE.into();
164                cache.insert("java".to_string(), java_lang);
165            }
166        }
167
168        // Initialize query cache with known language queries
169        {
170            let mut cache = QUERY_CACHE.lock().unwrap();
171            if cache.is_empty() {
172                cache.insert("rust".to_string(), RUST_QUERY.to_string());
173                cache.insert("javascript".to_string(), JAVASCRIPT_QUERY.to_string());
174                cache.insert("typescript".to_string(), JAVASCRIPT_QUERY.to_string());
175                cache.insert("python".to_string(), PYTHON_QUERY.to_string());
176                cache.insert("go".to_string(), GO_QUERY.to_string());
177            }
178        }
179
180        Ok(Self { languages, parser })
181    }
182
183    /// Try to get tree-sitter language for parsing
184    fn get_language(&self, language_name: &str) -> Option<Language> {
185        let cache = LANGUAGE_CACHE.lock().unwrap();
186        cache.get(language_name).cloned()
187    }
188
189    /// Try to get query for a language
190    fn get_query(&self, language_name: &str) -> Option<Query> {
191        let query_cache = QUERY_CACHE.lock().unwrap();
192        if let Some(query_string) = query_cache.get(language_name) {
193            if let Some(lang) = self.get_language(language_name) {
194                return Query::new(&lang, query_string).ok();
195            }
196        }
197        None
198    }
199
200    /// Determine language from file extension
201    pub fn detect_language(&self, path: &Path) -> Option<String> {
202        let extension = path.extension()?.to_str()?.to_lowercase();
203
204        // Special handling for TypeScript/JavaScript
205        if extension == "ts" || extension == "tsx" {
206            return Some("typescript".to_string());
207        } else if extension == "js" || extension == "jsx" {
208            return Some("javascript".to_string());
209        }
210
211        // General language detection
212        for (lang, extensions) in &self.languages {
213            if extensions.iter().any(|ext| ext == &extension) {
214                return Some(lang.clone());
215            }
216        }
217
218        None
219    }
220
221    /// Parse a single file using tree-sitter and generate AST with size optimizations
222    pub fn parse_file(&mut self, path: &Path) -> Result<CodeAST> {
223        // Detect language
224        let language_name = self
225            .detect_language(path)
226            .context(format!("Could not detect language for file: {:?}", path))?;
227
228        // Read file content - limit file size for very large files
229        let metadata = fs::metadata(path)?;
230
231        // Skip files larger than 1MB to avoid processing too much data
232        if metadata.len() > 1_000_000 {
233            return Ok(CodeAST {
234                path: path.to_string_lossy().to_string(),
235                language: language_name.to_string(),
236                kind: "file".to_string(),
237                name: path
238                    .file_name()
239                    .and_then(|n| n.to_str())
240                    .map(|s| s.to_string()),
241                range: Range {
242                    start_row: 0,
243                    start_column: 0,
244                    end_row: 0,
245                    end_column: 0,
246                },
247                children: vec![CodeAST {
248                    path: String::new(),
249                    language: language_name.to_string(),
250                    kind: "large_file".to_string(),
251                    name: Some("File too large for AST generation".to_string()),
252                    range: Range {
253                        start_row: 0,
254                        start_column: 0,
255                        end_row: 0,
256                        end_column: 0,
257                    },
258                    children: Vec::new(),
259                    content: Some(format!(
260                        "File size: {} bytes - too large for detailed parsing",
261                        metadata.len()
262                    )),
263                }],
264                content: None,
265            });
266        }
267
268        // Read file content
269        let source_code = fs::read_to_string(path)?;
270
271        // Create the base AST node for the file
272        let mut ast = CodeAST {
273            path: path.to_string_lossy().to_string(),
274            language: language_name.to_string(),
275            kind: "file".to_string(),
276            name: path
277                .file_name()
278                .and_then(|n| n.to_str())
279                .map(|s| s.to_string()),
280            range: Range {
281                start_row: 0,
282                start_column: 0,
283                end_row: source_code.lines().count(),
284                end_column: 0,
285            },
286            children: Vec::new(),
287            content: None,
288        };
289
290        // Try to use tree-sitter for parsing
291        if let Some(language) = self.get_language(&language_name) {
292            // Configure parser
293            self.parser.set_language(&language)?;
294
295            // Parse the source code
296            if let Some(tree) = self.parser.parse(&source_code, None) {
297                // Try to use tree-sitter queries to extract structured information
298                if let Some(_query) = self.get_query(&language_name) {
299                    // Skip tree-sitter query-based parsing for now since we're having compatibility issues
300                    // We'll rely on more basic parsing methods instead
301                    let root_node = tree.root_node();
302                    let root_type = root_node.kind();
303
304                    // Add some basic information about the root node
305                    let child_ast = CodeAST {
306                        path: String::new(),
307                        language: language_name.to_string(),
308                        kind: "file_root".to_string(),
309                        name: Some(root_type.to_string()),
310                        range: Range {
311                            start_row: root_node.start_position().row,
312                            start_column: root_node.start_position().column,
313                            end_row: root_node.end_position().row,
314                            end_column: root_node.end_position().column,
315                        },
316                        children: Vec::new(),
317                        content: Some(format!("Root node type: {}", root_type)),
318                    };
319
320                    ast.children.push(child_ast);
321                }
322
323                // If tree-sitter worked and found structures, return the AST
324                if !ast.children.is_empty() {
325                    return Ok(ast);
326                }
327
328                // If tree-sitter query didn't find anything useful, try traversing the syntax tree
329                // Limit to top-level nodes only to reduce size
330                let mut node_children =
331                    self.extract_important_nodes(tree.root_node(), &source_code, &language_name);
332
333                // Limit to 30 children max
334                if node_children.len() > 30 {
335                    node_children.truncate(30);
336                }
337
338                if !node_children.is_empty() {
339                    ast.children = node_children;
340                    return Ok(ast);
341                }
342            }
343        }
344
345        // Fallback to simplified regex-based parsing
346        // We're optimizing for conciseness, so limit capture size
347        self.create_simplified_ast(path, &language_name, &source_code)
348    }
349
350    /// Extract important nodes from a tree-sitter syntax tree using generic traversal
351    fn extract_important_nodes(
352        &self,
353        node: Node<'_>,
354        source: &str,
355        language: &str,
356    ) -> Vec<CodeAST> {
357        let mut result = Vec::new();
358        let important_node_types = match language {
359            "rust" => &[
360                "struct_item",
361                "enum_item",
362                "impl_item",
363                "function_item",
364                "trait_item",
365                "mod_item",
366                "macro_definition",
367            ],
368            "javascript" | "typescript" => &[
369                "class_declaration",
370                "function_declaration",
371                "method_definition",
372                "lexical_declaration",
373                "interface_declaration",
374                "export_statement",
375                "variable_declaration", // Add an extra item to match the size of the rust array
376            ],
377            "python" => &[
378                "class_definition",
379                "function_definition",
380                "decorated_definition",
381                "import_statement",
382                "assignment",
383                "expression_statement",
384                "return_statement", // Added entries to match array size
385            ],
386            "go" => &[
387                "function_declaration",
388                "method_declaration",
389                "type_declaration",
390                "struct_type",
391                "interface_type",
392                "package_clause",
393                "import_declaration", // Added to match array size
394            ],
395            "c" | "cpp" => &[
396                "function_definition",
397                "class_specifier",
398                "struct_specifier",
399                "enum_specifier",
400                "namespace_definition",
401                "template_declaration",
402                "declaration", // Added to match
403            ],
404            "java" => &[
405                "class_declaration",
406                "method_declaration",
407                "interface_declaration",
408                "constructor_declaration",
409                "field_declaration",
410                "import_declaration",
411                "package_declaration", // Added to match
412            ],
413            _ => &[
414                "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", "unknown",
415            ], // Dummy values to match size
416        };
417
418        // Check if this node is important
419        if important_node_types.contains(&node.kind()) {
420            self.process_important_node(node, source, language, &mut result);
421        }
422
423        // Recursively process child nodes
424        let mut cursor = node.walk();
425        for child in node.children(&mut cursor) {
426            // Skip tokens and trivial nodes
427            if child.child_count() > 0 && !child.is_named() {
428                let child_results = self.extract_important_nodes(child, source, language);
429                result.extend(child_results);
430            }
431        }
432
433        result
434    }
435
436    /// Process an individual node that has been identified as important - optimized for size
437    fn process_important_node(
438        &self,
439        node: Node<'_>,
440        source: &str,
441        language: &str,
442        result: &mut Vec<CodeAST>,
443    ) {
444        // Try to find a name for this node
445        let mut name = None;
446        let mut cursor = node.walk();
447
448        // Look for identifier nodes that might contain the name
449        for child in node.children(&mut cursor) {
450            if child.kind() == "identifier"
451                || child.kind() == "type_identifier"
452                || child.kind() == "field_identifier"
453                || child.kind() == "property_identifier"
454            {
455                if let Ok(text) = child.utf8_text(source.as_bytes()) {
456                    name = Some(text.to_string());
457                    break;
458                }
459            }
460        }
461
462        // Extract just the first line of content as a preview
463        let content = node
464            .utf8_text(source.as_bytes())
465            .ok()
466            .and_then(|s| s.lines().next())
467            .map(|first_line| {
468                // Limit content length
469                if first_line.len() > 100 {
470                    format!("{}...", &first_line[..100])
471                } else {
472                    first_line.to_string()
473                }
474            });
475
476        // Create a minimal AST node for this important node
477        let ast_node = CodeAST {
478            path: String::new(),
479            language: language.to_string(),
480            kind: node.kind().to_string(),
481            name,
482            range: Range {
483                start_row: node.start_position().row,
484                start_column: 0, // Skip column info to save space
485                end_row: node.end_position().row,
486                end_column: 0, // Skip column info to save space
487            },
488            children: Vec::new(),
489            content,
490        };
491
492        result.push(ast_node);
493    }
494
495    /// Fallback method: Create a simplified AST using regex - optimized for size
496    pub fn create_simplified_ast(
497        &self,
498        path: &Path,
499        language: &str,
500        source_code: &str,
501    ) -> Result<CodeAST> {
502        // Limit input size for regex processing
503        let limited_source = if source_code.len() > 50_000 {
504            // Only process first ~50KB to avoid regex performance issues
505            let truncated: String = source_code.chars().take(50_000).collect();
506            truncated
507        } else {
508            source_code.to_string()
509        };
510
511        let lines: Vec<&str> = limited_source.lines().collect();
512
513        // Create basic AST structure
514        let mut ast = CodeAST {
515            path: path.to_string_lossy().to_string(),
516            language: language.to_string(),
517            kind: "file".to_string(),
518            name: path
519                .file_name()
520                .and_then(|n| n.to_str())
521                .map(|s| s.to_string()),
522            range: Range {
523                start_row: 0,
524                start_column: 0,
525                end_row: lines.len(),
526                end_column: 0, // Skip end column to save space
527            },
528            children: Vec::new(),
529            content: None,
530        };
531
532        // Extract top-level structures based on language, limit to most relevant ones
533        let mut children = match language {
534            "rust" => self.extract_rust_constructs(&limited_source),
535            "javascript" | "typescript" => self.extract_js_constructs(&limited_source),
536            "python" => self.extract_python_constructs(&limited_source),
537            _ => self.extract_generic_constructs(&limited_source),
538        };
539
540        // Limit number of children to reduce overall size
541        if children.len() > 30 {
542            children.truncate(30);
543        }
544
545        ast.children = children;
546
547        Ok(ast)
548    }
549
550    // Helper to create a minimal AST node from a regex match
551    fn create_ast_node(&self, params: AstNodeParams) -> CodeAST {
552        CodeAST {
553            path: String::new(), // Not relevant for child nodes
554            language: params.language.to_string(),
555            kind: params.kind.to_string(),
556            name: Some(params.capture.to_string()),
557            range: Range {
558                start_row: params.line_num,
559                start_column: params.match_start, // Use match column info
560                end_row: params.line_num,
561                end_column: params.match_end, // Use match column info
562            },
563            children: Vec::new(),
564            // Only include a short preview of the line
565            content: if params.line.len() > 100 {
566                Some(format!("{}...", &params.line[..100]))
567            } else {
568                Some(params.line.to_string())
569            },
570        }
571    }
572
573    // Extract Rust constructs using regex (fallback method)
574    fn extract_rust_constructs(&self, source: &str) -> Vec<CodeAST> {
575        let mut constructs = Vec::new();
576        let lines: Vec<&str> = source.lines().collect();
577
578        // Process each line to find Rust constructs
579        for (line_num, line) in lines.iter().enumerate() {
580            // Check for structs
581            if let Some(captures) = RUST_STRUCT_RE.captures(line) {
582                if let Some(name_match) = captures.get(1) {
583                    constructs.push(self.create_ast_node(AstNodeParams {
584                        language: "rust",
585                        kind: "struct",
586                        line_num,
587                        line,
588                        capture: name_match.as_str(),
589                        match_start: name_match.start(),
590                        match_end: name_match.end(),
591                    }));
592                }
593            }
594
595            // Check for enums
596            if let Some(captures) = RUST_ENUM_RE.captures(line) {
597                if let Some(name_match) = captures.get(1) {
598                    constructs.push(self.create_ast_node(AstNodeParams {
599                        language: "rust",
600                        kind: "enum",
601                        line_num,
602                        line,
603                        capture: name_match.as_str(),
604                        match_start: name_match.start(),
605                        match_end: name_match.end(),
606                    }));
607                }
608            }
609
610            // Check for impls
611            if let Some(captures) = RUST_IMPL_RE.captures(line) {
612                if let Some(name_match) = captures.get(1) {
613                    constructs.push(self.create_ast_node(AstNodeParams {
614                        language: "rust",
615                        kind: "impl",
616                        line_num,
617                        line,
618                        capture: name_match.as_str(),
619                        match_start: name_match.start(),
620                        match_end: name_match.end(),
621                    }));
622                }
623            }
624
625            // Check for functions
626            if let Some(captures) = RUST_FN_RE.captures(line) {
627                if let Some(name_match) = captures.get(1) {
628                    constructs.push(self.create_ast_node(AstNodeParams {
629                        language: "rust",
630                        kind: "function",
631                        line_num,
632                        line,
633                        capture: name_match.as_str(),
634                        match_start: name_match.start(),
635                        match_end: name_match.end(),
636                    }));
637                }
638            }
639
640            // Check for traits
641            if let Some(captures) = RUST_TRAIT_RE.captures(line) {
642                if let Some(name_match) = captures.get(1) {
643                    constructs.push(self.create_ast_node(AstNodeParams {
644                        language: "rust",
645                        kind: "trait",
646                        line_num,
647                        line,
648                        capture: name_match.as_str(),
649                        match_start: name_match.start(),
650                        match_end: name_match.end(),
651                    }));
652                }
653            }
654
655            // Check for modules
656            if let Some(captures) = RUST_MOD_RE.captures(line) {
657                if let Some(name_match) = captures.get(1) {
658                    constructs.push(self.create_ast_node(AstNodeParams {
659                        language: "rust",
660                        kind: "module",
661                        line_num,
662                        line,
663                        capture: name_match.as_str(),
664                        match_start: name_match.start(),
665                        match_end: name_match.end(),
666                    }));
667                }
668            }
669        }
670
671        constructs
672    }
673
674    // Extract JavaScript/TypeScript constructs using regex (fallback method)
675    fn extract_js_constructs(&self, source: &str) -> Vec<CodeAST> {
676        let mut constructs = Vec::new();
677        let lines: Vec<&str> = source.lines().collect();
678
679        // Process each line to find JS/TS constructs
680        for (line_num, line) in lines.iter().enumerate() {
681            // Check for classes
682            if let Some(captures) = JS_CLASS_RE.captures(line) {
683                if let Some(name_match) = captures.get(1) {
684                    constructs.push(self.create_ast_node(AstNodeParams {
685                        language: "javascript",
686                        kind: "class",
687                        line_num,
688                        line,
689                        capture: name_match.as_str(),
690                        match_start: name_match.start(),
691                        match_end: name_match.end(),
692                    }));
693                }
694            }
695
696            // Check for functions
697            if let Some(captures) = JS_FUNCTION_RE.captures(line) {
698                if let Some(name_match) = captures.get(1) {
699                    constructs.push(self.create_ast_node(AstNodeParams {
700                        language: "javascript",
701                        kind: "function",
702                        line_num,
703                        line,
704                        capture: name_match.as_str(),
705                        match_start: name_match.start(),
706                        match_end: name_match.end(),
707                    }));
708                }
709            }
710
711            // Check for arrow functions
712            if let Some(captures) = JS_ARROW_FN_RE.captures(line) {
713                if let Some(name_match) = captures.get(1) {
714                    constructs.push(self.create_ast_node(AstNodeParams {
715                        language: "javascript",
716                        kind: "arrow_function",
717                        line_num,
718                        line,
719                        capture: name_match.as_str(),
720                        match_start: name_match.start(),
721                        match_end: name_match.end(),
722                    }));
723                }
724            }
725
726            // Check for interfaces (TypeScript)
727            if let Some(captures) = JS_INTERFACE_RE.captures(line) {
728                if let Some(name_match) = captures.get(1) {
729                    constructs.push(self.create_ast_node(AstNodeParams {
730                        language: "javascript",
731                        kind: "interface",
732                        line_num,
733                        line,
734                        capture: name_match.as_str(),
735                        match_start: name_match.start(),
736                        match_end: name_match.end(),
737                    }));
738                }
739            }
740
741            // Check for types (TypeScript)
742            if let Some(captures) = JS_TYPE_RE.captures(line) {
743                if let Some(name_match) = captures.get(1) {
744                    constructs.push(self.create_ast_node(AstNodeParams {
745                        language: "javascript",
746                        kind: "type",
747                        line_num,
748                        line,
749                        capture: name_match.as_str(),
750                        match_start: name_match.start(),
751                        match_end: name_match.end(),
752                    }));
753                }
754            }
755        }
756
757        constructs
758    }
759
760    // Extract Python constructs using regex (fallback method)
761    fn extract_python_constructs(&self, source: &str) -> Vec<CodeAST> {
762        let mut constructs = Vec::new();
763        let lines: Vec<&str> = source.lines().collect();
764
765        // Process each line to find Python constructs
766        for (line_num, line) in lines.iter().enumerate() {
767            // Check for classes
768            if let Some(captures) = PY_CLASS_RE.captures(line) {
769                if let Some(name_match) = captures.get(1) {
770                    constructs.push(self.create_ast_node(AstNodeParams {
771                        language: "python",
772                        kind: "class",
773                        line_num,
774                        line,
775                        capture: name_match.as_str(),
776                        match_start: name_match.start(),
777                        match_end: name_match.end(),
778                    }));
779                }
780            }
781
782            // Check for functions
783            if let Some(captures) = PY_FUNCTION_RE.captures(line) {
784                if let Some(name_match) = captures.get(1) {
785                    constructs.push(self.create_ast_node(AstNodeParams {
786                        language: "python",
787                        kind: "function",
788                        line_num,
789                        line,
790                        capture: name_match.as_str(),
791                        match_start: name_match.start(),
792                        match_end: name_match.end(),
793                    }));
794                }
795            }
796
797            // Check for async functions
798            if let Some(captures) = PY_ASYNC_FN_RE.captures(line) {
799                if let Some(name_match) = captures.get(1) {
800                    constructs.push(self.create_ast_node(AstNodeParams {
801                        language: "python",
802                        kind: "async_function",
803                        line_num,
804                        line,
805                        capture: name_match.as_str(),
806                        match_start: name_match.start(),
807                        match_end: name_match.end(),
808                    }));
809                }
810            }
811        }
812
813        constructs
814    }
815
816    // Extract generic code constructs (fallback method)
817    fn extract_generic_constructs(&self, source: &str) -> Vec<CodeAST> {
818        let mut constructs = Vec::new();
819        let lines: Vec<&str> = source.lines().collect();
820
821        // Process each line to find generic code blocks
822        for (line_num, line) in lines.iter().enumerate() {
823            if GENERIC_BLOCK_RE.is_match(line) {
824                constructs.push(CodeAST {
825                    path: String::new(),
826                    language: "generic".to_string(),
827                    kind: "block".to_string(),
828                    name: None,
829                    range: Range {
830                        start_row: line_num,
831                        start_column: 0,
832                        end_row: line_num,
833                        end_column: line.len(),
834                    },
835                    children: Vec::new(),
836                    content: Some(line.to_string()),
837                });
838            }
839        }
840
841        constructs
842    }
843
844    /// Use search tools to find relevant files for a query, with efficiency optimizations
845    fn find_relevant_files(&self, root_dir: &Path, query: &str) -> Result<Vec<PathBuf>> {
846        use crate::tools::fs::search::SearchTools;
847
848        let mut results = Vec::new();
849
850        // Hard limit on number of files to process
851        let max_files = 25; // Reduced from 50 to limit AST size
852
853        // Filter to respect gitignore patterns using the ignore crate
854        let filter_gitignore = |path: &Path| -> bool {
855            // Create a walker that respects gitignore
856            let walker = WalkBuilder::new(path)
857                .hidden(false) // Include hidden files
858                .git_ignore(true) // Respect gitignore
859                .build();
860
861            // If the walker yields this path, it's not ignored
862            walker.flatten().any(|entry| entry.path() == path)
863        };
864
865        // Start with more targeted approach - look for specific files first
866        // Extract specific file mentions from query (like "check file.rs" or "in models.rs")
867        let file_regex =
868            Regex::new(r"(?:file|in|check|view|read)\s+([a-zA-Z0-9_\-\.]+\.[a-zA-Z0-9]+)").unwrap();
869        let mut specific_files = Vec::new();
870
871        for cap in file_regex.captures_iter(query) {
872            if let Some(file_name) = cap.get(1) {
873                specific_files.push(format!("**/{}", file_name.as_str()));
874            }
875        }
876
877        // If specific files were mentioned, prioritize those
878        if !specific_files.is_empty() {
879            for pattern in &specific_files {
880                if let Ok(matches) = SearchTools::glob_search(pattern) {
881                    for path in matches {
882                        if !results.contains(&path) && filter_gitignore(&path) {
883                            results.push(path);
884                            if results.len() >= max_files {
885                                return Ok(results);
886                            }
887                        }
888                    }
889                }
890            }
891        }
892
893        // If specific terms were extracted, try grepping for them
894        let search_terms = self.extract_search_terms(query);
895        if !search_terms.is_empty() {
896            // Limit to top few most specific terms
897            let top_terms: Vec<String> = search_terms.into_iter().take(3).collect();
898
899            for term in top_terms {
900                if let Ok(grep_matches) = SearchTools::grep_search(&term, None, Some(root_dir)) {
901                    // Take only top matches
902                    for (path, _, _) in grep_matches.into_iter().take(5) {
903                        if !results.contains(&path) && filter_gitignore(&path) {
904                            results.push(path);
905                            if results.len() >= max_files {
906                                return Ok(results);
907                            }
908                        }
909                    }
910                }
911            }
912        }
913
914        // If we still need more files, use patterns based on query content
915        if results.len() < max_files {
916            // Get a smaller set of more targeted patterns
917            let patterns = self.determine_relevant_files(query);
918            let targeted_patterns: Vec<&String> = patterns.iter().take(5).collect();
919
920            for pattern in targeted_patterns {
921                if let Ok(matches) = SearchTools::glob_search(pattern) {
922                    for path in matches.into_iter().take(5) {
923                        if !results.contains(&path) && filter_gitignore(&path) {
924                            results.push(path);
925                            if results.len() >= max_files {
926                                return Ok(results);
927                            }
928                        }
929                    }
930                }
931            }
932        }
933
934        // If still not enough, add a few key project files
935        if results.len() < 5 {
936            let key_project_files = vec![
937                "**/lib.rs",
938                "**/main.rs",
939                "**/mod.rs",
940                "**/Cargo.toml",
941                "**/package.json",
942                "**/README.md",
943            ];
944
945            for pattern in key_project_files {
946                if let Ok(matches) = SearchTools::glob_search(pattern) {
947                    for path in matches {
948                        if !results.contains(&path) && filter_gitignore(&path) {
949                            results.push(path);
950                            if results.len() >= max_files {
951                                return Ok(results);
952                            }
953                        }
954                    }
955                }
956            }
957        }
958
959        // Sort results by modification time to prioritize recently changed files
960        results.sort_by(|a, b| {
961            let a_modified = std::fs::metadata(a).and_then(|m| m.modified()).ok();
962            let b_modified = std::fs::metadata(b).and_then(|m| m.modified()).ok();
963            b_modified.cmp(&a_modified)
964        });
965
966        Ok(results)
967    }
968
969    /// Extract search terms from a query for grep search
970    pub fn extract_search_terms(&self, query: &str) -> Vec<String> {
971        let mut terms = Vec::new();
972
973        // Split query into words and look for potential code identifiers
974        let words: Vec<&str> = query
975            .split_whitespace()
976            .filter(|w| w.len() > 3) // Skip short words
977            .collect();
978
979        for word in words {
980            // Clean up the word to extract potential identifiers
981            let clean_word = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '_');
982
983            // Look for identifiers that match coding convention patterns
984            if clean_word.len() > 3
985                && clean_word.chars().all(|c| c.is_alphanumeric() || c == '_')
986                && !clean_word.chars().all(|c| c.is_numeric())
987            {
988                // Skip common English words and programming keywords
989                let common_words = [
990                    "this",
991                    "that",
992                    "from",
993                    "what",
994                    "when",
995                    "where",
996                    "which",
997                    "find",
998                    "function",
999                    "class",
1000                    "struct",
1001                    "impl",
1002                    "type",
1003                    "interface",
1004                    "const",
1005                    "static",
1006                    "public",
1007                    "private",
1008                    "protected",
1009                    "export",
1010                    "import",
1011                ];
1012
1013                if !common_words.contains(&clean_word.to_lowercase().as_str()) {
1014                    terms.push(clean_word.to_string());
1015                }
1016            }
1017        }
1018
1019        terms
1020    }
1021
1022    /// Parse an entire codebase and generate ASTs for selected files
1023    pub fn parse_codebase(&mut self, root_dir: &Path, query: &str) -> Result<Vec<CodeAST>> {
1024        let mut asts = Vec::new();
1025
1026        // Get files relevant to the query
1027        let relevant_files = self.find_relevant_files(root_dir, query)?;
1028
1029        // Parse each file
1030        for path in relevant_files {
1031            if let Ok(ast) = self.parse_file(&path) {
1032                asts.push(ast);
1033            }
1034        }
1035
1036        Ok(asts)
1037    }
1038
1039    /// Generate a concise AST optimized for LLM consumption, respecting API size limits
1040    pub fn generate_llm_friendly_ast(&mut self, root_dir: &Path, query: &str) -> Result<String> {
1041        // Parse the relevant parts of the codebase
1042        let mut asts = self.parse_codebase(root_dir, query)?;
1043
1044        // If no AST data was generated, return a helpful message
1045        if asts.is_empty() {
1046            return Ok(String::from("No relevant code structures found for the query. Try to be more specific about what code you're looking for."));
1047        }
1048
1049        // Sort ASTs by relevance (assuming more recently modified files are more relevant)
1050        asts.sort_by(|a, b| {
1051            let a_path = Path::new(&a.path);
1052            let b_path = Path::new(&b.path);
1053
1054            let a_modified = std::fs::metadata(a_path).and_then(|m| m.modified()).ok();
1055            let b_modified = std::fs::metadata(b_path).and_then(|m| m.modified()).ok();
1056
1057            b_modified.cmp(&a_modified)
1058        });
1059
1060        // Limit to most relevant files (10 max)
1061        if asts.len() > 10 {
1062            asts.truncate(10);
1063        }
1064
1065        // Limit content size within each AST node
1066        for ast in &mut asts {
1067            // Limit child nodes to most important ones (max 20 per file)
1068            if ast.children.len() > 20 {
1069                ast.children.truncate(20);
1070            }
1071
1072            // Truncate content for each child node
1073            for child in &mut ast.children {
1074                if let Some(content) = &child.content {
1075                    if content.len() > 500 {
1076                        let truncated: String = content.chars().take(500).collect();
1077                        child.content = Some(format!("{}... [truncated]", truncated));
1078                    }
1079                }
1080            }
1081        }
1082
1083        // Generate a summary of the AST data
1084        let mut summary = String::new();
1085        summary.push_str(&format!(
1086            "# Code Structure Analysis for Query: \"{}\"\n\n",
1087            query
1088        ));
1089        summary.push_str(&format!(
1090            "Found {} relevant files (showing {} most relevant). Key structures:\n\n",
1091            asts.len(),
1092            asts.len()
1093        ));
1094
1095        // Add a simple text summary of the most important structures
1096        for ast in &asts {
1097            summary.push_str(&format!("## File: {}\n", ast.path));
1098            summary.push_str(&format!("Language: {}\n\n", ast.language));
1099
1100            for child in &ast.children {
1101                let name = child.name.as_deref().unwrap_or("anonymous");
1102                summary.push_str(&format!(
1103                    "- {} `{}` at line {}\n",
1104                    child.kind,
1105                    name,
1106                    child.range.start_row + 1
1107                ));
1108
1109                // Include a short snippet of the content if available
1110                if let Some(content) = &child.content {
1111                    // Only take first line for brevity
1112                    let first_line = content.lines().next().unwrap_or("");
1113                    if !first_line.is_empty() {
1114                        summary.push_str(&format!(
1115                            "   ```{}\n   {}\n   ```\n",
1116                            ast.language, first_line
1117                        ));
1118                    }
1119                }
1120            }
1121
1122            summary.push('\n');
1123        }
1124
1125        // Create a simplified JSON representation with just the essential information
1126        let simplified_asts: Vec<serde_json::Value> = asts
1127            .iter()
1128            .map(|ast| {
1129                let simplified_children: Vec<serde_json::Value> = ast
1130                    .children
1131                    .iter()
1132                    .map(|child| {
1133                        serde_json::json!({
1134                            "kind": child.kind,
1135                            "name": child.name,
1136                            "line": child.range.start_row + 1
1137                        })
1138                    })
1139                    .collect();
1140
1141                serde_json::json!({
1142                    "path": ast.path,
1143                    "language": ast.language,
1144                    "entities": simplified_children
1145                })
1146            })
1147            .collect();
1148
1149        // Add the simplified JSON representation
1150        summary.push_str("\n## Simplified Code Structure:\n\n```json\n");
1151        let simplified_json = serde_json::to_string_pretty(&simplified_asts)
1152            .context("Failed to serialize simplified AST to JSON")?;
1153        summary.push_str(&simplified_json);
1154        summary.push_str("\n```\n");
1155
1156        // Add full AST data in JSON format for more detailed analysis
1157        summary.push_str("\n## Full AST Data (JSON):\n\n```json\n");
1158        let full_json =
1159            serde_json::to_string_pretty(&asts).context("Failed to serialize full AST to JSON")?;
1160        summary.push_str(&full_json);
1161        summary.push_str("\n```\n");
1162
1163        Ok(summary)
1164    }
1165
1166    /// Determine which files to parse based on user query
1167    pub fn determine_relevant_files(&self, query: &str) -> Vec<String> {
1168        let mut patterns = Vec::new();
1169
1170        // Look for specific file mentions in the query
1171        let file_regex = Regex::new(r#"['"]([^'"]+\.\w+)['"]"#).unwrap();
1172        for cap in file_regex.captures_iter(query) {
1173            if let Some(file_match) = cap.get(1) {
1174                let file_pattern = format!("**/{}", file_match.as_str());
1175                patterns.push(file_pattern);
1176            }
1177        }
1178
1179        // Add language-specific patterns based on query keywords
1180        let query_lower = query.to_lowercase();
1181
1182        // Rust patterns
1183        if query_lower.contains("rust") || query_lower.contains(".rs") {
1184            patterns.push("**/*.rs".to_string());
1185            patterns.push("**/src/**/*.rs".to_string());
1186            patterns.push("**/lib.rs".to_string());
1187            patterns.push("**/main.rs".to_string());
1188        }
1189
1190        // JavaScript patterns
1191        if query_lower.contains("javascript")
1192            || query_lower.contains("js")
1193            || query_lower.contains("node")
1194            || query_lower.contains("react")
1195        {
1196            patterns.push("**/*.js".to_string());
1197            patterns.push("**/*.jsx".to_string());
1198            patterns.push("**/src/**/*.js".to_string());
1199            patterns.push("**/src/**/*.jsx".to_string());
1200        }
1201
1202        // TypeScript patterns
1203        if query_lower.contains("typescript")
1204            || query_lower.contains("ts")
1205            || query_lower.contains("angular")
1206            || query_lower.contains("next")
1207        {
1208            patterns.push("**/*.ts".to_string());
1209            patterns.push("**/*.tsx".to_string());
1210            patterns.push("**/src/**/*.ts".to_string());
1211            patterns.push("**/src/**/*.tsx".to_string());
1212        }
1213
1214        // Python patterns
1215        if query_lower.contains("python")
1216            || query_lower.contains("py")
1217            || query_lower.contains("django")
1218            || query_lower.contains("flask")
1219        {
1220            patterns.push("**/*.py".to_string());
1221            patterns.push("**/src/**/*.py".to_string());
1222        }
1223
1224        // Go patterns
1225        if query_lower.contains("go") || query_lower.contains("golang") {
1226            patterns.push("**/*.go".to_string());
1227            patterns.push("**/src/**/*.go".to_string());
1228        }
1229
1230        // C/C++ patterns
1231        if query_lower.contains("c++")
1232            || query_lower.contains("cpp")
1233            || query_lower.contains(" c ")
1234            || query_lower.contains(".c")
1235        {
1236            patterns.push("**/*.c".to_string());
1237            patterns.push("**/*.h".to_string());
1238            patterns.push("**/*.cpp".to_string());
1239            patterns.push("**/*.hpp".to_string());
1240            patterns.push("**/*.cc".to_string());
1241        }
1242
1243        // Java patterns
1244        if query_lower.contains("java") && !query_lower.contains("javascript") {
1245            patterns.push("**/*.java".to_string());
1246            patterns.push("**/src/**/*.java".to_string());
1247        }
1248
1249        // Add patterns for common code directories if no specific language mentioned
1250        if patterns.is_empty() || !patterns.iter().any(|p| p.starts_with("**/src/")) {
1251            patterns.push("**/src/**/*.rs".to_string());
1252            patterns.push("**/src/**/*.ts".to_string());
1253            patterns.push("**/src/**/*.js".to_string());
1254            patterns.push("**/src/**/*.py".to_string());
1255        }
1256
1257        // Always add the language of the codebase (assuming Rust for oli)
1258        if !patterns.iter().any(|p| p.ends_with(".rs")) {
1259            patterns.push("**/*.rs".to_string());
1260        }
1261
1262        patterns
1263    }
1264}