oli_server/tools/code/
parser.rs

1use anyhow::{Context, Result};
2use ignore::WalkBuilder;
3use lazy_static::lazy_static;
4use rayon::prelude::*;
5use serde::{Deserialize, Serialize};
6use std::collections::{HashMap, HashSet};
7use std::fs;
8use std::path::{Path, PathBuf};
9use std::sync::{Arc, Mutex, RwLock};
10use tree_sitter::{Language, Node, Parser, Query, QueryCursor, StreamingIterator, Tree};
11
12/// A representation of code structure that will be sent to the LLM
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct CodeAST {
15    pub path: String,
16    pub language: String,
17    pub kind: String,
18    pub name: Option<String>,
19    pub range: Range,
20    pub children: Vec<CodeAST>,
21    pub content: Option<String>,
22}
23
24/// Represents a source code location range
25#[derive(Debug, Clone, Serialize, Deserialize)]
26pub struct Range {
27    pub start_row: usize,
28    pub start_column: usize,
29    pub end_row: usize,
30    pub end_column: usize,
31}
32
33// Global query definitions for tree-sitter parsing
34lazy_static! {
35    /// PEG-style query for Rust code structures
36    static ref RUST_QUERY: &'static str = r#"
37        ; Struct declarations
38        (struct_item
39            name: (identifier) @struct.name
40            body: (field_declaration_list)? @struct.body) @struct.def
41
42        ; Enum declarations
43        (enum_item
44            name: (identifier) @enum.name
45            body: (enum_variant_list)? @enum.body) @enum.def
46
47        ; Trait declarations
48        (trait_item
49            name: (identifier) @trait.name
50            body: (declaration_list)? @trait.body) @trait.def
51
52        ; Implementations
53        (impl_item
54            trait: (type_identifier)? @impl.trait
55            type: (type_identifier) @impl.type
56            body: (declaration_list)? @impl.body) @impl.def
57
58        ; Functions
59        (function_item
60            name: (identifier) @function.name
61            parameters: (parameters)? @function.params
62            body: (block)? @function.body) @function.def
63
64        ; Modules
65        (mod_item
66            name: (identifier) @module.name
67            body: (declaration_list)? @module.body) @module.def
68
69        ; Constants and statics
70        (const_item
71            name: (identifier) @const.name
72            type: (_) @const.type
73            value: (_) @const.value) @const.def
74
75        (static_item
76            name: (identifier) @static.name
77            type: (_) @static.type
78            value: (_) @static.value) @static.def
79    "#;
80
81    /// PEG-style query for JavaScript/TypeScript code structures
82    static ref JAVASCRIPT_QUERY: &'static str = r#"
83        ; Classes
84        (class_declaration
85            name: (identifier) @class.name
86            body: (class_body)? @class.body) @class.def
87
88        ; Functions
89        (function_declaration
90            name: (identifier) @function.name
91            parameters: (formal_parameters) @function.params
92            body: (statement_block)? @function.body) @function.def
93
94        ; Methods
95        (method_definition
96            name: (property_identifier) @method.name
97            parameters: (formal_parameters) @method.params
98            body: (statement_block)? @method.body) @method.def
99
100        ; Arrow functions in variable declarations
101        (lexical_declaration
102            (variable_declarator
103                name: (identifier) @const.name
104                value: (arrow_function) @const.value)) @const.def
105
106        ; Object pattern in variable declarations
107        (variable_declaration
108            (variable_declarator
109                name: (identifier) @var.name)) @var.def
110
111        ; Interface declarations (TypeScript)
112        (interface_declaration
113            name: (type_identifier) @interface.name
114            body: (object_type)? @interface.body) @interface.def
115
116        ; Type aliases (TypeScript)
117        (type_alias_declaration
118            name: (type_identifier) @type.name
119            value: (_) @type.value) @type.def
120
121        ; Export declarations
122        (export_statement
123            declaration: (_) @export.declaration) @export.def
124    "#;
125
126    /// PEG-style query for Python code structures
127    static ref PYTHON_QUERY: &'static str = r#"
128        ; Classes
129        (class_definition
130            name: (identifier) @class.name
131            body: (block)? @class.body) @class.def
132
133        ; Functions
134        (function_definition
135            name: (identifier) @function.name
136            parameters: (parameters) @function.params
137            body: (block)? @function.body) @function.def
138
139        ; Decorated definitions
140        (decorated_definition
141            definition: (_) @decorated.definition) @decorated.def
142
143        ; Imports
144        (import_statement
145            name: (dotted_name) @import.name) @import.def
146
147        (import_from_statement
148            module_name: (dotted_name) @import_from.module) @import_from.def
149
150        ; Global variables and constants
151        (assignment
152            left: (identifier) @assignment.name
153            right: (_) @assignment.value) @assignment.def
154
155        ; Class attributes
156        (class_definition
157            body: (block
158                (expression_statement
159                    (assignment
160                        left: (identifier) @class_attr.name)))) @class_attr.def
161    "#;
162
163    /// PEG-style query for Go code structures
164    static ref GO_QUERY: &'static str = r#"
165        ; Type declarations
166        (type_declaration
167            (type_spec
168                name: (type_identifier) @type.name
169                type: (_) @type.value)) @type.def
170
171        ; Function declarations
172        (function_declaration
173            name: (identifier) @function.name
174            parameters: (parameter_list) @function.params
175            result: (_)? @function.result
176            body: (block)? @function.body) @function.def
177
178        ; Method declarations
179        (method_declaration
180            name: (field_identifier) @method.name
181            parameters: (parameter_list) @method.params
182            result: (_)? @method.result
183            body: (block)? @method.body) @method.def
184
185        ; Struct type definitions
186        (type_declaration
187            (type_spec
188                name: (type_identifier) @struct.name
189                type: (struct_type) @struct.body)) @struct.def
190
191        ; Interface type definitions
192        (type_declaration
193            (type_spec
194                name: (type_identifier) @interface.name
195                type: (interface_type) @interface.body)) @interface.def
196
197        ; Package clause
198        (package_clause
199            (package_identifier) @package.name) @package.def
200
201        ; Import declarations
202        (import_declaration
203            (import_spec_list) @import.specs) @import.def
204    "#;
205
206    /// PEG-style query for C/C++ code structures
207    static ref CPP_QUERY: &'static str = r#"
208        ; Function definitions
209        (function_definition
210            declarator: (function_declarator
211                declarator: (identifier) @function.name
212                parameters: (parameter_list) @function.params)
213            body: (compound_statement) @function.body) @function.def
214
215        ; Class specifiers
216        (class_specifier
217            name: (type_identifier) @class.name
218            body: (field_declaration_list) @class.body) @class.def
219
220        ; Struct specifiers
221        (struct_specifier
222            name: (type_identifier) @struct.name
223            body: (field_declaration_list) @struct.body) @struct.def
224
225        ; Enum specifiers
226        (enum_specifier
227            name: (type_identifier) @enum.name
228            body: (enumerator_list) @enum.body) @enum.def
229
230        ; Namespace definitions
231        (namespace_definition
232            name: (identifier) @namespace.name
233            body: (declaration_list) @namespace.body) @namespace.def
234
235        ; Template declarations
236        (template_declaration
237            parameters: (template_parameter_list) @template.params
238            declaration: (_) @template.declaration) @template.def
239
240        ; Variable declarations
241        (declaration
242            declarator: (init_declarator
243                declarator: (identifier) @var.name)) @var.def
244
245        ; Method definitions
246        (function_definition
247            declarator: (function_declarator
248                declarator: (field_identifier) @method.name
249                parameters: (parameter_list) @method.params)
250            body: (compound_statement) @method.body) @method.def
251    "#;
252
253    /// PEG-style query for Java code structures
254    static ref JAVA_QUERY: &'static str = r#"
255        ; Class declarations
256        (class_declaration
257            name: (identifier) @class.name
258            body: (class_body) @class.body) @class.def
259
260        ; Method declarations
261        (method_declaration
262            name: (identifier) @method.name
263            parameters: (formal_parameters) @method.params
264            body: (block)? @method.body) @method.def
265
266        ; Interface declarations
267        (interface_declaration
268            name: (identifier) @interface.name
269            body: (interface_body) @interface.body) @interface.def
270
271        ; Constructor declarations
272        (constructor_declaration
273            name: (identifier) @constructor.name
274            parameters: (formal_parameters) @constructor.params
275            body: (constructor_body) @constructor.body) @constructor.def
276
277        ; Field declarations
278        (field_declaration
279            declarator: (variable_declarator
280                name: (identifier) @field.name)) @field.def
281
282        ; Package declarations
283        (package_declaration
284            name: (scoped_identifier) @package.name) @package.def
285
286        ; Import declarations
287        (import_declaration
288            name: (scoped_identifier) @import.name) @import.def
289
290        ; Annotation declarations
291        (annotation_type_declaration
292            name: (identifier) @annotation.name
293            body: (annotation_type_body) @annotation.body) @annotation.def
294    "#;
295
296    // Cache for parsers and languages
297    static ref LANGUAGE_CACHE: Arc<RwLock<HashMap<String, Language>>> = Arc::new(RwLock::new(HashMap::new()));
298    static ref PARSER_CACHE: Arc<Mutex<HashMap<String, Parser>>> = Arc::new(Mutex::new(HashMap::new()));
299    static ref QUERY_CACHE: Arc<RwLock<HashMap<String, String>>> = Arc::new(RwLock::new(HashMap::new()));
300    static ref TREE_CACHE: Arc<RwLock<HashMap<PathBuf, (Tree, String)>>> = Arc::new(RwLock::new(HashMap::new()));
301}
302
303/// A robust code parser system that analyzes source code and produces
304/// clean, accurate Abstract Syntax Trees (ASTs) optimized for LLM consumption.
305///
306/// # Key capabilities:
307/// - Consistent parsing approach using tree-sitter for reliable, accurate parsing
308/// - Clean, well-documented API for LLM tool use
309/// - Efficient error recovery for handling malformed code
310/// - Structured AST output that LLMs can easily interpret
311/// - Language detection with robust extension mapping
312/// - Declarative query patterns for extracting meaningful code structures
313/// - Efficient caching system for parsers and queries
314pub struct CodeParser {
315    /// Maps language names to file extensions
316    languages: HashMap<String, Vec<String>>,
317    /// Default parser instance for initial parsing
318    parser: Parser,
319    /// Cache size limit for AST trees (in bytes)
320    cache_size_limit: usize,
321    /// Maximum file size to parse in bytes (default: 1MB)
322    max_file_size: usize,
323    /// Maximum number of files to parse in a codebase (default: 25)
324    max_files: usize,
325    /// Maximum recursion depth for nested structures (default: 3)
326    max_depth: usize,
327}
328
329impl CodeParser {
330    /// Creates a new CodeParser instance with initialized language support
331    /// and default configuration.
332    ///
333    /// # Returns
334    /// - `Result<Self>` - A new CodeParser instance or an error
335    pub fn new() -> Result<Self> {
336        Self::with_config(None, None, None, None)
337    }
338
339    /// Creates a new CodeParser instance with custom configuration.
340    ///
341    /// # Arguments
342    /// - `cache_size_limit` - Optional cache size limit in bytes (default: 50MB)
343    /// - `max_file_size` - Optional maximum file size to parse in bytes (default: 1MB)
344    /// - `max_files` - Optional maximum number of files to parse (default: 25)
345    /// - `max_depth` - Optional maximum recursion depth (default: 3)
346    ///
347    /// # Returns
348    /// - `Result<Self>` - A new CodeParser instance or an error
349    pub fn with_config(
350        cache_size_limit: Option<usize>,
351        max_file_size: Option<usize>,
352        max_files: Option<usize>,
353        max_depth: Option<usize>,
354    ) -> Result<Self> {
355        let mut languages = HashMap::new();
356
357        // Define supported languages with their file extensions
358        languages.insert("rust".to_string(), vec!["rs".to_string()]);
359        languages.insert(
360            "javascript".to_string(),
361            vec!["js".to_string(), "jsx".to_string()],
362        );
363        languages.insert(
364            "typescript".to_string(),
365            vec!["ts".to_string(), "tsx".to_string()],
366        );
367        languages.insert("python".to_string(), vec!["py".to_string()]);
368        languages.insert("go".to_string(), vec!["go".to_string()]);
369        languages.insert("c".to_string(), vec!["c".to_string(), "h".to_string()]);
370        languages.insert(
371            "cpp".to_string(),
372            vec![
373                "cpp".to_string(),
374                "cc".to_string(),
375                "cxx".to_string(),
376                "hpp".to_string(),
377                "hxx".to_string(),
378            ],
379        );
380        languages.insert("java".to_string(), vec!["java".to_string()]);
381
382        // Initialize parser
383        let parser = Parser::new();
384
385        // Initialize language cache with known languages
386        {
387            let mut cache = LANGUAGE_CACHE.write().unwrap();
388            if cache.is_empty() {
389                // Load languages with the tree-sitter bindings
390                let rust_lang: Language = tree_sitter_rust::LANGUAGE.into();
391                cache.insert("rust".to_string(), rust_lang);
392
393                let js_lang: Language = tree_sitter_javascript::LANGUAGE.into();
394                cache.insert("javascript".to_string(), js_lang.clone());
395                cache.insert("typescript".to_string(), js_lang); // TypeScript uses JS grammar for basic parsing
396
397                let py_lang: Language = tree_sitter_python::LANGUAGE.into();
398                cache.insert("python".to_string(), py_lang);
399
400                let c_lang: Language = tree_sitter_c::LANGUAGE.into();
401                cache.insert("c".to_string(), c_lang);
402
403                let cpp_lang: Language = tree_sitter_cpp::LANGUAGE.into();
404                cache.insert("cpp".to_string(), cpp_lang);
405
406                let go_lang: Language = tree_sitter_go::LANGUAGE.into();
407                cache.insert("go".to_string(), go_lang);
408
409                let java_lang: Language = tree_sitter_java::LANGUAGE.into();
410                cache.insert("java".to_string(), java_lang);
411            }
412        }
413
414        // Initialize parser cache
415        {
416            let mut cache = PARSER_CACHE.lock().unwrap();
417            if cache.is_empty() {
418                for lang_name in languages.keys() {
419                    let mut new_parser = Parser::new();
420                    if let Some(lang) = LANGUAGE_CACHE.read().unwrap().get(lang_name) {
421                        if new_parser.set_language(lang).is_ok() {
422                            cache.insert(lang_name.clone(), new_parser);
423                        }
424                    }
425                }
426            }
427        }
428
429        // Initialize query cache with known language queries
430        {
431            let mut cache = QUERY_CACHE.write().unwrap();
432            if cache.is_empty() {
433                cache.insert("rust".to_string(), RUST_QUERY.to_string());
434                cache.insert("javascript".to_string(), JAVASCRIPT_QUERY.to_string());
435                cache.insert("typescript".to_string(), JAVASCRIPT_QUERY.to_string());
436                cache.insert("python".to_string(), PYTHON_QUERY.to_string());
437                cache.insert("go".to_string(), GO_QUERY.to_string());
438                cache.insert("c".to_string(), CPP_QUERY.to_string());
439                cache.insert("cpp".to_string(), CPP_QUERY.to_string());
440                cache.insert("java".to_string(), JAVA_QUERY.to_string());
441            }
442        }
443
444        // Set defaults or use provided values
445        let cache_size_limit = cache_size_limit.unwrap_or(50 * 1024 * 1024); // 50MB cache
446        let max_file_size = max_file_size.unwrap_or(1_000_000); // 1MB max file size
447        let max_files = max_files.unwrap_or(25); // Maximum files to parse
448        let max_depth = max_depth.unwrap_or(3); // Maximum recursion depth
449
450        Ok(Self {
451            languages,
452            parser,
453            cache_size_limit,
454            max_file_size,
455            max_files,
456            max_depth,
457        })
458    }
459
460    /// Gets a tree-sitter language for parsing
461    ///
462    /// # Arguments
463    /// - `language_name` - Name of the language to retrieve
464    ///
465    /// # Returns
466    /// - `Option<Language>` - The tree-sitter language if available
467    fn get_language(&self, language_name: &str) -> Option<Language> {
468        LANGUAGE_CACHE.read().unwrap().get(language_name).cloned()
469    }
470
471    // Note: get_parser method removed as it was unused
472
473    /// Gets a tree-sitter query for a language
474    ///
475    /// # Arguments
476    /// - `language_name` - Name of the language to get a query for
477    ///
478    /// # Returns
479    /// - `Option<Query>` - A tree-sitter query if available
480    fn get_query(&self, language_name: &str) -> Option<Result<Query>> {
481        let query_cache = QUERY_CACHE.read().unwrap();
482        let query_string = query_cache.get(language_name)?;
483
484        if let Some(lang) = self.get_language(language_name) {
485            match Query::new(&lang, query_string) {
486                Ok(query) => Some(Ok(query)),
487                Err(e) => Some(Err(anyhow::anyhow!("Failed to create query: {:?}", e))),
488            }
489        } else {
490            None
491        }
492    }
493
494    /// Determines the programming language from a file extension
495    ///
496    /// # Arguments
497    /// - `path` - Path to the file
498    ///
499    /// # Returns
500    /// - `Option<String>` - Language name if detected
501    pub fn detect_language(&self, path: &Path) -> Option<String> {
502        let extension = path.extension()?.to_str()?.to_lowercase();
503
504        // Special handling for TypeScript/JavaScript
505        if extension == "ts" || extension == "tsx" {
506            return Some("typescript".to_string());
507        } else if extension == "js" || extension == "jsx" {
508            return Some("javascript".to_string());
509        }
510
511        // General language detection
512        for (lang, extensions) in &self.languages {
513            if extensions.iter().any(|ext| ext == &extension) {
514                return Some(lang.clone());
515            }
516        }
517
518        None
519    }
520
521    /// Parses a single file using tree-sitter and generates an AST
522    ///
523    /// # Arguments
524    /// - `path` - Path to the file to parse
525    ///
526    /// # Returns
527    /// - `Result<CodeAST>` - The abstract syntax tree or an error
528    pub fn parse_file(&mut self, path: &Path) -> Result<CodeAST> {
529        // Detect language
530        let language_name = self
531            .detect_language(path)
532            .context(format!("Could not detect language for file: {:?}", path))?;
533
534        // Read file content - limit file size for very large files
535        let metadata = fs::metadata(path)?;
536
537        // Skip files larger than the max file size to avoid processing too much data
538        if metadata.len() > self.max_file_size as u64 {
539            return Ok(CodeAST {
540                path: path.to_string_lossy().to_string(),
541                language: language_name.to_string(),
542                kind: "file".to_string(),
543                name: path
544                    .file_name()
545                    .and_then(|n| n.to_str())
546                    .map(|s| s.to_string()),
547                range: Range {
548                    start_row: 0,
549                    start_column: 0,
550                    end_row: 0,
551                    end_column: 0,
552                },
553                children: vec![CodeAST {
554                    path: String::new(),
555                    language: language_name.to_string(),
556                    kind: "large_file".to_string(),
557                    name: Some("File too large for AST generation".to_string()),
558                    range: Range {
559                        start_row: 0,
560                        start_column: 0,
561                        end_row: 0,
562                        end_column: 0,
563                    },
564                    children: Vec::new(),
565                    content: Some(format!(
566                        "File size: {} bytes - too large for detailed parsing",
567                        metadata.len()
568                    )),
569                }],
570                content: None,
571            });
572        }
573
574        // Read file content
575        let source_code = fs::read_to_string(path)?;
576
577        // Create the base AST node for the file
578        let mut ast = CodeAST {
579            path: path.to_string_lossy().to_string(),
580            language: language_name.to_string(),
581            kind: "file".to_string(),
582            name: path
583                .file_name()
584                .and_then(|n| n.to_str())
585                .map(|s| s.to_string()),
586            range: Range {
587                start_row: 0,
588                start_column: 0,
589                end_row: source_code.lines().count(),
590                end_column: 0,
591            },
592            children: Vec::new(),
593            content: None,
594        };
595
596        // Try to use tree-sitter for parsing
597        if let Some(language) = self.get_language(&language_name) {
598            // Check tree cache first
599            let path_buf = path.to_path_buf();
600            let tree_option = {
601                let cache = TREE_CACHE.read().unwrap();
602                if let Some((tree, content)) = cache.get(&path_buf) {
603                    if content == &source_code {
604                        Some(tree.clone())
605                    } else {
606                        None
607                    }
608                } else {
609                    None
610                }
611            };
612
613            // If tree is not in cache, parse it
614            let tree = if let Some(cached_tree) = tree_option {
615                cached_tree
616            } else {
617                // Configure parser
618                self.parser.set_language(&language)?;
619
620                // Parse with error recovery
621                let tree = self
622                    .parser
623                    .parse(&source_code, None)
624                    .context("Failed to parse source code")?;
625
626                // Store in cache
627                {
628                    let mut cache = TREE_CACHE.write().unwrap();
629
630                    // Check if we need to evict some entries to stay within the cache size limit
631                    let current_size: usize =
632                        cache.iter().map(|(_, (_, content))| content.len()).sum();
633
634                    if current_size + source_code.len() > self.cache_size_limit {
635                        // Simple LRU eviction: remove oldest entries first
636                        let mut keys_to_remove = Vec::new();
637                        let mut entries: Vec<_> = cache.iter().collect();
638                        entries.sort_by_key(|(_, (_, content))| content.len());
639
640                        let mut freed_size = 0;
641                        let needed_size = source_code.len();
642
643                        for (path, (_, content)) in entries {
644                            if current_size + needed_size - freed_size <= self.cache_size_limit {
645                                break;
646                            }
647
648                            freed_size += content.len();
649                            keys_to_remove.push(path.clone());
650                        }
651
652                        // Remove entries after we're done iterating
653                        for path in keys_to_remove {
654                            cache.remove(&path);
655                        }
656                    }
657
658                    cache.insert(path_buf.clone(), (tree.clone(), source_code.clone()));
659                }
660
661                tree
662            };
663
664            // Try to use tree-sitter queries to extract structured information
665            if let Some(Ok(query)) = self.get_query(&language_name) {
666                // Use tree-sitter query to extract structured information
667                let root_node = tree.root_node();
668                let mut query_cursor = QueryCursor::new();
669
670                // Extract nodes based on the query
671                let mut matches = query_cursor.matches(&query, root_node, source_code.as_bytes());
672
673                // Process matches to extract AST nodes
674                while let Some(match_item) = matches.next() {
675                    let mut node_data: HashMap<String, (Node, String)> = HashMap::new();
676
677                    // Extract each capture data
678                    for capture in match_item.captures {
679                        // Get the capture name
680                        let capture_name = &query.capture_names()[capture.index as usize];
681                        let node_text = capture
682                            .node
683                            .utf8_text(source_code.as_bytes())
684                            .unwrap_or("<unknown>");
685
686                        node_data.insert(
687                            capture_name.to_string(),
688                            (capture.node, node_text.to_string()),
689                        );
690                    }
691
692                    // Find definition nodes (with .def suffix)
693                    let def_entries: Vec<_> = node_data
694                        .iter()
695                        .filter(|(name, _)| name.ends_with(".def"))
696                        .collect();
697
698                    if !def_entries.is_empty() {
699                        let (def_name, (def_node, _)) = def_entries[0];
700                        let def_type = def_name.split('.').next().unwrap_or("unknown");
701                        let start_pos = def_node.start_position();
702                        let end_pos = def_node.end_position();
703
704                        // Find name nodes (with .name suffix)
705                        let name_entries: Vec<_> = node_data
706                            .iter()
707                            .filter(|(name, _)| name.ends_with(".name"))
708                            .collect();
709
710                        let name = if !name_entries.is_empty() {
711                            let (_, (_, name_text)) = name_entries[0];
712                            Some(name_text.clone())
713                        } else {
714                            None
715                        };
716
717                        // Extract body content if available
718                        let body_entries: Vec<_> = node_data
719                            .iter()
720                            .filter(|(name, _)| name.ends_with(".body"))
721                            .collect();
722
723                        let content = if !body_entries.is_empty() {
724                            let (_, (body_node, _)) = body_entries[0];
725                            body_node
726                                .utf8_text(source_code.as_bytes())
727                                .ok()
728                                .map(|s| s.to_string())
729                        } else {
730                            def_node
731                                .utf8_text(source_code.as_bytes())
732                                .ok()
733                                .map(|s| s.to_string())
734                        };
735
736                        // Create child AST node
737                        let mut child_ast = CodeAST {
738                            path: String::new(),
739                            language: language_name.to_string(),
740                            kind: def_type.to_string(),
741                            name,
742                            range: Range {
743                                start_row: start_pos.row,
744                                start_column: start_pos.column,
745                                end_row: end_pos.row,
746                                end_column: end_pos.column,
747                            },
748                            children: Vec::new(),
749                            content: content.map(|s| {
750                                // Truncate content if it's too large
751                                if s.len() > 1000 {
752                                    format!("{}...", &s[..1000])
753                                } else {
754                                    s
755                                }
756                            }),
757                        };
758
759                        // Extract nested structures (for hierarchical AST)
760                        self.extract_nested_structures(
761                            &source_code,
762                            *def_node,
763                            &mut child_ast,
764                            &language_name,
765                            self.max_depth, // Use configured recursion depth
766                        );
767
768                        ast.children.push(child_ast);
769                    }
770                }
771
772                // If tree-sitter query found structures, return the AST
773                if !ast.children.is_empty() {
774                    return Ok(ast);
775                }
776            }
777
778            // If structured query didn't work, fallback to node traversal
779            let tree_node = tree.root_node();
780            let node_children =
781                self.extract_important_nodes(tree_node, &source_code, &language_name);
782
783            if !node_children.is_empty() {
784                ast.children = node_children;
785                return Ok(ast);
786            }
787        }
788
789        // If tree-sitter couldn't produce useful results, use simplified extraction
790        self.create_simplified_ast(path, &language_name, &source_code)
791    }
792
793    /// Extracts nested structures from a node to build a hierarchical AST
794    ///
795    /// # Arguments
796    /// - `source_code` - Source code of the file
797    /// - `node` - Current node to process
798    /// - `parent_ast` - Parent AST node to add children to
799    /// - `language` - Language of the source code
800    /// - `depth` - Recursion depth limit
801    fn extract_nested_structures(
802        &self,
803        source_code: &str,
804        node: Node,
805        parent_ast: &mut CodeAST,
806        language: &str,
807        depth: usize,
808    ) {
809        if depth == 0 {
810            return;
811        }
812
813        // Skip if the node is too small
814        if node.end_byte() - node.start_byte() < 10 {
815            return;
816        }
817
818        let mut cursor = node.walk();
819
820        // Get nested defined structures based on language
821        let important_node_types = Self::get_important_node_types(language);
822
823        // Process child nodes
824        for child in node.children(&mut cursor) {
825            let kind = child.kind();
826
827            // Skip insignificant nodes
828            if kind == "(" || kind == ")" || kind == "{" || kind == "}" || kind == ";" {
829                continue;
830            }
831
832            // Process important child nodes
833            if important_node_types.contains(&kind) {
834                let start_pos = child.start_position();
835                let end_pos = child.end_position();
836
837                // Try to find a name for this node
838                let name = self.extract_node_name(&child, source_code);
839
840                // Get content truncated for brevity
841                let content = child.utf8_text(source_code.as_bytes()).ok().map(|s| {
842                    // Truncate content if it's too large
843                    if s.len() > 500 {
844                        format!("{}...", &s[..500])
845                    } else {
846                        s.to_string()
847                    }
848                });
849
850                // Create child AST node
851                let mut child_ast = CodeAST {
852                    path: String::new(),
853                    language: language.to_string(),
854                    kind: kind.to_string(),
855                    name,
856                    range: Range {
857                        start_row: start_pos.row,
858                        start_column: start_pos.column,
859                        end_row: end_pos.row,
860                        end_column: end_pos.column,
861                    },
862                    children: Vec::new(),
863                    content,
864                };
865
866                // Recursively extract nested structures
867                self.extract_nested_structures(
868                    source_code,
869                    child,
870                    &mut child_ast,
871                    language,
872                    depth - 1,
873                );
874
875                parent_ast.children.push(child_ast);
876            }
877        }
878    }
879
880    /// Extracts a name from a node based on common patterns
881    ///
882    /// # Arguments
883    /// - `node` - Node to extract name from
884    /// - `source` - Source code
885    ///
886    /// # Returns
887    /// - `Option<String>` - Extracted name if found
888    fn extract_node_name(&self, node: &Node, source: &str) -> Option<String> {
889        let mut cursor = node.walk();
890
891        // Look for identifier nodes that might contain the name
892        for child in node.children(&mut cursor) {
893            if child.kind() == "identifier"
894                || child.kind() == "type_identifier"
895                || child.kind() == "field_identifier"
896                || child.kind() == "property_identifier"
897            {
898                if let Ok(text) = child.utf8_text(source.as_bytes()) {
899                    return Some(text.to_string());
900                }
901            }
902        }
903
904        None
905    }
906
907    /// Gets a list of important node types for a given language
908    ///
909    /// # Arguments
910    /// - `language` - Language to get node types for
911    ///
912    /// # Returns
913    /// - `&[&str]` - Array of important node type names
914    fn get_important_node_types(language: &str) -> &'static [&'static str] {
915        match language {
916            "rust" => &[
917                "struct_item",
918                "enum_item",
919                "impl_item",
920                "function_item",
921                "trait_item",
922                "mod_item",
923                "macro_definition",
924                "const_item",
925                "static_item",
926            ],
927            "javascript" | "typescript" => &[
928                "class_declaration",
929                "function_declaration",
930                "method_definition",
931                "lexical_declaration",
932                "interface_declaration",
933                "type_alias_declaration",
934                "export_statement",
935                "variable_declaration",
936            ],
937            "python" => &[
938                "class_definition",
939                "function_definition",
940                "decorated_definition",
941                "import_statement",
942                "import_from_statement",
943                "assignment",
944            ],
945            "go" => &[
946                "function_declaration",
947                "method_declaration",
948                "type_declaration",
949                "struct_type",
950                "interface_type",
951                "package_clause",
952                "import_declaration",
953            ],
954            "c" | "cpp" => &[
955                "function_definition",
956                "class_specifier",
957                "struct_specifier",
958                "enum_specifier",
959                "namespace_definition",
960                "template_declaration",
961                "declaration",
962            ],
963            "java" => &[
964                "class_declaration",
965                "method_declaration",
966                "interface_declaration",
967                "constructor_declaration",
968                "field_declaration",
969                "package_declaration",
970                "import_declaration",
971                "annotation_type_declaration",
972            ],
973            _ => &[],
974        }
975    }
976
977    /// Extract important nodes from a tree-sitter syntax tree using generic traversal
978    ///
979    /// # Arguments
980    /// - `node` - Root node to traverse
981    /// - `source` - Source code text
982    /// - `language` - Language of the source code
983    ///
984    /// # Returns
985    /// - `Vec<CodeAST>` - List of extracted AST nodes
986    fn extract_important_nodes(
987        &self,
988        node: Node<'_>,
989        source: &str,
990        language: &str,
991    ) -> Vec<CodeAST> {
992        let mut result = Vec::new();
993        let important_node_types = Self::get_important_node_types(language);
994
995        // Check if this node is important
996        if important_node_types.contains(&node.kind()) {
997            self.process_important_node(node, source, language, &mut result);
998        }
999
1000        // Recursively process child nodes
1001        let mut cursor = node.walk();
1002        for child in node.children(&mut cursor) {
1003            // Skip tokens and trivial nodes
1004            if child.child_count() > 0 && child.is_named() {
1005                let child_results = self.extract_important_nodes(child, source, language);
1006                result.extend(child_results);
1007            }
1008        }
1009
1010        result
1011    }
1012
1013    /// Process an individual node that has been identified as important
1014    ///
1015    /// # Arguments
1016    /// - `node` - Node to process
1017    /// - `source` - Source code text
1018    /// - `language` - Language of the source code
1019    /// - `result` - Vector to add processed nodes to
1020    fn process_important_node(
1021        &self,
1022        node: Node<'_>,
1023        source: &str,
1024        language: &str,
1025        result: &mut Vec<CodeAST>,
1026    ) {
1027        // Try to find a name for this node
1028        let name = self.extract_node_name(&node, source);
1029
1030        // Extract content (full node text for better context)
1031        let content = node.utf8_text(source.as_bytes()).ok().map(|s| {
1032            // Truncate content if it's too large
1033            if s.len() > 500 {
1034                format!("{}...", &s[..500])
1035            } else {
1036                s.to_string()
1037            }
1038        });
1039
1040        // Create a minimal AST node for this important node
1041        let ast_node = CodeAST {
1042            path: String::new(),
1043            language: language.to_string(),
1044            kind: node.kind().to_string(),
1045            name,
1046            range: Range {
1047                start_row: node.start_position().row,
1048                start_column: node.start_position().column,
1049                end_row: node.end_position().row,
1050                end_column: node.end_position().column,
1051            },
1052            children: Vec::new(),
1053            content,
1054        };
1055
1056        result.push(ast_node);
1057    }
1058
1059    /// Create a simplified AST directly from the source code
1060    /// This is a fallback method when tree-sitter parsing doesn't work
1061    ///
1062    /// # Arguments
1063    /// - `path` - Path to the file
1064    /// - `language` - Language of the source code
1065    /// - `source_code` - Source code text
1066    ///
1067    /// # Returns
1068    /// - `Result<CodeAST>` - Simplified AST or error
1069    pub fn create_simplified_ast(
1070        &self,
1071        path: &Path,
1072        language: &str,
1073        source_code: &str,
1074    ) -> Result<CodeAST> {
1075        // Limit input size for processing
1076        let limited_source = if source_code.len() > 50_000 {
1077            // Only process first ~50KB for efficiency
1078            let truncated: String = source_code.chars().take(50_000).collect();
1079            truncated
1080        } else {
1081            source_code.to_string()
1082        };
1083
1084        let lines: Vec<&str> = limited_source.lines().collect();
1085
1086        // Create basic AST structure
1087        let mut ast = CodeAST {
1088            path: path.to_string_lossy().to_string(),
1089            language: language.to_string(),
1090            kind: "file".to_string(),
1091            name: path
1092                .file_name()
1093                .and_then(|n| n.to_str())
1094                .map(|s| s.to_string()),
1095            range: Range {
1096                start_row: 0,
1097                start_column: 0,
1098                end_row: lines.len(),
1099                end_column: 0,
1100            },
1101            children: Vec::new(),
1102            content: None,
1103        };
1104
1105        // Extract code blocks and declarations with line numbers
1106        for (line_num, line) in lines.iter().enumerate() {
1107            let trimmed = line.trim();
1108
1109            // Skip empty lines and simple statements
1110            if trimmed.is_empty() || (trimmed.len() < 5 && !trimmed.contains('{')) {
1111                continue;
1112            }
1113
1114            // Identify potential code blocks and declarations by common patterns
1115            if trimmed.contains(" fn ")
1116                || trimmed.contains("func ")
1117                || trimmed.contains(" class ")
1118                || trimmed.contains(" struct ")
1119                || trimmed.contains(" trait ")
1120                || trimmed.contains(" impl ")
1121                || trimmed.contains(" interface ")
1122                || trimmed.contains(" def ")
1123                || trimmed.contains(" type ")
1124                || trimmed.starts_with("fn ")
1125                || trimmed.starts_with("class ")
1126                || trimmed.starts_with("struct ")
1127                || trimmed.starts_with("trait ")
1128                || trimmed.starts_with("impl ")
1129                || trimmed.starts_with("interface ")
1130                || trimmed.starts_with("def ")
1131                || trimmed.starts_with("type ")
1132                || trimmed.starts_with("function ")
1133                || trimmed.starts_with("async ")
1134            {
1135                // Determine the kind of node
1136                let kind = if trimmed.contains(" fn ")
1137                    || trimmed.contains("func ")
1138                    || trimmed.starts_with("fn ")
1139                    || trimmed.contains(" def ")
1140                    || trimmed.starts_with("def ")
1141                    || trimmed.starts_with("function ")
1142                    || trimmed.contains("async ")
1143                {
1144                    "function"
1145                } else if trimmed.contains(" class ") || trimmed.starts_with("class ") {
1146                    "class"
1147                } else if trimmed.contains(" struct ") || trimmed.starts_with("struct ") {
1148                    "struct"
1149                } else if trimmed.contains(" trait ") || trimmed.starts_with("trait ") {
1150                    "trait"
1151                } else if trimmed.contains(" impl ") || trimmed.starts_with("impl ") {
1152                    "impl"
1153                } else if trimmed.contains(" interface ") || trimmed.starts_with("interface ") {
1154                    "interface"
1155                } else if trimmed.contains(" type ") || trimmed.starts_with("type ") {
1156                    "type"
1157                } else {
1158                    "block"
1159                };
1160
1161                // Extract a simple name from the line by splitting on spaces and symbols
1162                let words: Vec<&str> = trimmed.split_whitespace().collect();
1163                let mut name = None;
1164
1165                // Try to find a name based on the kind (position after keyword)
1166                if words.len() > 1 {
1167                    let name_word_idx = match kind {
1168                        "function" => {
1169                            if trimmed.contains(" fn ") {
1170                                words.iter().position(|&w| w == "fn").map(|p| p + 1)
1171                            } else if trimmed.contains(" def ") {
1172                                words.iter().position(|&w| w == "def").map(|p| p + 1)
1173                            } else if trimmed.contains("func ") {
1174                                words.iter().position(|&w| w == "func").map(|p| p + 1)
1175                            } else if trimmed.contains(" function ") {
1176                                words.iter().position(|&w| w == "function").map(|p| p + 1)
1177                            } else {
1178                                Some(1) // Assume name is the second word
1179                            }
1180                        }
1181                        "class" => words.iter().position(|&w| w == "class").map(|p| p + 1),
1182                        "struct" => words.iter().position(|&w| w == "struct").map(|p| p + 1),
1183                        "trait" => words.iter().position(|&w| w == "trait").map(|p| p + 1),
1184                        "impl" => words.iter().position(|&w| w == "impl").map(|p| p + 1),
1185                        "interface" => words.iter().position(|&w| w == "interface").map(|p| p + 1),
1186                        "type" => words.iter().position(|&w| w == "type").map(|p| p + 1),
1187                        _ => Some(1),
1188                    };
1189
1190                    if let Some(idx) = name_word_idx {
1191                        if idx < words.len() {
1192                            // Clean up the name (remove trailing colons, brackets, etc.)
1193                            name = Some(
1194                                words[idx]
1195                                    .trim_end_matches(|c| ",:;<>(){}".contains(c))
1196                                    .to_string(),
1197                            );
1198                        }
1199                    }
1200                }
1201
1202                // Create AST node for this code construct
1203                let ast_node = CodeAST {
1204                    path: String::new(),
1205                    language: language.to_string(),
1206                    kind: kind.to_string(),
1207                    name,
1208                    range: Range {
1209                        start_row: line_num,
1210                        start_column: 0,
1211                        end_row: line_num,
1212                        end_column: line.len(),
1213                    },
1214                    children: Vec::new(),
1215                    content: Some(line.to_string()),
1216                };
1217
1218                ast.children.push(ast_node);
1219            }
1220        }
1221
1222        // Limit number of children to reduce overall size
1223        if ast.children.len() > 30 {
1224            ast.children.truncate(30);
1225        }
1226
1227        Ok(ast)
1228    }
1229
1230    /// Use search tools to find relevant files for a query
1231    ///
1232    /// # Arguments
1233    /// - `root_dir` - Root directory to search in
1234    /// - `query` - User query to determine relevant files
1235    ///
1236    /// # Returns
1237    /// - `Result<Vec<PathBuf>>` - List of relevant file paths
1238    fn find_relevant_files(&self, root_dir: &Path, query: &str) -> Result<Vec<PathBuf>> {
1239        use crate::tools::fs::search::SearchTools;
1240
1241        let mut results = Vec::new();
1242
1243        // Use configured limit on number of files to process
1244        let max_files = self.max_files;
1245
1246        // Filter to respect gitignore patterns using the ignore crate
1247        let filter_gitignore = |path: &Path| -> bool {
1248            // Create a walker that respects gitignore
1249            let walker = WalkBuilder::new(path)
1250                .hidden(false) // Include hidden files
1251                .git_ignore(true) // Respect gitignore
1252                .build();
1253
1254            // If the walker yields this path, it's not ignored
1255            walker.flatten().any(|entry| entry.path() == path)
1256        };
1257
1258        // Start with more targeted approach - look for specific files first
1259        // Extract specific file mentions from query (like "check file.rs" or "in models.rs")
1260        let file_regex =
1261            regex::Regex::new(r"(?:file|in|check|view|read)\s+([a-zA-Z0-9_\-\.]+\.[a-zA-Z0-9]+)")
1262                .unwrap();
1263        let mut specific_files = Vec::new();
1264
1265        for cap in file_regex.captures_iter(query) {
1266            if let Some(file_name) = cap.get(1) {
1267                specific_files.push(format!("**/{}", file_name.as_str()));
1268            }
1269        }
1270
1271        // If specific files were mentioned, prioritize those
1272        if !specific_files.is_empty() {
1273            for pattern in &specific_files {
1274                if let Ok(matches) = SearchTools::glob_search(pattern) {
1275                    for path in matches {
1276                        if !results.contains(&path) && filter_gitignore(&path) {
1277                            results.push(path);
1278                            if results.len() >= max_files {
1279                                return Ok(results);
1280                            }
1281                        }
1282                    }
1283                }
1284            }
1285        }
1286
1287        // If specific terms were extracted, try grepping for them
1288        let search_terms = self.extract_search_terms(query);
1289        if !search_terms.is_empty() {
1290            // Limit to top few most specific terms
1291            let top_terms: Vec<String> = search_terms.into_iter().take(3).collect();
1292
1293            for term in top_terms {
1294                if let Ok(grep_matches) = SearchTools::grep_search(&term, None, Some(root_dir)) {
1295                    // Take only top matches
1296                    for (path, _, _) in grep_matches.into_iter().take(5) {
1297                        if !results.contains(&path) && filter_gitignore(&path) {
1298                            results.push(path);
1299                            if results.len() >= max_files {
1300                                return Ok(results);
1301                            }
1302                        }
1303                    }
1304                }
1305            }
1306        }
1307
1308        // If we still need more files, use patterns based on query content
1309        if results.len() < max_files {
1310            // Get a smaller set of more targeted patterns
1311            let patterns = self.determine_relevant_files(query);
1312            let targeted_patterns: Vec<&String> = patterns.iter().take(5).collect();
1313
1314            for pattern in targeted_patterns {
1315                if let Ok(matches) = SearchTools::glob_search(pattern) {
1316                    for path in matches.into_iter().take(5) {
1317                        if !results.contains(&path) && filter_gitignore(&path) {
1318                            results.push(path);
1319                            if results.len() >= max_files {
1320                                return Ok(results);
1321                            }
1322                        }
1323                    }
1324                }
1325            }
1326        }
1327
1328        // If still not enough, add a few key project files
1329        if results.len() < 5 {
1330            let key_project_files = vec![
1331                "**/lib.rs",
1332                "**/main.rs",
1333                "**/mod.rs",
1334                "**/Cargo.toml",
1335                "**/package.json",
1336                "**/README.md",
1337            ];
1338
1339            for pattern in key_project_files {
1340                if let Ok(matches) = SearchTools::glob_search(pattern) {
1341                    for path in matches {
1342                        if !results.contains(&path) && filter_gitignore(&path) {
1343                            results.push(path);
1344                            if results.len() >= max_files {
1345                                return Ok(results);
1346                            }
1347                        }
1348                    }
1349                }
1350            }
1351        }
1352
1353        // Sort results by modification time to prioritize recently changed files
1354        results.sort_by(|a, b| {
1355            let a_modified = std::fs::metadata(a).and_then(|m| m.modified()).ok();
1356            let b_modified = std::fs::metadata(b).and_then(|m| m.modified()).ok();
1357            b_modified.cmp(&a_modified)
1358        });
1359
1360        Ok(results)
1361    }
1362
1363    /// Extract search terms from a query for grep search
1364    ///
1365    /// # Arguments
1366    /// - `query` - User query string
1367    ///
1368    /// # Returns
1369    /// - `Vec<String>` - Extracted search terms
1370    pub fn extract_search_terms(&self, query: &str) -> Vec<String> {
1371        let mut terms = Vec::new();
1372
1373        // Split query into words and look for potential code identifiers
1374        let words: Vec<&str> = query
1375            .split_whitespace()
1376            .filter(|w| w.len() > 3) // Skip short words
1377            .collect();
1378
1379        for word in words {
1380            // Clean up the word to extract potential identifiers
1381            let clean_word = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '_');
1382
1383            // Look for identifiers that match coding convention patterns
1384            if clean_word.len() > 3
1385                && clean_word.chars().all(|c| c.is_alphanumeric() || c == '_')
1386                && !clean_word.chars().all(|c| c.is_numeric())
1387            {
1388                // Skip common English words and programming keywords
1389                let common_words = [
1390                    "this",
1391                    "that",
1392                    "from",
1393                    "what",
1394                    "when",
1395                    "where",
1396                    "which",
1397                    "find",
1398                    "function",
1399                    "class",
1400                    "struct",
1401                    "impl",
1402                    "type",
1403                    "interface",
1404                    "const",
1405                    "static",
1406                    "public",
1407                    "private",
1408                    "protected",
1409                    "export",
1410                    "import",
1411                ];
1412
1413                if !common_words.contains(&clean_word.to_lowercase().as_str()) {
1414                    terms.push(clean_word.to_string());
1415                }
1416            }
1417        }
1418
1419        terms
1420    }
1421
1422    /// Parse an entire codebase and generate ASTs for selected files
1423    ///
1424    /// # Arguments
1425    /// - `root_dir` - Root directory of the codebase
1426    /// - `query` - User query to determine relevant files
1427    ///
1428    /// # Returns
1429    /// - `Result<Vec<CodeAST>>` - List of ASTs for relevant files
1430    pub fn parse_codebase(&mut self, root_dir: &Path, query: &str) -> Result<Vec<CodeAST>> {
1431        // Get files relevant to the query
1432        let relevant_files = self.find_relevant_files(root_dir, query)?;
1433
1434        // Use parallel processing for better performance
1435        let asts: Vec<Result<CodeAST>> = relevant_files
1436            .par_iter()
1437            .map(|path| {
1438                let mut local_parser = CodeParser::new()?;
1439                local_parser.parse_file(path)
1440            })
1441            .collect();
1442
1443        // Filter out errors and collect successful ASTs
1444        let valid_asts: Vec<CodeAST> = asts
1445            .into_iter()
1446            .filter_map(|ast_result| {
1447                // Just silently ignore parse errors since we're doing best-effort parsing
1448                // and may not need all files
1449                ast_result.ok()
1450            })
1451            .collect();
1452
1453        Ok(valid_asts)
1454    }
1455
1456    /// Generate a structured AST optimized for LLM consumption
1457    ///
1458    /// # Arguments
1459    /// - `root_dir` - Root directory of the codebase or path to a single file
1460    /// - `query` - User query to determine relevant files
1461    ///
1462    /// # Returns
1463    /// - `Result<String>` - Structured AST as a string
1464    pub fn generate_llm_friendly_ast(&mut self, root_dir: &Path, query: &str) -> Result<String> {
1465        // Check if the path is a file or directory
1466        let mut asts = if root_dir.is_file() {
1467            // Just parse this single file
1468            let ast = self.parse_file(root_dir)?;
1469            vec![ast]
1470        } else {
1471            // Parse the relevant parts of the codebase
1472            self.parse_codebase(root_dir, query)?
1473        };
1474
1475        // If no AST data was generated, return a helpful message
1476        if asts.is_empty() {
1477            return Ok(String::from("No relevant code structures found for the query. Try to be more specific about what code you're looking for."));
1478        }
1479
1480        // Sort ASTs by relevance (assuming more recently modified files are more relevant)
1481        asts.sort_by(|a, b| {
1482            let a_path = Path::new(&a.path);
1483            let b_path = Path::new(&b.path);
1484
1485            let a_modified = std::fs::metadata(a_path).and_then(|m| m.modified()).ok();
1486            let b_modified = std::fs::metadata(b_path).and_then(|m| m.modified()).ok();
1487
1488            b_modified.cmp(&a_modified)
1489        });
1490
1491        // Limit to most relevant files (10 max)
1492        if asts.len() > 10 {
1493            asts.truncate(10);
1494        }
1495
1496        // Create a structured code map that shows the hierarchy of code
1497        let mut structured_output = String::new();
1498        structured_output.push_str(&format!(
1499            "# Code Structure Analysis for Query: \"{}\"
1500
1501",
1502            query
1503        ));
1504
1505        // Add a hierarchical breakdown of each file
1506        structured_output.push_str(&format!(
1507            "## Codebase Structure Overview
1508
1509{} relevant files found. Showing hierarchical breakdown:
1510
1511",
1512            asts.len()
1513        ));
1514
1515        // Create a structured code map
1516        for ast in &asts {
1517            // Add file header
1518            structured_output.push_str(&format!("### File: {}\n", ast.path));
1519            structured_output.push_str(&format!("Language: {}\n\n", ast.language));
1520
1521            // Sort children by line number for logical ordering
1522            let mut ordered_children = ast.children.clone();
1523            ordered_children.sort_by_key(|child| child.range.start_row);
1524
1525            // Track seen types to avoid duplication in the output
1526            let mut seen_types = HashSet::new();
1527
1528            // Add each code structure with line numbers
1529            for child in &ordered_children {
1530                let name = child.name.as_deref().unwrap_or("anonymous");
1531
1532                // Skip if we've already seen this exact type+name combination
1533                let type_name_key = format!("{}:{}", child.kind, name);
1534                if seen_types.contains(&type_name_key) {
1535                    continue;
1536                }
1537                seen_types.insert(type_name_key);
1538
1539                structured_output.push_str(&format!(
1540                    "- {} `{}` (line {})\n",
1541                    child.kind,
1542                    name,
1543                    child.range.start_row + 1
1544                ));
1545
1546                // Add a code snippet if available
1547                if let Some(content) = &child.content {
1548                    // Get just the first line or a limited preview
1549                    let preview = content.lines().next().unwrap_or("");
1550                    if !preview.is_empty() {
1551                        structured_output
1552                            .push_str(&format!("  ```{}\n  {}\n  ```\n", ast.language, preview));
1553                    }
1554                }
1555
1556                // Add nested children if any (for hierarchical display)
1557                if !child.children.is_empty() {
1558                    for nested_child in &child.children {
1559                        if let Some(nested_name) = &nested_child.name {
1560                            structured_output.push_str(&format!(
1561                                "  - {} `{}` (line {})\n",
1562                                nested_child.kind,
1563                                nested_name,
1564                                nested_child.range.start_row + 1
1565                            ));
1566                        }
1567                    }
1568                }
1569            }
1570
1571            structured_output.push('\n');
1572        }
1573
1574        // Add a table of all identified symbols across files
1575        structured_output.push_str("## Symbol Table\n\n");
1576        structured_output.push_str("| Type | Name | File | Line |\n");
1577        structured_output.push_str("|------|------|------|------|\n");
1578
1579        // Collect all symbols for the table
1580        let mut all_symbols = Vec::new();
1581        for ast in &asts {
1582            for child in &ast.children {
1583                if let Some(name) = &child.name {
1584                    // Skip symbols with generic or empty names
1585                    if name == "anonymous" || name.is_empty() {
1586                        continue;
1587                    }
1588
1589                    all_symbols.push((
1590                        child.kind.clone(),
1591                        name.clone(),
1592                        ast.path.clone(),
1593                        child.range.start_row + 1,
1594                    ));
1595                }
1596            }
1597        }
1598
1599        // Sort symbols by type and name
1600        all_symbols.sort_by(|a, b| {
1601            let type_cmp = a.0.cmp(&b.0);
1602            if type_cmp == std::cmp::Ordering::Equal {
1603                a.1.cmp(&b.1)
1604            } else {
1605                type_cmp
1606            }
1607        });
1608
1609        // Add symbols to table
1610        for (kind, name, file, line) in all_symbols {
1611            // Extract just the file name for brevity
1612            let file_name = Path::new(&file)
1613                .file_name()
1614                .and_then(|n| n.to_str())
1615                .unwrap_or("unknown");
1616
1617            structured_output.push_str(&format!(
1618                "| {} | `{}` | {} | {} |\n",
1619                kind, name, file_name, line
1620            ));
1621        }
1622
1623        // Add a section for relationships between symbols
1624        structured_output.push_str("\n## Symbol Relationships\n\n");
1625        structured_output.push_str("This section shows relationships between code elements:\n\n");
1626
1627        // Extract relationships from the AST (like inheritance, implementation, etc.)
1628        let mut relationships = Vec::new();
1629
1630        for ast in &asts {
1631            // For Rust, look for impl blocks
1632            if ast.language == "rust" {
1633                for child in &ast.children {
1634                    if child.kind == "impl" {
1635                        if let Some(name) = &child.name {
1636                            relationships.push(format!(
1637                                "- `{}` implements trait/functionality for type `{}`",
1638                                ast.path, name
1639                            ));
1640                        }
1641                    }
1642                }
1643            }
1644
1645            // For other languages, look for inheritance/implementation patterns
1646            // (This would be expanded based on language-specific patterns)
1647        }
1648
1649        if !relationships.is_empty() {
1650            for relationship in relationships {
1651                structured_output.push_str(&format!("{}\n", relationship));
1652            }
1653        } else {
1654            structured_output.push_str("No clear relationships detected between symbols.\n");
1655        }
1656
1657        // Add the full AST data in JSON format for programmatic use
1658        // This is limited to avoid overwhelming the LLM with too much data
1659        structured_output.push_str("\n## AST Summary\n\n");
1660        // Instead of full JSON, provide a summary of what's available
1661        structured_output.push_str(&format!(
1662            "Analyzed {} files containing {} total code structures.\n",
1663            asts.len(),
1664            asts.iter().map(|ast| ast.children.len()).sum::<usize>()
1665        ));
1666
1667        Ok(structured_output)
1668    }
1669
1670    /// Determine which files to parse based on user query
1671    ///
1672    /// # Arguments
1673    /// - `query` - User query string
1674    ///
1675    /// # Returns
1676    /// - `Vec<String>` - List of glob patterns for relevant files
1677    pub fn determine_relevant_files(&self, query: &str) -> Vec<String> {
1678        let mut patterns = Vec::new();
1679
1680        // Look for specific file mentions in the query
1681        let file_regex = regex::Regex::new(r#"['"](\S+\.\w+)['"]"#).unwrap();
1682        for cap in file_regex.captures_iter(query) {
1683            if let Some(file_match) = cap.get(1) {
1684                let file_pattern = format!("**/{}", file_match.as_str());
1685                patterns.push(file_pattern);
1686            }
1687        }
1688
1689        // Add language-specific patterns based on query keywords
1690        let query_lower = query.to_lowercase();
1691
1692        // Rust patterns
1693        if query_lower.contains("rust") || query_lower.contains(".rs") {
1694            patterns.push("**/*.rs".to_string());
1695            patterns.push("**/src/**/*.rs".to_string());
1696            patterns.push("**/lib.rs".to_string());
1697            patterns.push("**/main.rs".to_string());
1698        }
1699
1700        // JavaScript patterns
1701        if query_lower.contains("javascript")
1702            || query_lower.contains("js")
1703            || query_lower.contains("node")
1704            || query_lower.contains("react")
1705        {
1706            patterns.push("**/*.js".to_string());
1707            patterns.push("**/*.jsx".to_string());
1708            patterns.push("**/src/**/*.js".to_string());
1709            patterns.push("**/src/**/*.jsx".to_string());
1710        }
1711
1712        // TypeScript patterns
1713        if query_lower.contains("typescript")
1714            || query_lower.contains("ts")
1715            || query_lower.contains("angular")
1716            || query_lower.contains("next")
1717        {
1718            patterns.push("**/*.ts".to_string());
1719            patterns.push("**/*.tsx".to_string());
1720            patterns.push("**/src/**/*.ts".to_string());
1721            patterns.push("**/src/**/*.tsx".to_string());
1722        }
1723
1724        // Python patterns
1725        if query_lower.contains("python")
1726            || query_lower.contains("py")
1727            || query_lower.contains("django")
1728            || query_lower.contains("flask")
1729        {
1730            patterns.push("**/*.py".to_string());
1731            patterns.push("**/src/**/*.py".to_string());
1732        }
1733
1734        // Go patterns
1735        if query_lower.contains("go") || query_lower.contains("golang") {
1736            patterns.push("**/*.go".to_string());
1737            patterns.push("**/src/**/*.go".to_string());
1738        }
1739
1740        // C/C++ patterns
1741        if query_lower.contains("c++")
1742            || query_lower.contains("cpp")
1743            || query_lower.contains(" c ")
1744            || query_lower.contains(".c")
1745        {
1746            patterns.push("**/*.c".to_string());
1747            patterns.push("**/*.h".to_string());
1748            patterns.push("**/*.cpp".to_string());
1749            patterns.push("**/*.hpp".to_string());
1750            patterns.push("**/*.cc".to_string());
1751        }
1752
1753        // Java patterns
1754        if query_lower.contains("java") && !query_lower.contains("javascript") {
1755            patterns.push("**/*.java".to_string());
1756            patterns.push("**/src/**/*.java".to_string());
1757        }
1758
1759        // Add patterns for common code directories if no specific language mentioned
1760        if patterns.is_empty() || !patterns.iter().any(|p| p.starts_with("**/src/")) {
1761            patterns.push("**/src/**/*.rs".to_string());
1762            patterns.push("**/src/**/*.ts".to_string());
1763            patterns.push("**/src/**/*.js".to_string());
1764            patterns.push("**/src/**/*.py".to_string());
1765        }
1766
1767        // Always add the language of the codebase (assuming Rust for oli)
1768        if !patterns.iter().any(|p| p.ends_with(".rs")) {
1769            patterns.push("**/*.rs".to_string());
1770        }
1771
1772        patterns
1773    }
1774}