go_brrr/ast/
extractor.rs

1//! AST extraction using tree-sitter.
2//!
3//! Provides efficient code structure extraction from source files using
4//! tree-sitter queries. Supports multiple languages through the Language trait.
5//!
6//! # Query Caching
7//!
8//! Tree-sitter query compilation is expensive (1-5ms per query). Since queries
9//! are immutable after creation, we cache them globally keyed by
10//! `(language_name, query_kind)`. This provides O(1) amortized access for
11//! multi-file operations, improving performance by 20-30%.
12//!
13//! # Parser Caching
14//!
15//! Tree-sitter parser creation involves memory allocation and language setup,
16//! which adds overhead when processing many files. We use thread-local parser
17//! caching to reuse parsers across files of the same language, with RAII-based
18//! automatic return to the cache. This provides ~10-20% speedup for multi-file
19//! operations.
20
21use std::cell::RefCell;
22use std::collections::{BTreeSet, HashMap};
23use std::ops::Bound;
24use std::path::Path;
25use std::sync::Arc;
26
27use once_cell::sync::Lazy;
28use parking_lot::RwLock;
29use tree_sitter::{Language as TSLanguage, Node, Parser, Query, QueryCursor, QueryMatch, Tree};
30
31use crate::ast::types::{ClassInfo, FunctionInfo, ImportInfo, ModuleInfo};
32use crate::error::{Result, BrrrError};
33use crate::lang::{Language, LanguageRegistry};
34use crate::util::format_query_error;
35
36/// Cache key for compiled tree-sitter queries.
37/// Uses `(language_name, query_kind)` to uniquely identify queries.
38type QueryCacheKey = (&'static str, &'static str);
39
40/// Thread-safe cache for compiled tree-sitter queries.
41///
42/// Stores `Arc<Query>` to allow shared access without cloning the Query itself.
43/// Uses `parking_lot::RwLock` for better performance than `std::sync::RwLock`.
44static QUERY_CACHE: Lazy<RwLock<HashMap<QueryCacheKey, Arc<Query>>>> =
45    Lazy::new(|| RwLock::new(HashMap::new()));
46
47/// Get or compile a tree-sitter query, using cache for repeated lookups.
48///
49/// This function provides O(1) amortized access to compiled queries by:
50/// 1. First checking the read-locked cache (fast path for cached queries)
51/// 2. On cache miss, compiling the query and storing it under write lock
52///
53/// # Arguments
54/// * `ts_lang` - Tree-sitter language for query compilation
55/// * `lang_name` - Static string identifier for the language
56/// * `query_kind` - Static string identifier for query type ("function", "class")
57/// * `query_str` - The tree-sitter query pattern to compile
58///
59/// # Returns
60/// * `Result<Arc<Query>>` - Shared reference to compiled query or compilation error
61///
62/// # Performance
63/// - First call for a (lang, kind) pair: O(query_compile_time) ~1-5ms
64/// - Subsequent calls: O(1) hash lookup ~10ns
65fn get_cached_query(
66    ts_lang: &TSLanguage,
67    lang_name: &'static str,
68    query_kind: &'static str,
69    query_str: &str,
70) -> Result<Arc<Query>> {
71    let key = (lang_name, query_kind);
72
73    // Fast path: check cache with read lock
74    {
75        let cache = QUERY_CACHE.read();
76        if let Some(query) = cache.get(&key) {
77            return Ok(Arc::clone(query));
78        }
79    }
80
81    // Slow path: compile query and cache it
82    let query = Query::new(ts_lang, query_str).map_err(|e| {
83        BrrrError::TreeSitter(format_query_error(lang_name, query_kind, query_str, &e))
84    })?;
85
86    let query_arc = Arc::new(query);
87
88    let mut cache = QUERY_CACHE.write();
89    // Double-check: another thread might have inserted while we were compiling
90    // Use entry API to avoid overwriting if already present
91    cache.entry(key).or_insert_with(|| Arc::clone(&query_arc));
92
93    Ok(query_arc)
94}
95
96/// Clear the query cache.
97///
98/// Useful for testing or when language configurations are updated.
99/// In normal operation, the cache persists for the lifetime of the process.
100#[allow(dead_code)]
101pub fn clear_query_cache() {
102    let mut cache = QUERY_CACHE.write();
103    cache.clear();
104}
105
106/// Get current query cache statistics.
107///
108/// Returns the number of cached queries. Useful for debugging and monitoring.
109#[allow(dead_code)]
110pub fn query_cache_stats() -> usize {
111    QUERY_CACHE.read().len()
112}
113
114// =============================================================================
115// Parser Caching
116// =============================================================================
117//
118// Tree-sitter parsers are expensive to create (~100-500us) due to memory
119// allocation and language grammar setup. Since parsers maintain mutable state
120// during parsing, they cannot be shared across threads. We use thread-local
121// storage to cache one parser per language per thread.
122//
123// Key design decisions:
124// - Thread-local: Parsers are not thread-safe, avoiding synchronization overhead
125// - RAII wrapper: Automatic return to cache via Drop, ensuring cleanup on errors
126// - Size limit: Prevents unbounded memory growth (max 16 parsers per thread)
127// - Reset on reuse: Clears parser state for fresh parsing
128// =============================================================================
129
130thread_local! {
131    /// Thread-local parser cache for reusing parsers across file operations.
132    ///
133    /// Stores one parser per language name. When a parser is needed, it's removed
134    /// from the cache (if present), reset, and wrapped in CachedParser. On drop,
135    /// the parser is returned to the cache for reuse.
136    static PARSER_CACHE: RefCell<HashMap<&'static str, Parser>> = RefCell::new(HashMap::new());
137}
138
139/// Maximum number of parsers to cache per thread.
140///
141/// Limits memory usage while covering common multi-language projects.
142/// 16 parsers covers all supported languages (Python, TypeScript, TSX, Go,
143/// Rust, Java, C, C++) with room for future additions.
144const MAX_CACHED_PARSERS: usize = 16;
145
146/// RAII wrapper for cached parser access.
147///
148/// Automatically returns the parser to the thread-local cache on drop,
149/// ensuring proper cleanup even when errors occur. This pattern is essential
150/// because tree-sitter's Parser doesn't implement Clone.
151///
152/// # Example
153///
154/// ```ignore
155/// let mut cached = CachedParser::take(lang)?;
156/// let tree = cached.get_mut().parse(&source, None);
157/// // Parser automatically returned to cache when `cached` is dropped
158/// ```
159struct CachedParser {
160    /// The parser being used. Option allows taking ownership on drop.
161    parser: Option<Parser>,
162    /// Language name for cache key on return.
163    lang_name: &'static str,
164}
165
166impl CachedParser {
167    /// Get a parser for the given language, reusing cached parser if available.
168    ///
169    /// If a cached parser exists for this language, it is removed from the cache,
170    /// reset to clear previous state, and returned. Otherwise, a new parser is
171    /// created via the Language trait.
172    ///
173    /// # Performance
174    ///
175    /// - Cache hit: O(1) hash lookup + reset ~1-5us
176    /// - Cache miss: O(parser_creation_time) ~100-500us
177    ///
178    /// # Errors
179    ///
180    /// Returns an error if parser creation fails (e.g., invalid language grammar).
181    fn take(lang: &dyn Language) -> Result<Self> {
182        let lang_name = lang.name();
183
184        // Try to get cached parser for this language
185        let cached = PARSER_CACHE.with(|cache| cache.borrow_mut().remove(lang_name));
186
187        let parser = match cached {
188            Some(mut p) => {
189                // Reset parser to clear previous parsing state (tree references, etc.)
190                p.reset();
191                p
192            }
193            None => {
194                // Create new parser - this is the expensive path
195                lang.parser()?
196            }
197        };
198
199        Ok(Self {
200            parser: Some(parser),
201            lang_name,
202        })
203    }
204
205    /// Get mutable reference to the underlying parser.
206    ///
207    /// # Panics
208    ///
209    /// Panics if called after the parser has been consumed (should never happen
210    /// in normal use since we only consume on drop).
211    fn get_mut(&mut self) -> &mut Parser {
212        self.parser.as_mut().expect("Parser already consumed")
213    }
214}
215
216impl Drop for CachedParser {
217    fn drop(&mut self) {
218        if let Some(parser) = self.parser.take() {
219            PARSER_CACHE.with(|cache| {
220                let mut cache = cache.borrow_mut();
221                // Only cache if under limit to prevent memory bloat
222                if cache.len() < MAX_CACHED_PARSERS {
223                    cache.insert(self.lang_name, parser);
224                }
225                // If over limit, parser is simply dropped (freed)
226            });
227        }
228    }
229}
230
231/// Clear the parser cache for the current thread.
232///
233/// Useful for testing or when language configurations change.
234/// In production, the cache persists for the thread's lifetime.
235#[allow(dead_code)]
236pub fn clear_parser_cache() {
237    PARSER_CACHE.with(|cache| {
238        cache.borrow_mut().clear();
239    });
240}
241
242/// Get current parser cache statistics for the current thread.
243///
244/// Returns the number of cached parsers. Useful for debugging and monitoring.
245#[allow(dead_code)]
246pub fn parser_cache_stats() -> usize {
247    PARSER_CACHE.with(|cache| cache.borrow().len())
248}
249
250/// Efficient position set for O(log n) proximity-based deduplication.
251///
252/// Uses a BTreeSet to detect if a position is within a small tolerance
253/// (default 2 bytes) of any previously seen position. This replaces
254/// linear O(n) scanning with O(log n) range queries.
255///
256/// # Performance
257/// - Insert: O(log n)
258/// - Proximity check: O(log n) using range queries
259/// - Overall for n items: O(n log n) instead of O(n^2)
260///
261/// For a file with 500 functions, this reduces comparisons from
262/// 125,000 (n^2/2) to approximately 4,500 (n * log n).
263struct PositionSet {
264    /// Sorted set of seen start positions
265    positions: BTreeSet<usize>,
266    /// Tolerance for "close enough" position matching (in bytes)
267    tolerance: usize,
268}
269
270impl PositionSet {
271    /// Create a new PositionSet with the specified tolerance.
272    ///
273    /// # Arguments
274    /// * `tolerance` - Maximum byte distance to consider positions as duplicates
275    fn with_tolerance(tolerance: usize) -> Self {
276        Self {
277            positions: BTreeSet::new(),
278            tolerance,
279        }
280    }
281
282    /// Check if a position is a duplicate (within tolerance of any existing position).
283    ///
284    /// Uses BTreeSet range queries for O(log n) performance:
285    /// - Finds positions in range [pos - tolerance, pos + tolerance]
286    /// - Returns true if any position in that range exists
287    ///
288    /// # Arguments
289    /// * `pos` - The position to check
290    ///
291    /// # Returns
292    /// * `true` if this position is within `tolerance` bytes of an existing position
293    fn is_duplicate(&self, pos: usize) -> bool {
294        // Calculate search range, handling underflow for positions near 0
295        let lower = pos.saturating_sub(self.tolerance);
296        let upper = pos.saturating_add(self.tolerance);
297
298        // Use range query to find any position in [lower, upper]
299        // This is O(log n) to find the starting point, then O(k) where k is matches
300        // In practice k is 0 or 1, making this effectively O(log n)
301        self.positions
302            .range((Bound::Included(lower), Bound::Included(upper)))
303            .next()
304            .is_some()
305    }
306
307    /// Insert a position into the set.
308    ///
309    /// # Arguments
310    /// * `pos` - The position to insert
311    fn insert(&mut self, pos: usize) {
312        self.positions.insert(pos);
313    }
314}
315
316/// Known function node kinds across all supported languages.
317///
318/// When the @function capture is missing from a query pattern, we use these
319/// node kinds to identify the correct function definition node from captures.
320/// This handles cases where queries use alternative capture names like @method
321/// or @constructor (Java), or patterns without explicit function captures.
322const FUNCTION_NODE_KINDS: &[&str] = &[
323    // Python
324    "function_definition",
325    "decorated_definition",
326    // TypeScript/JavaScript
327    "function_declaration",
328    "method_definition",
329    "arrow_function",
330    "function_expression",
331    "generator_function_declaration",
332    "function_signature",
333    "ambient_declaration",
334    // Go
335    "function_declaration",
336    "method_declaration",
337    // Rust
338    "function_item",
339    "function_signature_item",
340    "macro_definition",
341    // C/C++
342    "function_definition",
343    "declaration",
344    "template_declaration",
345    "preproc_def",
346    "preproc_function_def",
347    "type_definition",
348    // Java
349    "method_declaration",
350    "constructor_declaration",
351];
352
353/// Known class/type node kinds across all supported languages.
354///
355/// When the @class capture is missing from a query pattern, we use these
356/// node kinds to identify the correct class/struct/interface node from captures.
357/// This handles cases where queries use alternative capture names like @struct,
358/// @interface, @enum (Go, C, C++), or other type-related patterns.
359const CLASS_NODE_KINDS: &[&str] = &[
360    // Python
361    "class_definition",
362    "decorated_definition",
363    // TypeScript/JavaScript
364    "class_declaration",
365    "abstract_class_declaration",
366    "class", // class expression
367    "interface_declaration",
368    "enum_declaration",
369    "type_alias_declaration",
370    "module",
371    // Go
372    "type_declaration",
373    // Rust
374    "struct_item",
375    "union_item",
376    "impl_item",
377    "trait_item",
378    "enum_item",
379    "const_item",
380    "static_item",
381    "type_item",
382    "mod_item",
383    "foreign_mod_item",
384    "extern_crate_declaration",
385    // C/C++
386    "struct_specifier",
387    "enum_specifier",
388    "union_specifier",
389    "class_specifier",
390    "namespace_definition",
391    "type_definition",
392    "preproc_ifdef",
393    "preproc_if",
394    // Java
395    "class_declaration",
396    "interface_declaration",
397    "enum_declaration",
398    "record_declaration",
399    "annotation_type_declaration",
400];
401
402/// Extract the function node from a query match using robust fallback logic.
403///
404/// This function implements a three-tier selection strategy:
405/// 1. Try to find a capture with the exact name "function"
406/// 2. Fall back to finding a capture whose node kind matches known function types
407/// 3. Last resort: return the first capture's node
408///
409/// This handles cases where:
410/// - Different languages use different capture names (@method, @constructor)
411/// - Query patterns don't have explicit @function captures
412/// - Multiple captures exist and the first one is a child node (like @name)
413fn get_function_node_from_match<'tree>(
414    match_: &QueryMatch<'_, 'tree>,
415    query: &Query,
416) -> Option<Node<'tree>> {
417    // Strategy 1: Try to find @function capture by name
418    if let Some(idx) = query.capture_index_for_name("function") {
419        if let Some(capture) = match_.captures.iter().find(|c| c.index == idx) {
420            return Some(capture.node);
421        }
422    }
423
424    // Strategy 2: Look for captures with known function node kinds
425    // This handles Java (@method, @constructor), etc.
426    for capture in match_.captures.iter() {
427        if FUNCTION_NODE_KINDS.contains(&capture.node.kind()) {
428            return Some(capture.node);
429        }
430    }
431
432    // Strategy 3: Last resort - first capture
433    // This may still be wrong if the first capture is @name, but it's the best we can do
434    match_.captures.first().map(|c| c.node)
435}
436
437/// Extract the class node from a query match using robust fallback logic.
438///
439/// This function implements a three-tier selection strategy:
440/// 1. Try to find a capture with the exact name "class"
441/// 2. Fall back to finding a capture whose node kind matches known class/type kinds
442/// 3. Last resort: return the first capture's node
443///
444/// This handles cases where:
445/// - Different languages use different capture names (@struct, @interface, @enum)
446/// - Query patterns don't have explicit @class captures
447/// - Multiple captures exist and the first one is a child node (like @name)
448fn get_class_node_from_match<'tree>(
449    match_: &QueryMatch<'_, 'tree>,
450    query: &Query,
451) -> Option<Node<'tree>> {
452    // Strategy 1: Try to find @class capture by name
453    if let Some(idx) = query.capture_index_for_name("class") {
454        if let Some(capture) = match_.captures.iter().find(|c| c.index == idx) {
455            return Some(capture.node);
456        }
457    }
458
459    // Strategy 2: Look for captures with known class node kinds
460    // This handles Go (@struct, @interface), C/C++ (@struct, @enum, @namespace), etc.
461    for capture in match_.captures.iter() {
462        if CLASS_NODE_KINDS.contains(&capture.node.kind()) {
463            return Some(capture.node);
464        }
465    }
466
467    // Strategy 3: Last resort - first capture
468    match_.captures.first().map(|c| c.node)
469}
470
471/// Extracts AST information from source files.
472///
473/// Uses tree-sitter queries for efficient pattern matching and delegates
474/// to language-specific implementations for detailed extraction.
475pub struct AstExtractor;
476
477impl AstExtractor {
478    /// Extract full module info from a file.
479    ///
480    /// This is the main entry point for AST extraction. It:
481    /// 1. Detects the language from file extension
482    /// 2. Parses the file with tree-sitter
483    /// 3. Extracts functions, classes, and imports
484    /// 4. Returns a complete ModuleInfo structure
485    ///
486    /// # Arguments
487    /// * `path` - Path to the source file
488    ///
489    /// # Returns
490    /// * `Result<ModuleInfo>` - Extracted module information or error
491    ///
492    /// # Errors
493    /// * `BrrrError::UnsupportedLanguage` - File extension not recognized
494    /// * `BrrrError::Io` - File could not be read
495    /// * `BrrrError::Parse` - File could not be parsed
496    pub fn extract_file(path: &Path) -> Result<ModuleInfo> {
497        let registry = LanguageRegistry::global();
498        let lang = registry.detect_language(path).ok_or_else(|| {
499            BrrrError::UnsupportedLanguage(
500                path.extension()
501                    .and_then(|e| e.to_str())
502                    .unwrap_or("unknown")
503                    .to_string(),
504            )
505        })?;
506
507        let source = std::fs::read(path)
508            .map_err(|e| BrrrError::io_with_path(e, path))?;
509
510        // Check if the language handler wants to skip this file based on content.
511        // This is used by the C handler to skip `.h` files that contain C++ code,
512        // preventing parse errors when processing mixed C/C++ projects.
513        if lang.should_skip_file(path, &source) {
514            return Err(BrrrError::UnsupportedLanguage(format!(
515                "File content incompatible with {} parser: {}",
516                lang.name(),
517                path.display()
518            )));
519        }
520
521        // Use cached parser for performance - avoids recreating parser for each file.
522        // The CachedParser RAII wrapper ensures the parser is returned to the
523        // thread-local cache on drop, even if an error occurs during parsing.
524        let mut cached_parser = CachedParser::take(lang)?;
525        let tree = cached_parser
526            .get_mut()
527            .parse(&source, None)
528            .ok_or_else(|| BrrrError::Parse {
529                file: path.display().to_string(),
530                message: "Failed to parse file".to_string(),
531            })?;
532
533        Self::extract_module(&tree, &source, lang, path)
534    }
535
536    /// Extract module information from a parsed tree.
537    ///
538    /// Combines function, class, and import extraction into a complete
539    /// ModuleInfo structure.
540    fn extract_module(
541        tree: &Tree,
542        source: &[u8],
543        lang: &dyn Language,
544        path: &Path,
545    ) -> Result<ModuleInfo> {
546        let functions = Self::extract_functions(tree, source, lang)?;
547        let classes = Self::extract_classes(tree, source, lang)?;
548        let imports = lang.extract_imports(tree, source);
549        let docstring = lang.extract_module_docstring(tree, source);
550
551        Ok(ModuleInfo {
552            path: path.display().to_string(),
553            language: lang.name().to_string(),
554            docstring,
555            functions,
556            classes,
557            imports,
558            call_graph: None, // Call graph extraction is done separately via callgraph module
559        })
560    }
561
562    /// Extract all functions from a parsed tree using tree-sitter queries.
563    ///
564    /// Uses the language's function query pattern to find all function
565    /// definitions, then delegates to the language's extract_function
566    /// method for detailed extraction.
567    ///
568    /// # Arguments
569    /// * `tree` - Parsed tree-sitter tree
570    /// * `source` - Source code bytes
571    /// * `lang` - Language implementation
572    ///
573    /// # Returns
574    /// * `Result<Vec<FunctionInfo>>` - Extracted functions or error
575    fn extract_functions(
576        tree: &Tree,
577        source: &[u8],
578        lang: &dyn Language,
579    ) -> Result<Vec<FunctionInfo>> {
580        let query_str = lang.function_query();
581        let ts_lang = tree.language();
582
583        // Use cached query instead of compiling every time
584        // This provides ~20-30% speedup for multi-file operations
585        let query = get_cached_query(&ts_lang, lang.name(), "function", query_str)?;
586
587        let mut cursor = QueryCursor::new();
588        let mut matches = cursor.matches(&query, tree.root_node(), source);
589
590        let mut functions = Vec::new();
591        // Use PositionSet for O(log n) deduplication instead of O(n) linear search
592        // Tolerance of 2 bytes handles decorated_definition vs function_definition offset
593        let mut seen_positions = PositionSet::with_tolerance(2);
594
595        // Use streaming iterator pattern (tree-sitter 0.24+)
596        use streaming_iterator::StreamingIterator;
597        while let Some(match_) = matches.next() {
598            // Get the function node using robust fallback logic
599            // This handles cases where @function capture is missing (e.g., Java uses @method/@constructor)
600            // or where multiple captures exist and the first one is a child node (like @name)
601            let node = get_function_node_from_match(match_, &query);
602
603            if let Some(node) = node {
604                // Skip if we've already processed a function at this exact position
605                // (handles decorated functions where both the decorator and inner function match)
606                //
607                // We use start position only, not full range overlap, because:
608                // - Decorated functions match twice (decorated_definition and function_definition)
609                //   but they have the same or very close start positions
610                // - Nested functions have different start positions and should NOT be skipped
611                //
612                // Example: @decorator\ndef foo(): pass
613                // - decorated_definition starts at byte 0 (the @)
614                // - function_definition starts at byte ~11 (the def)
615                // These should deduplicate (we keep decorated_definition because it matches first)
616                //
617                // Example: def outer():\n    def inner(): pass
618                // - outer starts at byte 0
619                // - inner starts at byte ~17
620                // These should NOT deduplicate (different functions)
621                let start = node.start_byte();
622
623                // O(log n) duplicate check using BTreeSet range query
624                if seen_positions.is_duplicate(start) {
625                    continue;
626                }
627                seen_positions.insert(start);
628
629                if let Some(func_info) = lang.extract_function(node, source) {
630                    functions.push(func_info);
631                }
632            }
633        }
634
635        // Sort by line number for consistent output
636        functions.sort_by_key(|f| f.line_number);
637        Ok(functions)
638    }
639
640    /// Extract all classes from a parsed tree using tree-sitter queries.
641    ///
642    /// Uses the language's class query pattern to find all class
643    /// definitions, then delegates to the language's extract_class
644    /// method for detailed extraction including methods.
645    ///
646    /// # Arguments
647    /// * `tree` - Parsed tree-sitter tree
648    /// * `source` - Source code bytes
649    /// * `lang` - Language implementation
650    ///
651    /// # Returns
652    /// * `Result<Vec<ClassInfo>>` - Extracted classes or error
653    fn extract_classes(tree: &Tree, source: &[u8], lang: &dyn Language) -> Result<Vec<ClassInfo>> {
654        let query_str = lang.class_query();
655        let ts_lang = tree.language();
656
657        // Use cached query instead of compiling every time
658        // This provides ~20-30% speedup for multi-file operations
659        let query = get_cached_query(&ts_lang, lang.name(), "class", query_str)?;
660
661        let mut cursor = QueryCursor::new();
662        let mut matches = cursor.matches(&query, tree.root_node(), source);
663
664        let mut classes = Vec::new();
665        // Use PositionSet for O(log n) deduplication instead of O(n) linear search
666        // Tolerance of 2 bytes handles decorated_definition vs class_definition offset
667        let mut seen_positions = PositionSet::with_tolerance(2);
668
669        // Use streaming iterator pattern (tree-sitter 0.24+)
670        use streaming_iterator::StreamingIterator;
671        while let Some(match_) = matches.next() {
672            // Get the class node using robust fallback logic
673            // This handles cases where @class capture is missing (e.g., Go uses @struct/@interface,
674            // C/C++ uses @struct/@enum/@namespace) or where multiple captures exist
675            // and the first one is a child node (like @name)
676            let node = get_class_node_from_match(match_, &query);
677
678            if let Some(node) = node {
679                // Skip if we've already processed a class at this exact position
680                // (handles decorated classes where both the decorator and inner class match)
681                //
682                // We use start position only, not full range overlap, because:
683                // - Decorated classes match twice (decorated_definition and class_definition)
684                //   but they have close start positions
685                // - Nested classes have different start positions and should NOT be skipped
686                let start = node.start_byte();
687
688                // O(log n) duplicate check using BTreeSet range query
689                if seen_positions.is_duplicate(start) {
690                    continue;
691                }
692                seen_positions.insert(start);
693
694                if let Some(class_info) = lang.extract_class(node, source) {
695                    // The tree-sitter query matches ALL class definitions including nested ones,
696                    // so each class is already returned as a separate entry. We don't need to
697                    // flatten inner_classes - they're already matched by the query.
698                    // The inner_classes field still contains the hierarchical structure for
699                    // consumers who want to traverse the nesting relationship.
700                    classes.push(class_info);
701                }
702            }
703        }
704
705        // Sort by line number for consistent output
706        classes.sort_by_key(|c| c.line_number);
707        Ok(classes)
708    }
709
710    /// Extract functions and classes from source code string.
711    ///
712    /// Convenience method for extracting from in-memory source code
713    /// when the language is already known.
714    ///
715    /// # Arguments
716    /// * `source` - Source code as string
717    /// * `language` - Language name (e.g., "python", "typescript")
718    ///
719    /// # Returns
720    /// * `Result<ModuleInfo>` - Extracted module information
721    #[allow(dead_code)]
722    pub fn extract_from_source(source: &str, language: &str) -> Result<ModuleInfo> {
723        let registry = LanguageRegistry::global();
724        let lang = registry
725            .get_by_name(language)
726            .ok_or_else(|| BrrrError::UnsupportedLanguage(language.to_string()))?;
727
728        let source_bytes = source.as_bytes();
729
730        // Use cached parser for performance - reuses parser across multiple
731        // extract_from_source calls with the same language.
732        let mut cached_parser = CachedParser::take(lang)?;
733        let tree = cached_parser
734            .get_mut()
735            .parse(source_bytes, None)
736            .ok_or_else(|| BrrrError::Parse {
737                file: "<string>".to_string(),
738                message: "Failed to parse source".to_string(),
739            })?;
740
741        let functions = Self::extract_functions(&tree, source_bytes, lang)?;
742        let classes = Self::extract_classes(&tree, source_bytes, lang)?;
743        let imports = lang.extract_imports(&tree, source_bytes);
744        let docstring = lang.extract_module_docstring(&tree, source_bytes);
745
746        Ok(ModuleInfo {
747            path: "<string>".to_string(),
748            language: lang.name().to_string(),
749            docstring,
750            functions,
751            classes,
752            imports,
753            call_graph: None, // Call graph extraction is done separately via callgraph module
754        })
755    }
756
757    /// Find a specific function by name in a file.
758    ///
759    /// # Arguments
760    /// * `path` - Path to source file
761    /// * `function_name` - Name of function to find
762    ///
763    /// # Returns
764    /// * `Result<FunctionInfo>` - Function info or FunctionNotFound error
765    #[allow(dead_code)]
766    pub fn find_function(path: &Path, function_name: &str) -> Result<FunctionInfo> {
767        let module_info = Self::extract_file(path)?;
768
769        // Search in top-level functions
770        if let Some(func) = module_info
771            .functions
772            .iter()
773            .find(|f| f.name == function_name)
774        {
775            return Ok(func.clone());
776        }
777
778        // Search in class methods
779        for class in &module_info.classes {
780            if let Some(method) = class.methods.iter().find(|m| m.name == function_name) {
781                return Ok(method.clone());
782            }
783        }
784
785        Err(BrrrError::FunctionNotFound(function_name.to_string()))
786    }
787
788    /// Find a specific class by name in a file.
789    ///
790    /// # Arguments
791    /// * `path` - Path to source file
792    /// * `class_name` - Name of class to find
793    ///
794    /// # Returns
795    /// * `Result<ClassInfo>` - Class info or error
796    #[allow(dead_code)]
797    pub fn find_class(path: &Path, class_name: &str) -> Result<ClassInfo> {
798        let module_info = Self::extract_file(path)?;
799
800        module_info
801            .classes
802            .into_iter()
803            .find(|c| c.name == class_name)
804            .ok_or_else(|| BrrrError::ClassNotFound(class_name.to_string()))
805    }
806}
807
808/// Extract imports from a source file.
809///
810/// Convenience function for extracting only import statements.
811pub fn extract_imports(path: &Path) -> Result<Vec<ImportInfo>> {
812    let registry = LanguageRegistry::global();
813    let lang = registry.detect_language(path).ok_or_else(|| {
814        BrrrError::UnsupportedLanguage(
815            path.extension()
816                .and_then(|e| e.to_str())
817                .unwrap_or("unknown")
818                .to_string(),
819        )
820    })?;
821
822    let source = std::fs::read(path)
823        .map_err(|e| BrrrError::io_with_path(e, path))?;
824
825    // Use cached parser for performance - avoids recreating parser for each file.
826    let mut cached_parser = CachedParser::take(lang)?;
827    let tree = cached_parser
828        .get_mut()
829        .parse(&source, None)
830        .ok_or_else(|| BrrrError::Parse {
831            file: path.display().to_string(),
832            message: "Failed to parse file".to_string(),
833        })?;
834
835    Ok(lang.extract_imports(&tree, &source))
836}
837
838#[cfg(test)]
839mod tests {
840    use super::*;
841    use std::io::Write;
842    use tempfile::NamedTempFile;
843
844    fn create_temp_file(content: &str, extension: &str) -> NamedTempFile {
845        let mut file = tempfile::Builder::new()
846            .suffix(extension)
847            .tempfile()
848            .unwrap();
849        file.write_all(content.as_bytes()).unwrap();
850        file
851    }
852
853    #[test]
854    fn test_extract_python_functions() {
855        let source = r#"
856def hello(name: str) -> str:
857    """Say hello to someone."""
858    return f"Hello, {name}!"
859
860async def fetch_data(url: str) -> bytes:
861    """Fetch data from URL."""
862    pass
863
864class MyClass:
865    def method(self, x: int) -> int:
866        return x * 2
867"#;
868        let file = create_temp_file(source, ".py");
869        let result = AstExtractor::extract_file(file.path()).unwrap();
870
871        assert_eq!(result.language, "python");
872        assert!(result.functions.len() >= 2);
873
874        // Find hello function
875        let hello = result.functions.iter().find(|f| f.name == "hello");
876        assert!(hello.is_some());
877        let hello = hello.unwrap();
878        assert_eq!(hello.return_type, Some("str".to_string()));
879        assert!(hello
880            .docstring
881            .as_ref()
882            .map_or(false, |d| d.contains("Say hello")));
883        assert!(!hello.is_async);
884
885        // Find async function
886        let fetch = result.functions.iter().find(|f| f.name == "fetch_data");
887        assert!(fetch.is_some());
888        assert!(fetch.unwrap().is_async);
889
890        // Check class was extracted
891        assert_eq!(result.classes.len(), 1);
892        assert_eq!(result.classes[0].name, "MyClass");
893        assert!(!result.classes[0].methods.is_empty());
894    }
895
896    #[test]
897    fn test_extract_python_classes() {
898        let source = r#"
899class Animal:
900    """Base class for animals."""
901
902    def __init__(self, name: str):
903        self.name = name
904
905    def speak(self) -> str:
906        pass
907
908class Dog(Animal):
909    """A dog."""
910
911    def speak(self) -> str:
912        return "Woof!"
913"#;
914        let file = create_temp_file(source, ".py");
915        let result = AstExtractor::extract_file(file.path()).unwrap();
916
917        assert_eq!(result.classes.len(), 2);
918
919        let animal = result.classes.iter().find(|c| c.name == "Animal").unwrap();
920        assert!(animal
921            .docstring
922            .as_ref()
923            .map_or(false, |d| d.contains("Base class")));
924        assert!(animal.methods.len() >= 2);
925
926        let dog = result.classes.iter().find(|c| c.name == "Dog").unwrap();
927        assert!(dog.bases.contains(&"Animal".to_string()));
928    }
929
930    #[test]
931    fn test_extract_python_imports() {
932        let source = r#"
933import os
934import sys as system
935from pathlib import Path
936from collections import defaultdict as dd
937from . import local
938"#;
939        let file = create_temp_file(source, ".py");
940        let imports = extract_imports(file.path()).unwrap();
941
942        assert!(imports.len() >= 4);
943
944        // Check regular import
945        let os_import = imports.iter().find(|i| i.module == "os");
946        assert!(os_import.is_some());
947        assert!(!os_import.unwrap().is_from);
948
949        // Check aliased import
950        let sys_import = imports.iter().find(|i| i.module == "sys");
951        assert!(sys_import.is_some());
952        assert!(sys_import.unwrap().aliases.contains_key("sys"));
953
954        // Check from import
955        let pathlib_import = imports.iter().find(|i| i.module == "pathlib");
956        assert!(pathlib_import.is_some());
957        assert!(pathlib_import.unwrap().is_from);
958        assert!(pathlib_import.unwrap().names.contains(&"Path".to_string()));
959    }
960
961    #[test]
962    fn test_extract_typescript_functions() {
963        let source = r#"
964function greet(name: string): string {
965    return "Hello, " + name;
966}
967
968async function fetchData(url: string): Promise<Response> {
969    return fetch(url);
970}
971
972const add = (a: number, b: number): number => a + b;
973"#;
974        let file = create_temp_file(source, ".ts");
975        let result = AstExtractor::extract_file(file.path()).unwrap();
976
977        assert_eq!(result.language, "typescript");
978        assert!(result.functions.len() >= 2);
979
980        let greet = result.functions.iter().find(|f| f.name == "greet");
981        assert!(greet.is_some());
982        assert_eq!(greet.unwrap().return_type, Some("string".to_string()));
983
984        let fetch_data = result.functions.iter().find(|f| f.name == "fetchData");
985        assert!(fetch_data.is_some());
986        assert!(fetch_data.unwrap().is_async);
987    }
988
989    #[test]
990    fn test_extract_typescript_classes() {
991        let source = r#"
992class Animal {
993    constructor(public name: string) {}
994
995    speak(): void {
996        console.log(this.name);
997    }
998}
999
1000class Dog extends Animal {
1001    bark(): void {
1002        console.log("Woof!");
1003    }
1004}
1005"#;
1006        let file = create_temp_file(source, ".ts");
1007        let result = AstExtractor::extract_file(file.path()).unwrap();
1008
1009        assert_eq!(result.classes.len(), 2);
1010
1011        let animal = result.classes.iter().find(|c| c.name == "Animal");
1012        assert!(animal.is_some());
1013
1014        let dog = result.classes.iter().find(|c| c.name == "Dog");
1015        assert!(dog.is_some());
1016        assert!(dog.unwrap().bases.contains(&"Animal".to_string()));
1017    }
1018
1019    #[test]
1020    fn test_extract_from_source() {
1021        let source = r#"
1022def add(a: int, b: int) -> int:
1023    return a + b
1024"#;
1025        let result = AstExtractor::extract_from_source(source, "python").unwrap();
1026
1027        assert_eq!(result.language, "python");
1028        assert_eq!(result.functions.len(), 1);
1029        assert_eq!(result.functions[0].name, "add");
1030    }
1031
1032    #[test]
1033    fn test_find_function() {
1034        let source = r#"
1035def target_function(x: int) -> int:
1036    return x * 2
1037
1038def other_function():
1039    pass
1040"#;
1041        let file = create_temp_file(source, ".py");
1042
1043        let func = AstExtractor::find_function(file.path(), "target_function");
1044        assert!(func.is_ok());
1045        assert_eq!(func.unwrap().name, "target_function");
1046
1047        let not_found = AstExtractor::find_function(file.path(), "nonexistent");
1048        assert!(not_found.is_err());
1049    }
1050
1051    #[test]
1052    fn test_find_class() {
1053        let source = r#"
1054class TargetClass:
1055    pass
1056
1057class OtherClass:
1058    pass
1059"#;
1060        let file = create_temp_file(source, ".py");
1061
1062        let class = AstExtractor::find_class(file.path(), "TargetClass");
1063        assert!(class.is_ok());
1064        assert_eq!(class.unwrap().name, "TargetClass");
1065
1066        let not_found = AstExtractor::find_class(file.path(), "NonexistentClass");
1067        assert!(not_found.is_err());
1068        assert!(matches!(not_found, Err(BrrrError::ClassNotFound(_))));
1069    }
1070
1071    #[test]
1072    fn test_unsupported_language() {
1073        let file = create_temp_file("some content", ".xyz");
1074        let result = AstExtractor::extract_file(file.path());
1075
1076        assert!(matches!(result, Err(BrrrError::UnsupportedLanguage(_))));
1077    }
1078
1079    #[test]
1080    fn test_decorated_python_function() {
1081        let source = r#"
1082@staticmethod
1083@cache
1084def cached_function(x: int) -> int:
1085    return x * 2
1086"#;
1087        let file = create_temp_file(source, ".py");
1088        let result = AstExtractor::extract_file(file.path()).unwrap();
1089
1090        assert_eq!(result.functions.len(), 1);
1091        let func = &result.functions[0];
1092        assert_eq!(func.name, "cached_function");
1093        assert!(!func.decorators.is_empty());
1094    }
1095
1096    #[test]
1097    fn test_decorated_python_class() {
1098        let source = r#"
1099@dataclass
1100class Point:
1101    x: float
1102    y: float
1103"#;
1104        let file = create_temp_file(source, ".py");
1105        let result = AstExtractor::extract_file(file.path()).unwrap();
1106
1107        assert_eq!(result.classes.len(), 1);
1108        let class = &result.classes[0];
1109        assert_eq!(class.name, "Point");
1110        assert!(!class.decorators.is_empty());
1111    }
1112
1113    #[test]
1114    fn test_multiple_decorated_functions_no_duplicates() {
1115        // This test verifies that the overlap detection algorithm correctly
1116        // handles multiple decorated functions without producing duplicates.
1117        // The fix ensures partial overlaps are detected, not just containment.
1118        let source = r#"
1119@decorator1
1120def func1():
1121    pass
1122
1123@decorator2
1124@decorator3
1125def func2():
1126    pass
1127
1128@contextmanager
1129def func3():
1130    yield
1131
1132def plain_func():
1133    pass
1134"#;
1135        let file = create_temp_file(source, ".py");
1136        let result = AstExtractor::extract_file(file.path()).unwrap();
1137
1138        // Should extract exactly 4 functions, no duplicates
1139        assert_eq!(
1140            result.functions.len(),
1141            4,
1142            "Expected 4 functions, got {}: {:?}",
1143            result.functions.len(),
1144            result.functions.iter().map(|f| &f.name).collect::<Vec<_>>()
1145        );
1146
1147        // Verify each function is unique
1148        let names: Vec<&str> = result.functions.iter().map(|f| f.name.as_str()).collect();
1149        assert!(names.contains(&"func1"));
1150        assert!(names.contains(&"func2"));
1151        assert!(names.contains(&"func3"));
1152        assert!(names.contains(&"plain_func"));
1153    }
1154
1155    #[test]
1156    fn test_nested_decorated_classes_no_duplicates() {
1157        // Verify overlap detection works for classes with decorators
1158        let source = r#"
1159@dataclass
1160class Point:
1161    x: float
1162    y: float
1163
1164@singleton
1165@validate
1166class Config:
1167    value: str
1168
1169class PlainClass:
1170    pass
1171"#;
1172        let file = create_temp_file(source, ".py");
1173        let result = AstExtractor::extract_file(file.path()).unwrap();
1174
1175        // Should extract exactly 3 classes, no duplicates
1176        assert_eq!(
1177            result.classes.len(),
1178            3,
1179            "Expected 3 classes, got {}: {:?}",
1180            result.classes.len(),
1181            result.classes.iter().map(|c| &c.name).collect::<Vec<_>>()
1182        );
1183
1184        let names: Vec<&str> = result.classes.iter().map(|c| c.name.as_str()).collect();
1185        assert!(names.contains(&"Point"));
1186        assert!(names.contains(&"Config"));
1187        assert!(names.contains(&"PlainClass"));
1188    }
1189
1190    #[test]
1191    fn test_overlap_detection_algorithm() {
1192        // Unit test for the interval overlap logic itself
1193        // Two intervals [start, end) and [s, e) overlap iff start < e AND s < end
1194
1195        // Helper to check overlap
1196        fn overlaps(start: usize, end: usize, s: usize, e: usize) -> bool {
1197            start < e && s < end
1198        }
1199
1200        // Partial overlap: (10, 20) and (15, 25)
1201        assert!(overlaps(10, 20, 15, 25), "Partial overlap should be detected");
1202
1203        // Partial overlap reversed: (15, 25) and (10, 20)
1204        assert!(overlaps(15, 25, 10, 20), "Partial overlap should be detected (reversed)");
1205
1206        // Complete containment: outer contains inner
1207        assert!(overlaps(10, 30, 15, 20), "Containment should be detected");
1208
1209        // Complete containment: inner inside outer
1210        assert!(overlaps(15, 20, 10, 30), "Containment should be detected (reversed)");
1211
1212        // Adjacent but not overlapping: (10, 20) and (20, 30)
1213        assert!(!overlaps(10, 20, 20, 30), "Adjacent intervals should not overlap");
1214
1215        // No overlap: (10, 20) and (25, 30)
1216        assert!(!overlaps(10, 20, 25, 30), "Disjoint intervals should not overlap");
1217
1218        // No overlap reversed: (25, 30) and (10, 20)
1219        assert!(!overlaps(25, 30, 10, 20), "Disjoint intervals should not overlap (reversed)");
1220
1221        // Same interval
1222        assert!(overlaps(10, 20, 10, 20), "Same interval should overlap");
1223
1224        // Edge case: one point overlap at start
1225        assert!(overlaps(10, 20, 19, 25), "Should overlap when ranges share interior point");
1226    }
1227
1228    #[test]
1229    fn test_position_set_deduplication() {
1230        // Unit test for the PositionSet O(log n) deduplication structure
1231
1232        // Test with tolerance of 2 (same as used in extract_functions/extract_classes)
1233        let mut set = PositionSet::with_tolerance(2);
1234
1235        // Empty set should report no duplicates
1236        assert!(!set.is_duplicate(100), "Empty set should have no duplicates");
1237
1238        // Insert first position
1239        set.insert(100);
1240
1241        // Exact match should be duplicate
1242        assert!(set.is_duplicate(100), "Exact position should be duplicate");
1243
1244        // Positions within tolerance should be duplicates
1245        assert!(set.is_duplicate(99), "Position 99 should be duplicate (within tolerance of 100)");
1246        assert!(set.is_duplicate(101), "Position 101 should be duplicate (within tolerance of 100)");
1247        assert!(set.is_duplicate(98), "Position 98 should be duplicate (within tolerance of 100)");
1248        assert!(set.is_duplicate(102), "Position 102 should be duplicate (within tolerance of 100)");
1249
1250        // Positions outside tolerance should NOT be duplicates
1251        assert!(!set.is_duplicate(97), "Position 97 should NOT be duplicate (outside tolerance)");
1252        assert!(!set.is_duplicate(103), "Position 103 should NOT be duplicate (outside tolerance)");
1253        assert!(!set.is_duplicate(50), "Position 50 should NOT be duplicate");
1254        assert!(!set.is_duplicate(200), "Position 200 should NOT be duplicate");
1255
1256        // Insert another position far away
1257        set.insert(500);
1258        assert!(set.is_duplicate(500), "Position 500 should now be duplicate");
1259        assert!(set.is_duplicate(498), "Position 498 should be duplicate (within tolerance of 500)");
1260        assert!(!set.is_duplicate(495), "Position 495 should NOT be duplicate");
1261
1262        // Original positions should still work
1263        assert!(set.is_duplicate(100), "Position 100 should still be duplicate");
1264
1265        // Test edge case: position 0
1266        let mut set2 = PositionSet::with_tolerance(2);
1267        set2.insert(0);
1268        assert!(set2.is_duplicate(0), "Position 0 should be duplicate");
1269        assert!(set2.is_duplicate(1), "Position 1 should be duplicate (within tolerance of 0)");
1270        assert!(set2.is_duplicate(2), "Position 2 should be duplicate (within tolerance of 0)");
1271        assert!(!set2.is_duplicate(3), "Position 3 should NOT be duplicate");
1272
1273        // Test edge case: position 1
1274        set2.insert(1);
1275        // Both 0 and 1 are in the set, so positions 0-3 should all be duplicates
1276        assert!(set2.is_duplicate(0), "Position 0 should be duplicate");
1277        assert!(set2.is_duplicate(3), "Position 3 should be duplicate (within tolerance of 1)");
1278    }
1279
1280    #[test]
1281    fn test_position_set_performance_characteristics() {
1282        // Verify that PositionSet maintains O(log n) characteristics
1283        // by testing with a large number of positions
1284        let mut set = PositionSet::with_tolerance(2);
1285
1286        // Insert 1000 positions (simulating a large file with many functions)
1287        // Each position is 100 bytes apart (typical function spacing)
1288        for i in 0..1000 {
1289            let pos = i * 100;
1290            assert!(!set.is_duplicate(pos), "Position {} should not be duplicate before insert", pos);
1291            set.insert(pos);
1292            assert!(set.is_duplicate(pos), "Position {} should be duplicate after insert", pos);
1293        }
1294
1295        // Verify all positions are still correctly identified
1296        for i in 0..1000 {
1297            let pos = i * 100;
1298            assert!(set.is_duplicate(pos), "Position {} should be duplicate", pos);
1299            assert!(set.is_duplicate(pos + 1), "Position {} should be duplicate (tolerance)", pos + 1);
1300            // Position between functions should not be duplicate
1301            if i < 999 {
1302                assert!(!set.is_duplicate(pos + 50), "Position {} should NOT be duplicate (between functions)", pos + 50);
1303            }
1304        }
1305    }
1306
1307    #[test]
1308    fn test_extract_java_methods_with_fallback() {
1309        // Java uses @method/@constructor captures instead of @function
1310        // This test verifies that the fallback node selection works correctly
1311        let source = r#"
1312public class Calculator {
1313    public int add(int a, int b) {
1314        return a + b;
1315    }
1316
1317    public Calculator() {
1318        // constructor
1319    }
1320
1321    private void helper() {
1322        // helper method
1323    }
1324}
1325"#;
1326        let file = create_temp_file(source, ".java");
1327        let result = AstExtractor::extract_file(file.path()).unwrap();
1328
1329        // Verify class was extracted
1330        assert_eq!(result.classes.len(), 1, "Should extract Calculator class");
1331        let calc = &result.classes[0];
1332        assert_eq!(calc.name, "Calculator");
1333
1334        // Verify methods were extracted (Java extracts methods as part of class)
1335        // The methods should have been extracted correctly even without @function capture
1336        assert!(
1337            calc.methods.len() >= 2,
1338            "Should extract at least 2 methods from Calculator, got {}",
1339            calc.methods.len()
1340        );
1341
1342        // Verify specific method names
1343        let method_names: Vec<&str> = calc.methods.iter().map(|m| m.name.as_str()).collect();
1344        assert!(
1345            method_names.contains(&"add"),
1346            "Should find 'add' method, found: {:?}",
1347            method_names
1348        );
1349    }
1350
1351    #[test]
1352    fn test_extract_go_structs_with_fallback() {
1353        // Go uses @struct/@interface captures instead of @class
1354        // This test verifies that the fallback node selection works correctly
1355        let source = r#"
1356package main
1357
1358type Person struct {
1359    Name string
1360    Age  int
1361}
1362
1363type Speaker interface {
1364    Speak() string
1365}
1366"#;
1367        let file = create_temp_file(source, ".go");
1368        let result = AstExtractor::extract_file(file.path()).unwrap();
1369
1370        // Verify both struct and interface were extracted
1371        assert!(
1372            result.classes.len() >= 2,
1373            "Should extract Person struct and Speaker interface, got {}",
1374            result.classes.len()
1375        );
1376
1377        let names: Vec<&str> = result.classes.iter().map(|c| c.name.as_str()).collect();
1378        assert!(
1379            names.contains(&"Person"),
1380            "Should find Person struct, found: {:?}",
1381            names
1382        );
1383        assert!(
1384            names.contains(&"Speaker"),
1385            "Should find Speaker interface, found: {:?}",
1386            names
1387        );
1388    }
1389
1390    #[test]
1391    fn test_fallback_node_selection_helper_functions() {
1392        // Test the FUNCTION_NODE_KINDS and CLASS_NODE_KINDS constants
1393        // Verify they contain expected values
1394
1395        // Function kinds should include common function node types
1396        assert!(
1397            FUNCTION_NODE_KINDS.contains(&"function_definition"),
1398            "Should contain Python function_definition"
1399        );
1400        assert!(
1401            FUNCTION_NODE_KINDS.contains(&"method_declaration"),
1402            "Should contain Java method_declaration"
1403        );
1404        assert!(
1405            FUNCTION_NODE_KINDS.contains(&"function_item"),
1406            "Should contain Rust function_item"
1407        );
1408        assert!(
1409            FUNCTION_NODE_KINDS.contains(&"arrow_function"),
1410            "Should contain TypeScript arrow_function"
1411        );
1412
1413        // Class kinds should include common class/type node types
1414        assert!(
1415            CLASS_NODE_KINDS.contains(&"class_definition"),
1416            "Should contain Python class_definition"
1417        );
1418        assert!(
1419            CLASS_NODE_KINDS.contains(&"type_declaration"),
1420            "Should contain Go type_declaration"
1421        );
1422        assert!(
1423            CLASS_NODE_KINDS.contains(&"struct_specifier"),
1424            "Should contain C struct_specifier"
1425        );
1426        assert!(
1427            CLASS_NODE_KINDS.contains(&"class_declaration"),
1428            "Should contain Java/TS class_declaration"
1429        );
1430    }
1431
1432    #[test]
1433    fn test_query_caching() {
1434        // Note: Tests run in parallel and share the global cache,
1435        // so we cannot assume the cache is empty at the start.
1436        // Instead, we verify the caching behavior relative to the current state.
1437
1438        // Get baseline cache size (may have entries from other tests)
1439        let _baseline_size = query_cache_stats();
1440
1441        // Extract from a Python file - this will either use cached queries
1442        // or add new entries if Python wasn't processed yet
1443        let source = r#"
1444def hello():
1445    pass
1446
1447class World:
1448    pass
1449"#;
1450        let file = create_temp_file(source, ".py");
1451        let result = AstExtractor::extract_file(file.path()).unwrap();
1452
1453        // Verify extraction worked
1454        assert!(!result.functions.is_empty(), "Should extract at least one function");
1455        assert!(!result.classes.is_empty(), "Should extract at least one class");
1456
1457        // Cache should have at least 2 entries total (function + class queries for at least one language)
1458        let cache_size_after_python = query_cache_stats();
1459        assert!(
1460            cache_size_after_python >= 2,
1461            "Cache should have at least 2 entries (function + class), got {}",
1462            cache_size_after_python
1463        );
1464
1465        // Extract again from the same language - should use cached queries (cache size unchanged)
1466        let source2 = r#"
1467def another():
1468    return 42
1469"#;
1470        let file2 = create_temp_file(source2, ".py");
1471        let result2 = AstExtractor::extract_file(file2.path()).unwrap();
1472        assert!(!result2.functions.is_empty());
1473
1474        // Cache size should remain the same (Python queries were already cached)
1475        assert_eq!(
1476            query_cache_stats(),
1477            cache_size_after_python,
1478            "Cache size should remain the same when reusing same language"
1479        );
1480    }
1481
1482    #[test]
1483    fn test_query_cache_reuse() {
1484        // Test that same-language queries are reused (cache size doesn't increase)
1485        // This is the key property we care about for performance
1486
1487        // Process TypeScript file twice in a row
1488        let ts_source1 = "function greet(): string { return 'hello'; }";
1489        let ts_file1 = create_temp_file(ts_source1, ".ts");
1490        let _ = AstExtractor::extract_file(ts_file1.path()).unwrap();
1491
1492        let size_after_first = query_cache_stats();
1493
1494        // Process another TypeScript file - should reuse cached queries
1495        let ts_source2 = "const add = (a: number, b: number) => a + b;";
1496        let ts_file2 = create_temp_file(ts_source2, ".ts");
1497        let ts_result = AstExtractor::extract_file(ts_file2.path()).unwrap();
1498        assert!(!ts_result.functions.is_empty(), "Should extract TypeScript function");
1499
1500        let size_after_second = query_cache_stats();
1501
1502        // Cache size should not increase when processing same language
1503        assert_eq!(
1504            size_after_first, size_after_second,
1505            "Cache size should remain the same when reprocessing same language"
1506        );
1507    }
1508
1509    #[test]
1510    fn test_query_cache_thread_safety() {
1511        // Test that cache operations are thread-safe under concurrent access
1512        // Note: We don't clear the cache since it interferes with parallel tests
1513        use std::thread;
1514
1515        let handles: Vec<_> = (0..4)
1516            .map(|i| {
1517                thread::spawn(move || {
1518                    let source = format!(
1519                        r#"
1520def func_{}():
1521    pass
1522"#,
1523                        i
1524                    );
1525                    let file = create_temp_file(&source, ".py");
1526                    let result = AstExtractor::extract_file(file.path());
1527                    assert!(result.is_ok(), "Extraction should succeed in thread {}", i);
1528                })
1529            })
1530            .collect();
1531
1532        for handle in handles {
1533            handle.join().expect("Thread should complete successfully");
1534        }
1535
1536        // After concurrent access, cache should have entries
1537        let cache_size = query_cache_stats();
1538        assert!(
1539            cache_size >= 2,
1540            "Cache should have entries after concurrent access, got {}",
1541            cache_size
1542        );
1543    }
1544
1545    // =========================================================================
1546    // Parser Caching Tests
1547    // =========================================================================
1548
1549    #[test]
1550    fn test_parser_caching_basic() {
1551        // Parser cache is thread-local, so we can test it in isolation
1552        // Note: Other tests in the same thread may have cached parsers
1553
1554        // Process multiple Python files - should reuse cached parser
1555        let source1 = r#"
1556def hello():
1557    pass
1558"#;
1559        let source2 = r#"
1560def world():
1561    return 42
1562"#;
1563        let file1 = create_temp_file(source1, ".py");
1564        let file2 = create_temp_file(source2, ".py");
1565
1566        // Both extractions should succeed and use the cached parser
1567        let result1 = AstExtractor::extract_file(file1.path());
1568        assert!(result1.is_ok(), "First extraction should succeed");
1569
1570        let result2 = AstExtractor::extract_file(file2.path());
1571        assert!(result2.is_ok(), "Second extraction should succeed (using cached parser)");
1572
1573        // Verify parsing was correct
1574        assert!(!result1.unwrap().functions.is_empty());
1575        assert!(!result2.unwrap().functions.is_empty());
1576    }
1577
1578    #[test]
1579    fn test_parser_caching_multiple_languages() {
1580        // Test that parsers for different languages are cached separately
1581
1582        let py_source = "def hello(): pass";
1583        let ts_source = "function hello(): void {}";
1584        let go_source = "package main\nfunc hello() {}";
1585
1586        let py_file = create_temp_file(py_source, ".py");
1587        let ts_file = create_temp_file(ts_source, ".ts");
1588        let go_file = create_temp_file(go_source, ".go");
1589
1590        // Extract from multiple languages
1591        let py_result = AstExtractor::extract_file(py_file.path());
1592        let ts_result = AstExtractor::extract_file(ts_file.path());
1593        let go_result = AstExtractor::extract_file(go_file.path());
1594
1595        // All should succeed
1596        assert!(py_result.is_ok(), "Python extraction should succeed");
1597        assert!(ts_result.is_ok(), "TypeScript extraction should succeed");
1598        assert!(go_result.is_ok(), "Go extraction should succeed");
1599
1600        // Verify correct language was detected
1601        assert_eq!(py_result.unwrap().language, "python");
1602        assert_eq!(ts_result.unwrap().language, "typescript");
1603        assert_eq!(go_result.unwrap().language, "go");
1604
1605        // Parser cache should have entries for each language
1606        let cache_size = parser_cache_stats();
1607        assert!(
1608            cache_size >= 3,
1609            "Cache should have at least 3 parsers (one per language), got {}",
1610            cache_size
1611        );
1612    }
1613
1614    #[test]
1615    fn test_parser_caching_extract_from_source() {
1616        // Test that extract_from_source also uses cached parsers
1617
1618        let source1 = "def foo(): pass";
1619        let source2 = "def bar(): return 1";
1620
1621        let result1 = AstExtractor::extract_from_source(source1, "python");
1622        let result2 = AstExtractor::extract_from_source(source2, "python");
1623
1624        assert!(result1.is_ok(), "First extract_from_source should succeed");
1625        assert!(result2.is_ok(), "Second extract_from_source should succeed (cached)");
1626
1627        assert_eq!(result1.unwrap().functions[0].name, "foo");
1628        assert_eq!(result2.unwrap().functions[0].name, "bar");
1629    }
1630
1631    #[test]
1632    fn test_parser_cache_clear() {
1633        // Test that clearing the parser cache works
1634
1635        // Populate cache with a parser
1636        let source = "def test(): pass";
1637        let file = create_temp_file(source, ".py");
1638        let _ = AstExtractor::extract_file(file.path()).unwrap();
1639
1640        // Cache should have at least one parser
1641        let before_clear = parser_cache_stats();
1642        assert!(before_clear >= 1, "Cache should have at least 1 parser before clear");
1643
1644        // Clear the cache
1645        clear_parser_cache();
1646
1647        // Cache should be empty
1648        let after_clear = parser_cache_stats();
1649        assert_eq!(after_clear, 0, "Cache should be empty after clear");
1650
1651        // Extraction should still work (creates new parser)
1652        let source2 = "def another(): pass";
1653        let file2 = create_temp_file(source2, ".py");
1654        let result = AstExtractor::extract_file(file2.path());
1655        assert!(result.is_ok(), "Extraction should work after cache clear");
1656
1657        // Cache should have one parser again
1658        let after_extraction = parser_cache_stats();
1659        assert_eq!(after_extraction, 1, "Cache should have 1 parser after extraction");
1660    }
1661
1662    #[test]
1663    fn test_parser_cache_thread_local() {
1664        // Test that parser cache is truly thread-local (each thread has its own cache)
1665        use std::thread;
1666
1667        // Clear cache in main thread
1668        clear_parser_cache();
1669
1670        // Populate cache in main thread
1671        let source = "def main_thread(): pass";
1672        let file = create_temp_file(source, ".py");
1673        let _ = AstExtractor::extract_file(file.path()).unwrap();
1674
1675        let main_thread_cache = parser_cache_stats();
1676        assert!(main_thread_cache >= 1, "Main thread cache should have parser");
1677
1678        // Spawn a new thread and check its cache
1679        let handle = thread::spawn(|| {
1680            // New thread should have empty cache
1681            let child_cache_before = parser_cache_stats();
1682
1683            // Parse something in the child thread
1684            let source = "def child_thread(): pass";
1685            let file = create_temp_file(source, ".py");
1686            let _ = AstExtractor::extract_file(file.path()).unwrap();
1687
1688            let child_cache_after = parser_cache_stats();
1689
1690            (child_cache_before, child_cache_after)
1691        });
1692
1693        let (child_before, child_after) = handle.join().unwrap();
1694
1695        // Child thread should have started with empty cache (thread-local)
1696        assert_eq!(
1697            child_before, 0,
1698            "Child thread should start with empty cache"
1699        );
1700        assert!(
1701            child_after >= 1,
1702            "Child thread should have parser after extraction"
1703        );
1704
1705        // Main thread cache should be unchanged
1706        let main_thread_cache_after = parser_cache_stats();
1707        assert_eq!(
1708            main_thread_cache, main_thread_cache_after,
1709            "Main thread cache should be unchanged by child thread"
1710        );
1711    }
1712}