Skip to main content

normalize_facts/
extract.rs

1//! Shared symbol extraction from source code.
2//!
3//! This module provides the core AST traversal logic for extracting
4//! symbols, imports, and other facts from source files.
5//!
6//! ## Extraction paths
7//!
8//! Symbol extraction uses the tags path: a `*.tags.scm` query is run against the
9//! tree-sitter parse tree. The query groups `@definition.*` + `@name` captures per
10//! match, reconstructs nesting by line-range containment, and returns a `Vec<Symbol>`.
11//!
12//! After extraction, post-processing steps apply (Rust impl-block merging,
13//! TypeScript/JavaScript interface marking).
14
15use crate::ca_cache;
16use crate::parsers;
17use normalize_facts_core::SymbolKind;
18use normalize_languages::{Language, Symbol, Visibility, support_for_path};
19use std::path::Path;
20use streaming_iterator::StreamingIterator;
21use tree_sitter;
22
23/// Result of extracting symbols from a file.
24pub struct ExtractResult {
25    /// Top-level symbols (nested structure preserved)
26    pub symbols: Vec<Symbol>,
27    /// File path for context
28    pub file_path: String,
29}
30
31impl ExtractResult {
32    /// Filter to only type definitions (class, struct, enum, trait, interface)
33    /// Returns a new ExtractResult with only type-like symbols, and strips methods from classes
34    pub fn filter_types(&self) -> ExtractResult {
35        use normalize_facts_core::SymbolKind;
36
37        fn is_type_kind(kind: SymbolKind) -> bool {
38            matches!(
39                kind,
40                SymbolKind::Class
41                    | SymbolKind::Struct
42                    | SymbolKind::Enum
43                    | SymbolKind::Trait
44                    | SymbolKind::Interface
45                    | SymbolKind::Type
46                    | SymbolKind::Module
47            )
48        }
49
50        fn filter_symbol(sym: &Symbol) -> Option<Symbol> {
51            if is_type_kind(sym.kind) {
52                // For types, keep only nested types (not methods)
53                let type_children: Vec<_> = sym.children.iter().filter_map(filter_symbol).collect();
54                Some(Symbol {
55                    name: sym.name.clone(),
56                    kind: sym.kind,
57                    signature: sym.signature.clone(),
58                    docstring: sym.docstring.clone(),
59                    attributes: Vec::new(),
60                    start_line: sym.start_line,
61                    end_line: sym.end_line,
62                    visibility: sym.visibility,
63                    children: type_children,
64                    is_interface_impl: sym.is_interface_impl,
65                    implements: sym.implements.clone(),
66                })
67            } else {
68                None
69            }
70        }
71
72        let filtered_symbols: Vec<_> = self.symbols.iter().filter_map(filter_symbol).collect();
73
74        ExtractResult {
75            symbols: filtered_symbols,
76            file_path: self.file_path.clone(),
77        }
78    }
79
80    /// Filter out test functions and test modules.
81    /// Uses Language::is_test_symbol() for language-specific detection.
82    pub fn filter_tests(&self) -> ExtractResult {
83        use normalize_languages::support_for_path;
84        use std::path::Path;
85
86        let lang = support_for_path(Path::new(&self.file_path));
87
88        fn filter_symbol(sym: &Symbol, lang: Option<&dyn Language>) -> Option<Symbol> {
89            let is_test = match lang {
90                Some(l) => l.is_test_symbol(sym),
91                None => false, // Unknown language: keep everything
92            };
93            if is_test {
94                return None;
95            }
96            let filtered_children: Vec<_> = sym
97                .children
98                .iter()
99                .filter_map(|c| filter_symbol(c, lang))
100                .collect();
101            Some(Symbol {
102                name: sym.name.clone(),
103                kind: sym.kind,
104                signature: sym.signature.clone(),
105                docstring: sym.docstring.clone(),
106                attributes: sym.attributes.clone(),
107                start_line: sym.start_line,
108                end_line: sym.end_line,
109                visibility: sym.visibility,
110                children: filtered_children,
111                is_interface_impl: sym.is_interface_impl,
112                implements: sym.implements.clone(),
113            })
114        }
115
116        let lang_ref: Option<&dyn Language> = lang.map(|l| l as &dyn Language);
117        let filtered_symbols: Vec<_> = self
118            .symbols
119            .iter()
120            .filter_map(|s| filter_symbol(s, lang_ref))
121            .collect();
122
123        ExtractResult {
124            symbols: filtered_symbols,
125            file_path: self.file_path.clone(),
126        }
127    }
128}
129
130/// Options for symbol extraction.
131#[derive(Clone)]
132pub struct ExtractOptions {
133    /// Include private/non-public symbols (default: true for code exploration)
134    pub include_private: bool,
135}
136
137impl Default for ExtractOptions {
138    fn default() -> Self {
139        Self {
140            // Default to including all symbols - normalize is for code exploration,
141            // not API documentation. This ensures trait impl methods are visible.
142            include_private: true,
143        }
144    }
145}
146
147/// Resolver for cross-file interface method lookups.
148/// Used to find interface/class method signatures from other files.
149pub trait InterfaceResolver {
150    /// Get method names for an interface/class by name.
151    /// Returns None if the interface cannot be resolved (external, missing, etc.).
152    fn resolve_interface_methods(&self, name: &str, current_file: &str) -> Option<Vec<String>>;
153}
154
155/// Resolver that parses files on-demand for cross-file interface lookups.
156/// This is the fallback when no index is available.
157pub struct OnDemandResolver<'a> {
158    root: &'a std::path::Path,
159}
160
161impl<'a> OnDemandResolver<'a> {
162    pub fn new(root: &'a std::path::Path) -> Self {
163        Self { root }
164    }
165}
166
167impl InterfaceResolver for OnDemandResolver<'_> {
168    fn resolve_interface_methods(&self, name: &str, current_file: &str) -> Option<Vec<String>> {
169        use normalize_languages::support_for_path;
170
171        let current_path = std::path::Path::new(current_file);
172        let current_dir = current_path.parent()?;
173
174        // Try common patterns for interface files
175        // This is a heuristic - we check nearby files that might contain the interface
176        let candidates = [
177            "types.ts",
178            "interfaces.ts",
179            "index.ts",
180            "../types.ts",
181            "../interfaces.ts",
182            "../index.ts",
183        ];
184
185        for candidate in candidates {
186            let candidate_path = if candidate.starts_with("..") {
187                current_dir.parent()?.join(&candidate[3..])
188            } else {
189                current_dir.join(candidate)
190            };
191
192            // Try with root prefix
193            let full_path = self.root.join(&candidate_path);
194            if !full_path.exists() {
195                continue;
196            }
197
198            let content = std::fs::read_to_string(&full_path).ok()?;
199            // Verify it's a supported file type
200            let _support = support_for_path(&full_path)?;
201
202            // Parse the file and look for the interface
203            let extractor = Extractor::new();
204            // Don't use resolver here to avoid recursion
205            let result = extractor.extract(&full_path, &content);
206
207            for sym in &result.symbols {
208                if sym.name == name
209                    && matches!(
210                        sym.kind,
211                        normalize_languages::SymbolKind::Interface
212                            | normalize_languages::SymbolKind::Class
213                    )
214                {
215                    let methods: Vec<String> = sym
216                        .children
217                        .iter()
218                        .filter(|c| {
219                            matches!(
220                                c.kind,
221                                normalize_languages::SymbolKind::Method
222                                    | normalize_languages::SymbolKind::Function
223                            )
224                        })
225                        .map(|c| c.name.clone())
226                        .collect();
227                    if !methods.is_empty() {
228                        return Some(methods);
229                    }
230                }
231            }
232        }
233
234        None
235    }
236}
237
238/// Shared symbol extractor using the Language trait.
239pub struct Extractor {
240    options: ExtractOptions,
241}
242
243impl Default for Extractor {
244    fn default() -> Self {
245        Self::new()
246    }
247}
248
249impl Extractor {
250    pub fn new() -> Self {
251        Self {
252            options: ExtractOptions::default(),
253        }
254    }
255
256    pub fn with_options(options: ExtractOptions) -> Self {
257        Self { options }
258    }
259
260    /// Extract symbols from a file.
261    pub fn extract(&self, path: &Path, content: &str) -> ExtractResult {
262        self.extract_with_resolver(path, content, None)
263    }
264
265    /// Extract symbols from a file with optional cross-file interface resolution.
266    pub fn extract_with_resolver(
267        &self,
268        path: &Path,
269        content: &str,
270        resolver: Option<&dyn InterfaceResolver>,
271    ) -> ExtractResult {
272        let file_path = path.to_string_lossy().to_string();
273        let symbols = match support_for_path(path) {
274            Some(support) => self.extract_with_support(content, support, resolver, &file_path),
275            None => Vec::new(),
276        };
277
278        ExtractResult { symbols, file_path }
279    }
280
281    fn extract_with_support(
282        &self,
283        content: &str,
284        support: &dyn Language,
285        resolver: Option<&dyn InterfaceResolver>,
286        current_file: &str,
287    ) -> Vec<Symbol> {
288        let grammar_name = support.grammar_name();
289
290        // Cache version encodes the extraction schema and include_private flag.
291        // Bump this string whenever the Symbol struct, post-processing, or query
292        // semantics change in a way that invalidates existing cached results.
293        // Cross-file resolver results are not cached (resolver.is_none() guard below).
294        let cache_ver = if self.options.include_private {
295            "symbols-v1-all"
296        } else {
297            "symbols-v1-public"
298        };
299
300        // Check the persistent symbol cache before parsing (only when no cross-file
301        // resolver is involved, as resolver results depend on other files).
302        if resolver.is_none()
303            && let Some(cache) = ca_cache::symbol_cache()
304        {
305            let hash = blake3::hash(content.as_bytes());
306            match cache.get::<Vec<Symbol>>(hash.as_bytes(), cache_ver, grammar_name) {
307                Ok(Some(cached)) => return cached,
308                Ok(None) => {} // cache miss — fall through to parse
309                Err(e) => {
310                    tracing::debug!(
311                        "normalize-facts: symbol cache get error for {}: {}",
312                        current_file,
313                        e
314                    );
315                }
316            }
317        }
318
319        let tree = match parsers::parse_with_grammar(grammar_name, content) {
320            Some(t) => t,
321            None => return Vec::new(),
322        };
323
324        // Use the tags-based extraction path with cached compiled queries.
325        let loader = parsers::grammar_loader();
326        let mut symbols = if let Some(tags_query_str) = loader.get_tags(grammar_name) {
327            loader
328                .get_compiled_query(grammar_name, "tags", &tags_query_str)
329                .and_then(|query| {
330                    collect_symbols_from_tags(
331                        &tree,
332                        &query,
333                        content,
334                        support,
335                        self.options.include_private,
336                    )
337                })
338                .unwrap_or_default()
339        } else {
340            Vec::new()
341        };
342
343        // Post-process for Rust: merge impl blocks with their types
344        if grammar_name == "rust" {
345            Self::merge_rust_impl_blocks(&mut symbols);
346        }
347
348        // Post-process for Haskell: deduplicate functions by name (multi-equation definitions
349        // produce one `function` node per equation, each with the same name field).
350        if grammar_name == "haskell" {
351            Self::dedup_haskell_functions(&mut symbols);
352        }
353
354        // Post-process for TypeScript/JavaScript: mark interface implementations
355        if grammar_name == "typescript" || grammar_name == "javascript" {
356            Self::mark_interface_implementations(&mut symbols, resolver, current_file);
357        }
358
359        // Store in the persistent symbol cache (only when no cross-file resolver
360        // was used, so the result is fully content-addressed).
361        if resolver.is_none()
362            && let Some(cache) = ca_cache::symbol_cache()
363        {
364            let hash = blake3::hash(content.as_bytes());
365            if let Err(e) = cache.put(hash.as_bytes(), cache_ver, grammar_name, &symbols) {
366                tracing::debug!(
367                    "normalize-facts: symbol cache put error for {}: {}",
368                    current_file,
369                    e
370                );
371            }
372        }
373
374        symbols
375    }
376
377    /// Deduplicate Haskell function symbols by name.
378    ///
379    /// Haskell allows multi-equation function definitions where each equation is a
380    /// separate top-level declaration. The tree-sitter-haskell grammar produces one
381    /// `function` node per equation, each with the same `name` field. The tags query
382    /// therefore captures the same function name once per equation. This pass keeps
383    /// only the first occurrence of each (name, kind) pair at the top level, and merges
384    /// the byte ranges by extending the first occurrence's `end_line` to cover all
385    /// equations (so the symbol spans the complete definition).
386    fn dedup_haskell_functions(symbols: &mut Vec<Symbol>) {
387        // Use a Vec<(name, kind)> for seen tracking since SymbolKind doesn't derive Hash.
388        let mut seen: Vec<(String, SymbolKind)> = Vec::new();
389        let mut i = 0;
390        while i < symbols.len() {
391            let key = (symbols[i].name.clone(), symbols[i].kind);
392            if seen.contains(&key) {
393                symbols.remove(i);
394            } else {
395                seen.push(key);
396                i += 1;
397            }
398        }
399    }
400
401    /// Merge Rust impl blocks with their corresponding struct/enum types
402    fn merge_rust_impl_blocks(symbols: &mut Vec<Symbol>) {
403        use std::collections::HashMap;
404
405        // Collect impl blocks: their children and implements lists
406        let mut impl_methods: HashMap<String, Vec<Symbol>> = HashMap::new();
407        let mut impl_implements: HashMap<String, Vec<String>> = HashMap::new();
408
409        // Remove impl blocks and collect their methods + implements
410        symbols.retain(|sym| {
411            if sym.signature.starts_with("impl ") {
412                impl_methods
413                    .entry(sym.name.clone())
414                    .or_default()
415                    .extend(sym.children.clone());
416                impl_implements
417                    .entry(sym.name.clone())
418                    .or_default()
419                    .extend(sym.implements.clone());
420                return false;
421            }
422            true
423        });
424
425        // Add methods and implements to matching struct/enum
426        for sym in symbols.iter_mut() {
427            if matches!(
428                sym.kind,
429                normalize_languages::SymbolKind::Struct | normalize_languages::SymbolKind::Enum
430            ) {
431                if let Some(methods) = impl_methods.remove(&sym.name) {
432                    sym.children.extend(methods);
433                }
434                if let Some(impls) = impl_implements.remove(&sym.name) {
435                    sym.implements.extend(impls);
436                }
437            }
438        }
439
440        // Any remaining impl blocks without matching type: add back
441        for (name, methods) in impl_methods {
442            let impls = impl_implements.remove(&name).unwrap_or_default();
443            if !methods.is_empty() {
444                symbols.push(Symbol {
445                    name: name.clone(),
446                    kind: normalize_languages::SymbolKind::Module, // impl as module-like
447                    signature: format!("impl {}", name),
448                    docstring: None,
449                    attributes: Vec::new(),
450                    start_line: methods.first().map(|m| m.start_line).unwrap_or(0),
451                    end_line: methods.last().map(|m| m.end_line).unwrap_or(0),
452                    visibility: Visibility::Public,
453                    children: methods,
454                    is_interface_impl: !impls.is_empty(),
455                    implements: impls,
456                });
457            }
458        }
459    }
460
461    /// Mark methods that implement interfaces (for TypeScript/JavaScript).
462    /// Builds a map of interface/class names to their method names,
463    /// then marks matching methods in classes that extend/implement them.
464    ///
465    /// If a resolver is provided, it will be used to look up interface methods
466    /// from other files when not found locally.
467    fn mark_interface_implementations(
468        symbols: &mut [Symbol],
469        resolver: Option<&dyn InterfaceResolver>,
470        current_file: &str,
471    ) {
472        use std::collections::{HashMap, HashSet};
473
474        // First pass: collect method names from interfaces and classes in this file
475        // (these could be parent classes that get extended)
476        let mut type_methods: HashMap<String, HashSet<String>> = HashMap::new();
477
478        fn collect_type_methods(
479            symbols: &[Symbol],
480            type_methods: &mut HashMap<String, HashSet<String>>,
481        ) {
482            for sym in symbols {
483                if matches!(
484                    sym.kind,
485                    normalize_languages::SymbolKind::Interface
486                        | normalize_languages::SymbolKind::Class
487                ) {
488                    let methods: HashSet<String> = sym
489                        .children
490                        .iter()
491                        .filter(|c| {
492                            matches!(
493                                c.kind,
494                                normalize_languages::SymbolKind::Method
495                                    | normalize_languages::SymbolKind::Function
496                            )
497                        })
498                        .map(|c| c.name.clone())
499                        .collect();
500                    if !methods.is_empty() {
501                        type_methods.insert(sym.name.clone(), methods);
502                    }
503                }
504                // Recurse into nested types
505                collect_type_methods(&sym.children, type_methods);
506            }
507        }
508
509        collect_type_methods(symbols, &mut type_methods);
510
511        // Cache for cross-file resolved interfaces (avoid repeated lookups)
512        let mut cross_file_cache: HashMap<String, Option<HashSet<String>>> = HashMap::new();
513
514        // Second pass: mark methods in classes that implement/extend
515        fn mark_methods(
516            symbols: &mut [Symbol],
517            type_methods: &HashMap<String, HashSet<String>>,
518            resolver: Option<&dyn InterfaceResolver>,
519            current_file: &str,
520            cross_file_cache: &mut HashMap<String, Option<HashSet<String>>>,
521        ) {
522            for sym in symbols.iter_mut() {
523                if !sym.implements.is_empty() {
524                    // Collect all method names from all implemented interfaces/parents
525                    let mut interface_methods: HashSet<String> = HashSet::new();
526
527                    for parent_name in &sym.implements {
528                        // Try same-file first
529                        if let Some(methods) = type_methods.get(parent_name) {
530                            interface_methods.extend(methods.clone());
531                        } else if let Some(resolver) = resolver {
532                            // Try cross-file resolution with caching
533                            let cached = cross_file_cache
534                                .entry(parent_name.clone())
535                                .or_insert_with(|| {
536                                    resolver
537                                        .resolve_interface_methods(parent_name, current_file)
538                                        .map(|v| v.into_iter().collect())
539                                });
540                            if let Some(methods) = cached {
541                                interface_methods.extend(methods.clone());
542                            }
543                        }
544                    }
545
546                    // Mark matching methods
547                    for child in &mut sym.children {
548                        if matches!(
549                            child.kind,
550                            normalize_languages::SymbolKind::Method
551                                | normalize_languages::SymbolKind::Function
552                        ) && interface_methods.contains(&child.name)
553                        {
554                            child.is_interface_impl = true;
555                        }
556                    }
557                }
558                // Recurse
559                mark_methods(
560                    &mut sym.children,
561                    type_methods,
562                    resolver,
563                    current_file,
564                    cross_file_cache,
565                );
566            }
567        }
568
569        mark_methods(
570            symbols,
571            &type_methods,
572            resolver,
573            current_file,
574            &mut cross_file_cache,
575        );
576    }
577}
578
579/// Map a `@definition.*` capture name to a `SymbolKind`.
580///
581/// Returns `None` for capture names that are not definitions (e.g., `reference.call`),
582/// which should be ignored during symbol extraction.
583fn tags_capture_to_kind(capture_name: &str) -> Option<SymbolKind> {
584    match capture_name {
585        "definition.function" => Some(SymbolKind::Function),
586        // Methods are tagged as Function here; they get re-classified to Method
587        // once we reconstruct nesting (children of containers become methods).
588        "definition.method" => Some(SymbolKind::Function),
589        "definition.class" => Some(SymbolKind::Class),
590        "definition.interface" => Some(SymbolKind::Interface),
591        "definition.module" => Some(SymbolKind::Module),
592        "definition.type" => Some(SymbolKind::Type),
593        "definition.enum" => Some(SymbolKind::Enum),
594        "definition.heading" => Some(SymbolKind::Heading),
595        // No Macro variant — map to Function (closest semantic equivalent)
596        "definition.macro" => Some(SymbolKind::Function),
597        "definition.constant" => Some(SymbolKind::Constant),
598        "definition.var" => Some(SymbolKind::Variable),
599        _ => None,
600    }
601}
602
603/// Whether a `SymbolKind` is a container that can hold child symbols.
604fn is_container_kind(kind: SymbolKind) -> bool {
605    matches!(
606        kind,
607        SymbolKind::Class
608            | SymbolKind::Interface
609            | SymbolKind::Module
610            | SymbolKind::Enum
611            | SymbolKind::Heading
612    )
613}
614
615/// Intermediate record built from a single tags-query match before nesting reconstruction.
616/// Retains the node ID so we can call Language trait methods on the correct node.
617struct TagDef<'tree> {
618    /// The definition AST node (e.g. function_item, class_definition).
619    node: tree_sitter::Node<'tree>,
620    /// `SymbolKind` derived from the `@definition.*` capture name.
621    kind: SymbolKind,
622    /// True when the capture name was `definition.method` (explicit method tag).
623    is_method_capture: bool,
624    /// True when the capture name identifies a container kind (class/interface/module).
625    is_container: bool,
626    /// Line numbers (1-indexed) of the definition node.
627    start_line: usize,
628    end_line: usize,
629}
630
631/// Build a `Symbol` from a single `TagDef` using the Language semantic hooks.
632fn build_symbol_from_def<'tree>(
633    def: &TagDef<'tree>,
634    content: &str,
635    support: &dyn Language,
636    in_container: bool,
637) -> Option<Symbol> {
638    let name = support.node_name(&def.node, content)?;
639    let tag_kind = support.refine_kind(&def.node, content, def.kind);
640    let kind =
641        if def.is_method_capture || (in_container && matches!(tag_kind, SymbolKind::Function)) {
642            SymbolKind::Method
643        } else {
644            tag_kind
645        };
646    let implements_info = if def.is_container {
647        support.extract_implements(&def.node, content)
648    } else {
649        normalize_languages::ImplementsInfo::default()
650    };
651    Some(Symbol {
652        name: name.to_string(),
653        kind,
654        signature: support.build_signature(&def.node, content),
655        docstring: support.extract_docstring(&def.node, content),
656        attributes: support.extract_attributes(&def.node, content),
657        start_line: def.node.start_position().row + 1,
658        end_line: def.node.end_position().row + 1,
659        visibility: support.get_visibility(&def.node, content),
660        children: Vec::new(),
661        is_interface_impl: implements_info.is_interface,
662        implements: implements_info.implements,
663    })
664}
665
666/// Extract symbols from a parsed tree using a tags query.
667///
668/// Uses the tags query for *node classification* (which nodes are which kind of def),
669/// then calls the Language semantic hooks on those nodes for symbol content
670/// (name, signature, visibility, docstring, implements, attributes, etc.).
671///
672/// Nesting is reconstructed by line-range containment: a def whose line range is
673/// fully enclosed by a container def is placed as a child of that container.
674///
675/// Returns `None` if the query produces no definition matches (caller falls back to
676/// the trait path).
677fn collect_symbols_from_tags<'tree>(
678    tree: &'tree tree_sitter::Tree,
679    query: &tree_sitter::Query,
680    content: &str,
681    support: &dyn Language,
682    include_private: bool,
683) -> Option<Vec<Symbol>> {
684    let capture_names = query.capture_names();
685
686    // We require a @name capture to be present in the query.
687    let name_idx = capture_names.iter().position(|n| *n == "name")?;
688    let _ = name_idx; // present but not needed — definition node gives us position
689
690    // Run the query and collect TagDef records.
691    let root = tree.root_node();
692    let mut qcursor = tree_sitter::QueryCursor::new();
693    let mut matches = qcursor.matches(query, root, content.as_bytes());
694
695    let mut defs: Vec<TagDef<'tree>> = Vec::new();
696
697    while let Some(m) = matches.next() {
698        // Each match should contain a @definition.* capture.
699        // We skip matches that have no definition capture (e.g. pure reference matches).
700        let mut def_capture: Option<(tree_sitter::Node<'tree>, &str)> = None;
701
702        for capture in m.captures {
703            let cn = &capture_names[capture.index as usize];
704            if tags_capture_to_kind(cn).is_some() {
705                // SAFETY: The tree lives as long as 'tree; captures borrow from it.
706                let node = capture.node;
707                def_capture = Some((node, cn));
708            }
709        }
710
711        let Some((def_node, capture_name)) = def_capture else {
712            continue;
713        };
714        let kind = match tags_capture_to_kind(capture_name) {
715            Some(k) => k,
716            None => continue,
717        };
718
719        // Apply language-specific kind refinement before determining container status,
720        // so languages like JSON can promote Variable → Module for object-valued pairs.
721        let refined_kind = support.refine_kind(&def_node, content, kind);
722        defs.push(TagDef {
723            node: def_node,
724            kind,
725            is_method_capture: capture_name == "definition.method",
726            is_container: is_container_kind(refined_kind),
727            start_line: def_node.start_position().row + 1,
728            end_line: def_node.end_position().row + 1,
729        });
730    }
731
732    if defs.is_empty() {
733        return None;
734    }
735
736    // Sort by start line, with outer containers before inner defs at the same line.
737    defs.sort_by(|a, b| {
738        a.start_line
739            .cmp(&b.start_line)
740            .then(b.end_line.cmp(&a.end_line))
741    });
742
743    // De-duplicate: remove defs with identical byte ranges.
744    // Some tags queries match the same node multiple times (e.g. both a generic and
745    // a specific pattern). Keep the first (which has the most specific kind after sorting).
746    defs.dedup_by(|b, a| {
747        a.node.start_byte() == b.node.start_byte() && a.node.end_byte() == b.node.end_byte()
748    });
749
750    // Container indices (for nesting reconstruction).
751    let container_idxs: Vec<usize> = (0..defs.len()).filter(|&i| defs[i].is_container).collect();
752
753    // Two-phase assembly: first build all symbols with parent info, then assemble tree.
754    // This supports arbitrary nesting depth (namespaces > classes > methods, or
755    // deeply nested data format keys).
756
757    // Phase 1: Build symbols and record parent relationships.
758    // symbols[i] corresponds to defs[i] (None if skipped due to visibility).
759    let mut symbols: Vec<Option<Symbol>> = Vec::with_capacity(defs.len());
760    let mut parent_of: Vec<Option<usize>> = Vec::with_capacity(defs.len()); // def_idx → parent def_idx
761
762    for i in 0..defs.len() {
763        let def = &defs[i];
764
765        let enclosing_ci = container_idxs
766            .iter()
767            .filter(|&&ci| ci != i)
768            .rev()
769            .find(|&&ci| {
770                let c = &defs[ci];
771                c.start_line <= def.start_line && c.end_line >= def.end_line
772            });
773
774        let in_container = enclosing_ci.is_some();
775
776        let Some(mut sym) = build_symbol_from_def(def, content, support, in_container) else {
777            symbols.push(None);
778            parent_of.push(None);
779            continue;
780        };
781
782        if !include_private
783            && matches!(
784                sym.visibility,
785                Visibility::Private | Visibility::Protected | Visibility::Internal
786            )
787        {
788            symbols.push(None);
789            parent_of.push(None);
790            continue;
791        }
792
793        if def.is_container {
794            sym.children.clear();
795        }
796
797        symbols.push(Some(sym));
798        parent_of.push(enclosing_ci.copied());
799    }
800
801    // Phase 2: Assemble tree bottom-up. Process in reverse order so children are
802    // moved into their parents before the parent is moved into its parent.
803    // We use Vec<Option<Symbol>> so we can take ownership via .take().
804    for i in (0..symbols.len()).rev() {
805        if let Some(pi) = parent_of[i]
806            && symbols[pi].is_some()
807            && symbols[i].is_some()
808        {
809            let child = symbols[i].take().unwrap();
810            symbols[pi].as_mut().unwrap().children.push(child);
811        }
812    }
813
814    // Collect remaining top-level symbols (those not moved into a parent).
815    // Reverse children since we assembled bottom-up.
816    let mut top_level: Vec<Symbol> = Vec::new();
817    for sym_opt in &mut symbols {
818        if let Some(mut sym) = sym_opt.take() {
819            sym.children.reverse();
820            reverse_children_recursive(&mut sym.children);
821            top_level.push(sym);
822        }
823    }
824
825    if top_level.is_empty() {
826        None
827    } else {
828        Some(top_level)
829    }
830}
831
832/// Recursively reverse children vectors (needed because bottom-up assembly reverses order).
833fn reverse_children_recursive(children: &mut [Symbol]) {
834    for child in children.iter_mut() {
835        child.children.reverse();
836        reverse_children_recursive(&mut child.children);
837    }
838}
839
840/// Compute cyclomatic complexity for a function node using the `.complexity.scm` query.
841/// Returns 1 (base complexity) for languages without a complexity query.
842pub fn compute_complexity(
843    node: &tree_sitter::Node,
844    support: &dyn Language,
845    source: &[u8],
846) -> usize {
847    let grammar_name = support.grammar_name();
848    let loader = parsers::grammar_loader();
849    if let Some(scm) = loader.get_complexity(grammar_name)
850        && let Some(query) = loader.get_compiled_query(grammar_name, "complexity", &scm)
851    {
852        return count_complexity_with_query(node, source, &query);
853    }
854    1
855}
856
857/// Count complexity using a `@complexity` query.
858fn count_complexity_with_query(
859    node: &tree_sitter::Node,
860    source: &[u8],
861    query: &tree_sitter::Query,
862) -> usize {
863    let complexity_idx = query
864        .capture_names()
865        .iter()
866        .position(|n| *n == "complexity");
867    let Some(complexity_idx) = complexity_idx else {
868        return 1;
869    };
870
871    let mut qcursor = tree_sitter::QueryCursor::new();
872    qcursor.set_byte_range(node.byte_range());
873    let mut complexity = 1usize;
874    let mut matches = qcursor.matches(query, *node, source);
875    while let Some(m) = matches.next() {
876        for capture in m.captures {
877            if capture.index as usize == complexity_idx {
878                complexity += 1;
879            }
880        }
881    }
882    complexity
883}
884
885#[cfg(test)]
886mod tests {
887    use super::*;
888    use std::path::PathBuf;
889
890    #[test]
891    fn test_extract_python() {
892        let extractor = Extractor::new();
893        let content = r#"
894def foo(x: int) -> str:
895    """Convert int to string."""
896    return str(x)
897
898class Bar:
899    """A bar class."""
900    def method(self):
901        pass
902"#;
903        let result = extractor.extract(&PathBuf::from("test.py"), content);
904        assert_eq!(result.symbols.len(), 2);
905
906        let foo = &result.symbols[0];
907        assert_eq!(foo.name, "foo");
908        assert!(foo.signature.contains("def foo"));
909        assert_eq!(foo.docstring.as_deref(), Some("Convert int to string."));
910
911        let bar = &result.symbols[1];
912        assert_eq!(bar.name, "Bar");
913        assert_eq!(bar.children.len(), 1);
914        assert_eq!(bar.children[0].name, "method");
915    }
916
917    #[test]
918    fn test_extract_rust() {
919        let extractor = Extractor::new();
920        let content = r#"
921/// A simple struct
922pub struct Foo {
923    x: i32,
924}
925
926impl Foo {
927    /// Create a new Foo
928    pub fn new(x: i32) -> Self {
929        Self { x }
930    }
931}
932"#;
933        let result = extractor.extract(&PathBuf::from("test.rs"), content);
934
935        // Should have struct with method from impl merged
936        let foo = result.symbols.iter().find(|s| s.name == "Foo").unwrap();
937        assert!(foo.signature.contains("pub struct Foo"));
938        assert_eq!(foo.children.len(), 1);
939        assert_eq!(foo.children[0].name, "new");
940    }
941
942    #[test]
943    fn test_include_private() {
944        let extractor = Extractor::with_options(ExtractOptions {
945            include_private: true,
946        });
947        let content = r#"
948fn private_fn() {}
949pub fn public_fn() {}
950"#;
951        let result = extractor.extract(&PathBuf::from("test.rs"), content);
952        let names: Vec<_> = result.symbols.iter().map(|s| s.name.as_str()).collect();
953        assert!(names.contains(&"private_fn"));
954        assert!(names.contains(&"public_fn"));
955    }
956
957    #[test]
958    fn test_typescript_interface_impl_detection() {
959        let extractor = Extractor::new();
960        let content = r#"
961interface IFoo {
962  bar(): void;
963  baz(): number;
964}
965
966class Foo implements IFoo {
967  bar() {}
968  baz() { return 1; }
969  other() {}
970}
971"#;
972        let result = extractor.extract(&PathBuf::from("test.ts"), content);
973
974        // Should have interface and class
975        assert_eq!(result.symbols.len(), 2);
976
977        let interface = &result.symbols[0];
978        assert_eq!(interface.name, "IFoo");
979        assert_eq!(interface.children.len(), 2); // bar, baz
980
981        let class = &result.symbols[1];
982        assert_eq!(class.name, "Foo");
983        assert_eq!(class.implements, vec!["IFoo"]);
984        assert_eq!(class.children.len(), 3); // bar, baz, other
985
986        // bar and baz should be marked as interface implementations
987        let bar = class.children.iter().find(|c| c.name == "bar").unwrap();
988        let baz = class.children.iter().find(|c| c.name == "baz").unwrap();
989        let other = class.children.iter().find(|c| c.name == "other").unwrap();
990
991        assert!(
992            bar.is_interface_impl,
993            "bar should be marked as interface impl"
994        );
995        assert!(
996            baz.is_interface_impl,
997            "baz should be marked as interface impl"
998        );
999        assert!(
1000            !other.is_interface_impl,
1001            "other should NOT be marked as interface impl"
1002        );
1003    }
1004
1005    #[test]
1006    fn test_cross_file_interface_impl_with_mock_resolver() {
1007        // Mock resolver that returns methods for IRemote interface
1008        struct MockResolver;
1009        impl InterfaceResolver for MockResolver {
1010            fn resolve_interface_methods(
1011                &self,
1012                name: &str,
1013                _current_file: &str,
1014            ) -> Option<Vec<String>> {
1015                if name == "IRemote" {
1016                    Some(vec![
1017                        "remoteMethod".to_string(),
1018                        "anotherRemote".to_string(),
1019                    ])
1020                } else {
1021                    None
1022                }
1023            }
1024        }
1025
1026        let extractor = Extractor::new();
1027        // Class implements IRemote which is NOT in this file
1028        let content = r#"
1029class Foo implements IRemote {
1030  remoteMethod() {}
1031  anotherRemote() { return 1; }
1032  localMethod() {}
1033}
1034"#;
1035        let resolver = MockResolver;
1036        let result =
1037            extractor.extract_with_resolver(&PathBuf::from("test.ts"), content, Some(&resolver));
1038
1039        assert_eq!(result.symbols.len(), 1);
1040
1041        let class = &result.symbols[0];
1042        assert_eq!(class.name, "Foo");
1043        assert_eq!(class.implements, vec!["IRemote"]);
1044        assert_eq!(class.children.len(), 3);
1045
1046        // remoteMethod and anotherRemote should be marked as interface implementations
1047        let remote_method = class
1048            .children
1049            .iter()
1050            .find(|c| c.name == "remoteMethod")
1051            .unwrap();
1052        let another_remote = class
1053            .children
1054            .iter()
1055            .find(|c| c.name == "anotherRemote")
1056            .unwrap();
1057        let local_method = class
1058            .children
1059            .iter()
1060            .find(|c| c.name == "localMethod")
1061            .unwrap();
1062
1063        assert!(
1064            remote_method.is_interface_impl,
1065            "remoteMethod should be marked as interface impl"
1066        );
1067        assert!(
1068            another_remote.is_interface_impl,
1069            "anotherRemote should be marked as interface impl"
1070        );
1071        assert!(
1072            !local_method.is_interface_impl,
1073            "localMethod should NOT be marked as interface impl"
1074        );
1075    }
1076
1077    #[test]
1078    fn test_cross_file_resolver_not_found() {
1079        // Mock resolver that returns None (interface not found)
1080        struct NotFoundResolver;
1081        impl InterfaceResolver for NotFoundResolver {
1082            fn resolve_interface_methods(
1083                &self,
1084                _name: &str,
1085                _current_file: &str,
1086            ) -> Option<Vec<String>> {
1087                None
1088            }
1089        }
1090
1091        let extractor = Extractor::new();
1092        let content = r#"
1093class Foo implements IUnknown {
1094  someMethod() {}
1095}
1096"#;
1097        let resolver = NotFoundResolver;
1098        let result =
1099            extractor.extract_with_resolver(&PathBuf::from("test.ts"), content, Some(&resolver));
1100
1101        let class = &result.symbols[0];
1102        // When interface is not found, methods should NOT be marked as interface impl
1103        let some_method = class
1104            .children
1105            .iter()
1106            .find(|c| c.name == "someMethod")
1107            .unwrap();
1108        assert!(
1109            !some_method.is_interface_impl,
1110            "someMethod should NOT be marked when interface not found"
1111        );
1112    }
1113
1114    // -- implements extraction tests across languages --
1115
1116    /// Returns `None` if no grammar is available for `file`'s extension — tests
1117    /// then early-return cleanly instead of asserting on an empty extraction.
1118    /// `Some(vec)` may still be empty if the file has no `implements` symbols.
1119    ///
1120    /// Run `cargo xtask build-grammars` and set
1121    /// `NORMALIZE_GRAMMAR_PATH=target/grammars` to enable all language tests.
1122    fn extract_implements(file: &str, code: &str) -> Option<Vec<(String, Vec<String>)>> {
1123        // Probe whether the underlying grammar is loadable before extracting.
1124        // `support_for_path` checks our Language registry; `parse_with_grammar`
1125        // ensures the .so is actually present on disk.
1126        let path = PathBuf::from(file);
1127        let support = normalize_languages::support_for_path(&path)?;
1128        let grammar = support.grammar_name();
1129        parsers::parse_with_grammar(grammar, code)?;
1130        let extractor = Extractor::new();
1131        let result = extractor.extract(&path, code);
1132        fn collect(symbols: &[normalize_languages::Symbol]) -> Vec<(String, Vec<String>)> {
1133            let mut out = Vec::new();
1134            for s in symbols {
1135                if !s.implements.is_empty() {
1136                    out.push((s.name.clone(), s.implements.clone()));
1137                }
1138                out.extend(collect(&s.children));
1139            }
1140            out
1141        }
1142        Some(collect(&result.symbols))
1143    }
1144
1145    #[test]
1146    fn test_implements_python() {
1147        let Some(results) = extract_implements("test.py", "class Foo(Bar, Baz):\n    pass\n")
1148        else {
1149            return;
1150        };
1151        assert_eq!(
1152            results,
1153            vec![("Foo".into(), vec!["Bar".into(), "Baz".into()])]
1154        );
1155    }
1156
1157    #[test]
1158    fn test_implements_rust() {
1159        let Some(results) = extract_implements(
1160            "test.rs",
1161            "pub trait MyTrait {}\npub struct Foo;\nimpl MyTrait for Foo {}\n",
1162        ) else {
1163            return;
1164        };
1165        let impl_sym = results.iter().find(|(n, _)| n == "Foo").unwrap();
1166        assert_eq!(impl_sym.1, vec!["MyTrait"]);
1167    }
1168
1169    #[test]
1170    fn test_implements_java() {
1171        let Some(results) = extract_implements(
1172            "test.java",
1173            "class Foo extends Bar implements Baz, Qux {}\n",
1174        ) else {
1175            return;
1176        };
1177        assert_eq!(
1178            results,
1179            vec![("Foo".into(), vec!["Bar".into(), "Baz".into(), "Qux".into()])]
1180        );
1181    }
1182
1183    #[test]
1184    fn test_implements_javascript() {
1185        let Some(results) = extract_implements("test.js", "class Foo extends Bar {}\n") else {
1186            return;
1187        };
1188        assert_eq!(results, vec![("Foo".into(), vec!["Bar".into()])]);
1189    }
1190
1191    #[test]
1192    fn test_implements_typescript() {
1193        let Some(results) =
1194            extract_implements("test.ts", "class Foo extends Bar implements Baz {}\n")
1195        else {
1196            return;
1197        };
1198        assert_eq!(
1199            results,
1200            vec![("Foo".into(), vec!["Bar".into(), "Baz".into()])]
1201        );
1202    }
1203
1204    #[test]
1205    fn test_implements_cpp() {
1206        let Some(results) = extract_implements(
1207            "test.cpp",
1208            "class Derived : public Base, public Other {};\n",
1209        ) else {
1210            return;
1211        };
1212        assert_eq!(
1213            results,
1214            vec![("Derived".into(), vec!["Base".into(), "Other".into()])]
1215        );
1216    }
1217
1218    #[test]
1219    fn test_implements_scala() {
1220        let Some(results) = extract_implements("test.scala", "class Foo extends Bar with Baz {}\n")
1221        else {
1222            return;
1223        };
1224        assert_eq!(
1225            results,
1226            vec![("Foo".into(), vec!["Bar".into(), "Baz".into()])]
1227        );
1228    }
1229
1230    #[test]
1231    fn test_implements_ruby() {
1232        let Some(results) = extract_implements("test.rb", "class Foo < Bar\nend\n") else {
1233            return;
1234        };
1235        assert_eq!(results, vec![("Foo".into(), vec!["Bar".into()])]);
1236    }
1237
1238    #[test]
1239    fn test_implements_dart() {
1240        let Some(results) =
1241            extract_implements("test.dart", "class Foo extends Bar implements Baz {}\n")
1242        else {
1243            return;
1244        };
1245        assert_eq!(
1246            results,
1247            vec![("Foo".into(), vec!["Bar".into(), "Baz".into()])]
1248        );
1249    }
1250
1251    #[test]
1252    fn test_implements_d() {
1253        let Some(results) = extract_implements("test.d", "class Derived : Base, IFoo {}\n") else {
1254            return;
1255        };
1256        assert_eq!(
1257            results,
1258            vec![("Derived".into(), vec!["Base".into(), "IFoo".into()])]
1259        );
1260    }
1261
1262    #[test]
1263    fn test_implements_csharp() {
1264        let Some(results) = extract_implements("test.cs", "class Foo : Bar, IBaz {}\n") else {
1265            return;
1266        };
1267        assert_eq!(
1268            results,
1269            vec![("Foo".into(), vec!["Bar".into(), "IBaz".into()])]
1270        );
1271    }
1272
1273    #[test]
1274    fn test_implements_kotlin() {
1275        let Some(results) = extract_implements("test.kt", "class Foo : Bar(), IBaz {}\n") else {
1276            return;
1277        };
1278        assert_eq!(
1279            results,
1280            vec![("Foo".into(), vec!["Bar".into(), "IBaz".into()])]
1281        );
1282    }
1283
1284    #[test]
1285    fn test_implements_swift() {
1286        let Some(results) = extract_implements("test.swift", "class Foo: Bar, Proto {}\n") else {
1287            return;
1288        };
1289        assert_eq!(
1290            results,
1291            vec![("Foo".into(), vec!["Bar".into(), "Proto".into()])]
1292        );
1293    }
1294
1295    #[test]
1296    fn test_implements_php() {
1297        let Some(results) = extract_implements(
1298            "test.php",
1299            "<?php\nclass Foo extends Bar implements Baz {}\n",
1300        ) else {
1301            return;
1302        };
1303        assert_eq!(
1304            results,
1305            vec![("Foo".into(), vec!["Bar".into(), "Baz".into()])]
1306        );
1307    }
1308
1309    #[test]
1310    fn test_implements_objc() {
1311        let Some(results) = extract_implements("test.mm", "@interface Foo : Bar <Proto>\n@end\n")
1312        else {
1313            return;
1314        };
1315        assert_eq!(
1316            results,
1317            vec![("Foo".into(), vec!["Bar".into(), "Proto".into()])]
1318        );
1319    }
1320
1321    #[test]
1322    fn test_implements_matlab() {
1323        // MATLAB and ObjC both use .m; use .m and detect which language we get
1324        let Some(results) = extract_implements("test.m", "classdef Foo < Bar & Baz\nend\n") else {
1325            return;
1326        };
1327        // If .m resolves to ObjC, this file won't parse as valid ObjC so we get []
1328        // Skip this test if the extension resolves to the wrong language
1329        if results.is_empty() {
1330            // .m resolved to ObjC instead of MATLAB — skip
1331            return;
1332        }
1333        assert_eq!(
1334            results,
1335            vec![("Foo".into(), vec!["Bar".into(), "Baz".into()])]
1336        );
1337    }
1338
1339    #[test]
1340    fn test_implements_graphql() {
1341        let Some(results) = extract_implements(
1342            "test.graphql",
1343            "type Foo implements Bar & Baz { id: ID! }\n",
1344        ) else {
1345            return;
1346        };
1347        assert_eq!(
1348            results,
1349            vec![("Foo".into(), vec!["Bar".into(), "Baz".into()])]
1350        );
1351    }
1352
1353    #[test]
1354    fn test_implements_haskell() {
1355        let Some(results) =
1356            extract_implements("test.hs", "instance MyClass Foo where\n  doStuff f = y f\n")
1357        else {
1358            return;
1359        };
1360        assert_eq!(results, vec![("MyClass".into(), vec!["MyClass".into()])]);
1361    }
1362
1363    #[test]
1364    fn test_go_extract() {
1365        if parsers::parse_with_grammar("go", "package x").is_none() {
1366            return; // Go grammar not built; run `cargo xtask build-grammars`.
1367        }
1368        let extractor = Extractor::new();
1369        let content = "package main\n\nfunc helper() {}\n\ntype MyStruct struct {\n    Field int\n}\n\nfunc (m *MyStruct) Method() {}\n\ntype MyInterface interface {\n    Required()\n}\n";
1370        let result = extractor.extract(&std::path::PathBuf::from("test.go"), content);
1371        let names: Vec<_> = result.symbols.iter().map(|s| s.name.as_str()).collect();
1372        assert!(names.contains(&"helper"), "Should have function helper");
1373        assert!(names.contains(&"MyStruct"), "Should have struct MyStruct");
1374        assert!(
1375            names.contains(&"MyInterface"),
1376            "Should have interface MyInterface"
1377        );
1378    }
1379}