llmcc_core/
meta.rs

1//! Module detection using a Patricia trie to compress file paths into 4 architecture levels.
2//!
3//! # The 4-Level Architecture
4//!
5//! Every source file is mapped to exactly 4 semantic levels:
6//! - **Level 0 (Project)**: The entire repository/workspace
7//! - **Level 1 (Package)**: A distributable unit (npm package, Rust crate) - DEVELOPER DEFINED
8//! - **Level 2 (Module)**: A major subsystem within a package - INFERRED
9//! - **Level 3 (File)**: The individual source file
10//!
11//! # Philosophy
12//!
13//! Packages (Cargo.toml, package.json) are the **real semantic boundaries** - developers
14//! explicitly created them. We respect these as-is.
15//!
16//! For modules, we use a per-file bottom-up approach: walk up from each file toward the
17//! package root, finding the first directory that represents a meaningful grouping.
18//!
19//! # Algorithm: Per-File Bottom-Up Module Detection
20//!
21//! For each file:
22//! 1. Get path components from package root (excluding containers like `src/`)
23//! 2. Walk UP from deepest to shallowest
24//! 3. Find the first ancestor where going "deeper" provides meaningful subdivision
25//!
26//! A directory is a good module boundary if:
27//! - It has siblings (alternatives at the same level)
28//! - Its siblings collectively represent >20% of the package's files
29//!
30//! This naturally handles variable depths - different subtrees can have different module levels.
31
32use std::collections::HashMap;
33use std::path::{Path, PathBuf};
34
35// ============================================================================
36// Public Types
37// ============================================================================
38
39/// The four fixed architecture levels.
40#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
41pub enum ArchDepth {
42    Project = 0,
43    Package = 1,
44    Module = 2,
45    File = 3,
46}
47
48impl ArchDepth {
49    pub fn as_u8(self) -> u8 {
50        self as u8
51    }
52
53    pub fn from_u8(depth: u8) -> Option<Self> {
54        match depth {
55            0 => Some(Self::Project),
56            1 => Some(Self::Package),
57            2 => Some(Self::Module),
58            3 => Some(Self::File),
59            _ => None,
60        }
61    }
62}
63
64/// Complete location info for a source file at all 4 depths.
65#[derive(Debug, Clone, Default)]
66pub struct UnitMeta {
67    pub project_name: Option<String>,
68    pub project_root: Option<PathBuf>,
69    pub package_name: Option<String>,
70    pub package_root: Option<PathBuf>,
71    pub module_name: Option<String>,
72    pub module_root: Option<PathBuf>,
73    pub file_name: Option<String>,
74    pub file_path: Option<PathBuf>,
75    /// Unique index for the crate/package this file belongs to.
76    /// All files in the same crate share the same crate_index.
77    /// Used for efficient same-crate preference during symbol lookup.
78    pub crate_index: usize,
79}
80
81impl UnitMeta {
82    pub fn name_at_depth(&self, depth: ArchDepth) -> Option<&str> {
83        match depth {
84            ArchDepth::Project => self.project_name.as_deref(),
85            ArchDepth::Package => self.package_name.as_deref(),
86            ArchDepth::Module => self.module_name.as_deref(),
87            ArchDepth::File => self.file_name.as_deref(),
88        }
89    }
90
91    pub fn root_at_depth(&self, depth: ArchDepth) -> Option<&Path> {
92        match depth {
93            ArchDepth::Project => self.project_root.as_deref(),
94            ArchDepth::Package => self.package_root.as_deref(),
95            ArchDepth::Module => self.module_root.as_deref(),
96            ArchDepth::File => self.file_path.as_deref(),
97        }
98    }
99
100    pub fn qualified_name(&self, depth: ArchDepth) -> String {
101        let mut parts = Vec::new();
102        for d in 0..=depth.as_u8() {
103            if let Some(arch_depth) = ArchDepth::from_u8(d)
104                && let Some(name) = self.name_at_depth(arch_depth)
105            {
106                parts.push(name);
107            }
108        }
109        parts.join(".")
110    }
111}
112
113// ============================================================================
114// Trie Node
115// ============================================================================
116
117/// A node in the Patricia trie.
118///
119/// Each node represents a semantic directory (containers are skipped).
120/// file_count tracks files directly at this node's level.
121#[derive(Debug, Clone, Default)]
122struct TrieNode {
123    file_count: usize,
124    children: HashMap<String, TrieNode>,
125}
126
127impl TrieNode {
128    fn new() -> Self {
129        Self::default()
130    }
131
132    /// Total files in this subtree (recursive).
133    fn total_files(&self) -> usize {
134        self.file_count
135            + self
136                .children
137                .values()
138                .map(|c| c.total_files())
139                .sum::<usize>()
140    }
141}
142
143// ============================================================================
144// Package Info
145// ============================================================================
146
147#[derive(Debug, Clone)]
148struct PackageInfo {
149    name: String,
150    root: PathBuf,
151    trie: TrieNode,
152    total_files: usize,
153    /// True if this package was detected from an actual manifest file,
154    /// false if it's a synthetic fallback package.
155    has_manifest: bool,
156}
157
158// ============================================================================
159// Module Detector
160// ============================================================================
161
162/// Detects and caches module structure for a project.
163pub struct UnitMetaBuilder {
164    manifest_name: &'static str,
165    container_dirs: &'static [&'static str],
166    project_root: PathBuf,
167    project_name: String,
168    packages: Vec<PackageInfo>,
169}
170
171impl UnitMetaBuilder {
172    /// Create a detector using LanguageTrait configuration.
173    /// This is the preferred way to create a UnitMetaBuilder from a language type.
174    /// Automatically computes the project root from file paths.
175    pub fn from_lang_trait<L: crate::lang_def::LanguageTrait>(files: &[PathBuf]) -> Self {
176        Self::with_lang_config(files, L::manifest_name(), L::container_dirs())
177    }
178
179    /// Create a detector with explicit language configuration.
180    /// Automatically computes the project root from file paths.
181    pub fn with_lang_config(
182        files: &[PathBuf],
183        manifest_name: &'static str,
184        container_dirs: &'static [&'static str],
185    ) -> Self {
186        let project_root = Self::compute_project_root(files);
187        let project_name = project_root
188            .file_name()
189            .and_then(|n| n.to_str())
190            .unwrap_or("project")
191            .to_string();
192
193        let mut detector = Self {
194            manifest_name,
195            container_dirs,
196            project_root,
197            project_name,
198            packages: Vec::new(),
199        };
200
201        detector.detect_packages(files);
202
203        // If no packages were detected, use the project root as a fallback "package"
204        // This ensures module detection works even without manifest files
205        if detector.packages.is_empty() {
206            detector.packages.push(PackageInfo {
207                name: detector.project_name.clone(),
208                root: detector.project_root.clone(),
209                trie: TrieNode::new(),
210                total_files: 0,
211                has_manifest: false, // Synthetic fallback, no actual manifest
212            });
213        }
214
215        detector.build_tries(files);
216
217        detector
218    }
219
220    /// Compute the project root as the common ancestor directory of all file paths.
221    /// Optimized O(n) implementation with early termination.
222    fn compute_project_root(paths: &[PathBuf]) -> PathBuf {
223        if paths.is_empty() {
224            return PathBuf::new();
225        }
226
227        // Start with first file's parent
228        let first = match paths[0].parent() {
229            Some(p) => p,
230            None => return PathBuf::new(),
231        };
232        let mut common: Vec<_> = first.components().collect();
233
234        // Shrink common prefix with each file - early exit if empty
235        for path in &paths[1..] {
236            if common.is_empty() {
237                break;
238            }
239            let parent = path.parent().unwrap_or(path);
240            let mut match_len = 0;
241            for (a, b) in common.iter().zip(parent.components()) {
242                if *a == b {
243                    match_len += 1;
244                } else {
245                    break;
246                }
247            }
248            common.truncate(match_len);
249        }
250
251        common.iter().collect()
252    }
253
254    fn is_container(&self, name: &str) -> bool {
255        self.container_dirs.contains(&name)
256    }
257
258    /// Get module info for a file path.
259    pub fn get_module_info(&self, file: &Path) -> UnitMeta {
260        self.compute_module_info(file)
261    }
262
263    // ========================================================================
264    // Step 1: Detect Packages
265    // ========================================================================
266
267    fn detect_packages(&mut self, files: &[PathBuf]) {
268        let mut seen = std::collections::HashSet::new();
269
270        for file in files {
271            let mut dir = file.parent();
272            while let Some(current) = dir {
273                let manifest = current.join(self.manifest_name);
274                if manifest.exists() && !seen.contains(current) {
275                    seen.insert(current.to_path_buf());
276
277                    if let Some(name) = self.parse_manifest_name(current) {
278                        self.packages.push(PackageInfo {
279                            name,
280                            root: current.to_path_buf(),
281                            trie: TrieNode::new(),
282                            total_files: 0,
283                            has_manifest: true, // Real package from manifest file
284                        });
285                    }
286                    break;
287                }
288                dir = current.parent();
289            }
290        }
291
292        // Sort by depth (deepest first) for nested package detection
293        self.packages.sort_by(|a, b| {
294            b.root
295                .components()
296                .count()
297                .cmp(&a.root.components().count())
298        });
299    }
300
301    fn parse_manifest_name(&self, dir: &Path) -> Option<String> {
302        let content = std::fs::read_to_string(dir.join(self.manifest_name)).ok()?;
303
304        // Try JSON format first (package.json)
305        if self.manifest_name == "package.json" {
306            // Parse "name": "value" from JSON
307            let name_pos = content.find("\"name\"")?;
308            let after_name = &content[name_pos + 6..];
309            let colon_pos = after_name.find(':')?;
310            let after_colon = &after_name[colon_pos + 1..];
311            let start_quote = after_colon.find('"')?;
312            let value_start = &after_colon[start_quote + 1..];
313            let end_quote = value_start.find('"')?;
314            let value = &value_start[..end_quote];
315            Some(value.replace('@', "").replace('/', "_"))
316        } else if self.manifest_name == "Cargo.toml" {
317            // Parse TOML format (Cargo.toml)
318            let mut in_package = false;
319            for line in content.lines() {
320                let line = line.trim();
321                if line == "[package]" {
322                    in_package = true;
323                } else if line.starts_with('[') {
324                    in_package = false;
325                } else if in_package && line.starts_with("name") {
326                    return line
327                        .find('=')
328                        .map(|pos| line[pos + 1..].trim().trim_matches('"').to_string());
329                }
330            }
331            None
332        } else {
333            // Unknown manifest format - use directory name
334            dir.file_name()
335                .and_then(|n| n.to_str())
336                .map(|s| s.to_string())
337        }
338    }
339
340    // ========================================================================
341    // Step 2: Build Tries
342    // ========================================================================
343
344    fn build_tries(&mut self, files: &[PathBuf]) {
345        let all_roots: Vec<PathBuf> = self.packages.iter().map(|p| p.root.clone()).collect();
346
347        for pkg in &mut self.packages {
348            for file in files {
349                // Skip files not in this package
350                if !file.starts_with(&pkg.root) {
351                    continue;
352                }
353
354                // Skip files belonging to nested packages
355                let in_nested = all_roots.iter().any(|other| {
356                    *other != pkg.root && other.starts_with(&pkg.root) && file.starts_with(other)
357                });
358                if in_nested {
359                    continue;
360                }
361
362                Self::insert_file(&mut pkg.trie, file, &pkg.root, self.container_dirs);
363                pkg.total_files += 1;
364            }
365
366            tracing::debug!("Package '{}': {} files in trie", pkg.name, pkg.total_files);
367        }
368    }
369
370    fn insert_file(trie: &mut TrieNode, file: &Path, pkg_root: &Path, container_dirs: &[&str]) {
371        let rel_path = match file.strip_prefix(pkg_root) {
372            Ok(p) => p,
373            Err(_) => return,
374        };
375
376        // Get directory components, skipping containers
377        let mut current = trie;
378        for comp in rel_path
379            .parent()
380            .into_iter()
381            .flat_map(|p| p.components())
382            .filter_map(|c| c.as_os_str().to_str())
383        {
384            if container_dirs.contains(&comp) {
385                continue; // Skip container directories
386            }
387            current = current.children.entry(comp.to_string()).or_default();
388        }
389
390        current.file_count += 1;
391    }
392
393    // ========================================================================
394    // Per-File Module Detection
395    // ========================================================================
396
397    /// Find the module for a file by walking up from the file to the package root.
398    ///
399    /// Strategy: Find the first ancestor directory that represents a meaningful grouping.
400    /// A directory is "meaningful" if:
401    /// 1. It has siblings (alternatives at the same level), AND
402    /// 2. Those siblings collectively have significant file counts
403    ///
404    /// This naturally handles variable depths for different subtrees.
405    fn find_module_for_file<'a>(
406        &self,
407        components: &[&'a str],
408        pkg: &PackageInfo,
409    ) -> Option<(usize, &'a str)> {
410        if components.is_empty() {
411            return None;
412        }
413
414        // Walk the trie along the file's path, collecting nodes and checking siblings
415        let mut current = &pkg.trie;
416        let mut path_nodes: Vec<(&str, &TrieNode, usize)> = Vec::new(); // (name, node, sibling_files)
417
418        for comp in components.iter() {
419            if let Some(child) = current.children.get(*comp) {
420                // Calculate sibling file count (files in siblings, not in this subtree)
421                let sibling_files: usize = current
422                    .children
423                    .iter()
424                    .filter(|(name, _)| *name != *comp)
425                    .map(|(_, node)| node.total_files())
426                    .sum();
427
428                path_nodes.push((*comp, child, sibling_files));
429                current = child;
430            } else {
431                // Component not in trie (shouldn't happen normally)
432                break;
433            }
434        }
435
436        // Walk from ROOT to LEAF looking for the best module boundary
437        // A good boundary has:
438        // 1. Significant siblings (not alone)
439        // 2. Balanced distribution (no sibling dominates >80%)
440        //
441        // If we find a significant but imbalanced level, keep looking deeper
442        let significance_threshold = (pkg.total_files as f64 * 0.05).max(1.0) as usize;
443        const DOMINANCE_THRESHOLD: f64 = 0.80;
444
445        for (i, (name, node, sibling_files)) in path_nodes.iter().enumerate() {
446            if *sibling_files >= significance_threshold {
447                // Significant siblings - check balance
448                let my_files = node.total_files();
449                let total = my_files + sibling_files;
450                let dominance = my_files as f64 / total as f64;
451
452                if dominance <= DOMINANCE_THRESHOLD {
453                    // Balanced - use this level
454                    return Some((i, *name));
455                }
456                // Imbalanced - we dominate, keep looking for a better split deeper
457            }
458        }
459
460        // No balanced split found - use the first component
461        path_nodes.first().map(|(name, _, _)| (0, *name))
462    }
463
464    // ========================================================================
465    // Module Info Lookup
466    // ========================================================================
467
468    fn compute_module_info(&self, file: &Path) -> UnitMeta {
469        let mut info = UnitMeta {
470            project_name: Some(self.project_name.clone()),
471            project_root: Some(self.project_root.clone()),
472            file_path: Some(file.to_path_buf()),
473            file_name: file.file_stem().and_then(|s| s.to_str()).map(String::from),
474            ..Default::default()
475        };
476
477        // Find package
478        let pkg = self
479            .packages
480            .iter()
481            .filter(|p| file.starts_with(&p.root))
482            .max_by_key(|p| p.root.components().count());
483
484        let Some(pkg) = pkg else {
485            return info;
486        };
487
488        // Only set package_name if this is a real package with a manifest file
489        // Synthetic fallback packages should not create a package cluster
490        if pkg.has_manifest {
491            info.package_name = Some(pkg.name.clone());
492            info.package_root = Some(pkg.root.clone());
493        }
494
495        // Get path components (excluding containers)
496        let rel_path = match file.strip_prefix(&pkg.root) {
497            Ok(p) => p,
498            Err(_) => return info,
499        };
500
501        let components: Vec<&str> = rel_path
502            .parent()
503            .into_iter()
504            .flat_map(|p| p.components())
505            .filter_map(|c| c.as_os_str().to_str())
506            .filter(|c| !self.is_container(c))
507            .collect();
508
509        // Find module using per-file bottom-up detection
510        if let Some((depth, module_name)) = self.find_module_for_file(&components, pkg) {
511            info.module_name = Some(module_name.to_string());
512
513            // Reconstruct module root path
514            let mut root = pkg.root.clone();
515            let mut non_container_count = 0;
516            for comp in rel_path
517                .parent()
518                .into_iter()
519                .flat_map(|p| p.components())
520                .filter_map(|c| c.as_os_str().to_str())
521            {
522                root = root.join(comp);
523                if !self.is_container(comp) {
524                    if non_container_count == depth {
525                        info.module_root = Some(root);
526                        break;
527                    }
528                    non_container_count += 1;
529                }
530            }
531        }
532        // If no module found (file at package root), module_name stays None
533        // and the graph generator will use the file name
534
535        info
536    }
537}