llmcc_core/meta.rs
1//! Module detection using a Patricia trie to compress file paths into 4 architecture levels.
2//!
3//! # The 4-Level Architecture
4//!
5//! Every source file is mapped to exactly 4 semantic levels:
6//! - **Level 0 (Project)**: The entire repository/workspace
7//! - **Level 1 (Package)**: A distributable unit (npm package, Rust crate) - DEVELOPER DEFINED
8//! - **Level 2 (Module)**: A major subsystem within a package - INFERRED
9//! - **Level 3 (File)**: The individual source file
10//!
11//! # Philosophy
12//!
13//! Packages (Cargo.toml, package.json) are the **real semantic boundaries** - developers
14//! explicitly created them. We respect these as-is.
15//!
16//! For modules, we use a per-file bottom-up approach: walk up from each file toward the
17//! package root, finding the first directory that represents a meaningful grouping.
18//!
19//! # Algorithm: Per-File Bottom-Up Module Detection
20//!
21//! For each file:
22//! 1. Get path components from package root (excluding containers like `src/`)
23//! 2. Walk UP from deepest to shallowest
24//! 3. Find the first ancestor where going "deeper" provides meaningful subdivision
25//!
26//! A directory is a good module boundary if:
27//! - It has siblings (alternatives at the same level)
28//! - Its siblings collectively represent >20% of the package's files
29//!
30//! This naturally handles variable depths - different subtrees can have different module levels.
31
32use std::collections::HashMap;
33use std::path::{Path, PathBuf};
34
35// ============================================================================
36// Public Types
37// ============================================================================
38
39/// The four fixed architecture levels.
40#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
41pub enum ArchDepth {
42 Project = 0,
43 Package = 1,
44 Module = 2,
45 File = 3,
46}
47
48impl ArchDepth {
49 pub fn as_u8(self) -> u8 {
50 self as u8
51 }
52
53 pub fn from_u8(depth: u8) -> Option<Self> {
54 match depth {
55 0 => Some(Self::Project),
56 1 => Some(Self::Package),
57 2 => Some(Self::Module),
58 3 => Some(Self::File),
59 _ => None,
60 }
61 }
62}
63
64/// Complete location info for a source file at all 4 depths.
65#[derive(Debug, Clone, Default)]
66pub struct UnitMeta {
67 pub project_name: Option<String>,
68 pub project_root: Option<PathBuf>,
69 pub package_name: Option<String>,
70 pub package_root: Option<PathBuf>,
71 pub module_name: Option<String>,
72 pub module_root: Option<PathBuf>,
73 pub file_name: Option<String>,
74 pub file_path: Option<PathBuf>,
75 /// Unique index for the crate/package this file belongs to.
76 /// All files in the same crate share the same crate_index.
77 /// Used for efficient same-crate preference during symbol lookup.
78 pub crate_index: usize,
79}
80
81impl UnitMeta {
82 pub fn name_at_depth(&self, depth: ArchDepth) -> Option<&str> {
83 match depth {
84 ArchDepth::Project => self.project_name.as_deref(),
85 ArchDepth::Package => self.package_name.as_deref(),
86 ArchDepth::Module => self.module_name.as_deref(),
87 ArchDepth::File => self.file_name.as_deref(),
88 }
89 }
90
91 pub fn root_at_depth(&self, depth: ArchDepth) -> Option<&Path> {
92 match depth {
93 ArchDepth::Project => self.project_root.as_deref(),
94 ArchDepth::Package => self.package_root.as_deref(),
95 ArchDepth::Module => self.module_root.as_deref(),
96 ArchDepth::File => self.file_path.as_deref(),
97 }
98 }
99
100 pub fn qualified_name(&self, depth: ArchDepth) -> String {
101 let mut parts = Vec::new();
102 for d in 0..=depth.as_u8() {
103 if let Some(arch_depth) = ArchDepth::from_u8(d)
104 && let Some(name) = self.name_at_depth(arch_depth)
105 {
106 parts.push(name);
107 }
108 }
109 parts.join(".")
110 }
111}
112
113// ============================================================================
114// Trie Node
115// ============================================================================
116
117/// A node in the Patricia trie.
118///
119/// Each node represents a semantic directory (containers are skipped).
120/// file_count tracks files directly at this node's level.
121#[derive(Debug, Clone, Default)]
122struct TrieNode {
123 file_count: usize,
124 children: HashMap<String, TrieNode>,
125}
126
127impl TrieNode {
128 fn new() -> Self {
129 Self::default()
130 }
131
132 /// Total files in this subtree (recursive).
133 fn total_files(&self) -> usize {
134 self.file_count
135 + self
136 .children
137 .values()
138 .map(|c| c.total_files())
139 .sum::<usize>()
140 }
141}
142
143// ============================================================================
144// Package Info
145// ============================================================================
146
147#[derive(Debug, Clone)]
148struct PackageInfo {
149 name: String,
150 root: PathBuf,
151 trie: TrieNode,
152 total_files: usize,
153 /// True if this package was detected from an actual manifest file,
154 /// false if it's a synthetic fallback package.
155 has_manifest: bool,
156}
157
158// ============================================================================
159// Module Detector
160// ============================================================================
161
162/// Detects and caches module structure for a project.
163pub struct UnitMetaBuilder {
164 manifest_name: &'static str,
165 container_dirs: &'static [&'static str],
166 project_root: PathBuf,
167 project_name: String,
168 packages: Vec<PackageInfo>,
169}
170
171impl UnitMetaBuilder {
172 /// Create a detector using LanguageTrait configuration.
173 /// This is the preferred way to create a UnitMetaBuilder from a language type.
174 /// Automatically computes the project root from file paths.
175 pub fn from_lang_trait<L: crate::lang_def::LanguageTrait>(files: &[PathBuf]) -> Self {
176 Self::with_lang_config(files, L::manifest_name(), L::container_dirs())
177 }
178
179 /// Create a detector with explicit language configuration.
180 /// Automatically computes the project root from file paths.
181 pub fn with_lang_config(
182 files: &[PathBuf],
183 manifest_name: &'static str,
184 container_dirs: &'static [&'static str],
185 ) -> Self {
186 let project_root = Self::compute_project_root(files);
187 let project_name = project_root
188 .file_name()
189 .and_then(|n| n.to_str())
190 .unwrap_or("project")
191 .to_string();
192
193 let mut detector = Self {
194 manifest_name,
195 container_dirs,
196 project_root,
197 project_name,
198 packages: Vec::new(),
199 };
200
201 detector.detect_packages(files);
202
203 // If no packages were detected, use the project root as a fallback "package"
204 // This ensures module detection works even without manifest files
205 if detector.packages.is_empty() {
206 detector.packages.push(PackageInfo {
207 name: detector.project_name.clone(),
208 root: detector.project_root.clone(),
209 trie: TrieNode::new(),
210 total_files: 0,
211 has_manifest: false, // Synthetic fallback, no actual manifest
212 });
213 }
214
215 detector.build_tries(files);
216
217 detector
218 }
219
220 /// Compute the project root as the common ancestor directory of all file paths.
221 /// Optimized O(n) implementation with early termination.
222 fn compute_project_root(paths: &[PathBuf]) -> PathBuf {
223 if paths.is_empty() {
224 return PathBuf::new();
225 }
226
227 // Start with first file's parent
228 let first = match paths[0].parent() {
229 Some(p) => p,
230 None => return PathBuf::new(),
231 };
232 let mut common: Vec<_> = first.components().collect();
233
234 // Shrink common prefix with each file - early exit if empty
235 for path in &paths[1..] {
236 if common.is_empty() {
237 break;
238 }
239 let parent = path.parent().unwrap_or(path);
240 let mut match_len = 0;
241 for (a, b) in common.iter().zip(parent.components()) {
242 if *a == b {
243 match_len += 1;
244 } else {
245 break;
246 }
247 }
248 common.truncate(match_len);
249 }
250
251 common.iter().collect()
252 }
253
254 fn is_container(&self, name: &str) -> bool {
255 self.container_dirs.contains(&name)
256 }
257
258 /// Get module info for a file path.
259 pub fn get_module_info(&self, file: &Path) -> UnitMeta {
260 self.compute_module_info(file)
261 }
262
263 // ========================================================================
264 // Step 1: Detect Packages
265 // ========================================================================
266
267 fn detect_packages(&mut self, files: &[PathBuf]) {
268 let mut seen = std::collections::HashSet::new();
269
270 for file in files {
271 let mut dir = file.parent();
272 while let Some(current) = dir {
273 let manifest = current.join(self.manifest_name);
274 if manifest.exists() && !seen.contains(current) {
275 seen.insert(current.to_path_buf());
276
277 if let Some(name) = self.parse_manifest_name(current) {
278 self.packages.push(PackageInfo {
279 name,
280 root: current.to_path_buf(),
281 trie: TrieNode::new(),
282 total_files: 0,
283 has_manifest: true, // Real package from manifest file
284 });
285 }
286 break;
287 }
288 dir = current.parent();
289 }
290 }
291
292 // Sort by depth (deepest first) for nested package detection
293 self.packages.sort_by(|a, b| {
294 b.root
295 .components()
296 .count()
297 .cmp(&a.root.components().count())
298 });
299 }
300
301 fn parse_manifest_name(&self, dir: &Path) -> Option<String> {
302 let content = std::fs::read_to_string(dir.join(self.manifest_name)).ok()?;
303
304 // Try JSON format first (package.json)
305 if self.manifest_name == "package.json" {
306 // Parse "name": "value" from JSON
307 let name_pos = content.find("\"name\"")?;
308 let after_name = &content[name_pos + 6..];
309 let colon_pos = after_name.find(':')?;
310 let after_colon = &after_name[colon_pos + 1..];
311 let start_quote = after_colon.find('"')?;
312 let value_start = &after_colon[start_quote + 1..];
313 let end_quote = value_start.find('"')?;
314 let value = &value_start[..end_quote];
315 Some(value.replace('@', "").replace('/', "_"))
316 } else if self.manifest_name == "Cargo.toml" {
317 // Parse TOML format (Cargo.toml)
318 let mut in_package = false;
319 for line in content.lines() {
320 let line = line.trim();
321 if line == "[package]" {
322 in_package = true;
323 } else if line.starts_with('[') {
324 in_package = false;
325 } else if in_package && line.starts_with("name") {
326 return line
327 .find('=')
328 .map(|pos| line[pos + 1..].trim().trim_matches('"').to_string());
329 }
330 }
331 None
332 } else {
333 // Unknown manifest format - use directory name
334 dir.file_name()
335 .and_then(|n| n.to_str())
336 .map(|s| s.to_string())
337 }
338 }
339
340 // ========================================================================
341 // Step 2: Build Tries
342 // ========================================================================
343
344 fn build_tries(&mut self, files: &[PathBuf]) {
345 let all_roots: Vec<PathBuf> = self.packages.iter().map(|p| p.root.clone()).collect();
346
347 for pkg in &mut self.packages {
348 for file in files {
349 // Skip files not in this package
350 if !file.starts_with(&pkg.root) {
351 continue;
352 }
353
354 // Skip files belonging to nested packages
355 let in_nested = all_roots.iter().any(|other| {
356 *other != pkg.root && other.starts_with(&pkg.root) && file.starts_with(other)
357 });
358 if in_nested {
359 continue;
360 }
361
362 Self::insert_file(&mut pkg.trie, file, &pkg.root, self.container_dirs);
363 pkg.total_files += 1;
364 }
365
366 tracing::debug!("Package '{}': {} files in trie", pkg.name, pkg.total_files);
367 }
368 }
369
370 fn insert_file(trie: &mut TrieNode, file: &Path, pkg_root: &Path, container_dirs: &[&str]) {
371 let rel_path = match file.strip_prefix(pkg_root) {
372 Ok(p) => p,
373 Err(_) => return,
374 };
375
376 // Get directory components, skipping containers
377 let mut current = trie;
378 for comp in rel_path
379 .parent()
380 .into_iter()
381 .flat_map(|p| p.components())
382 .filter_map(|c| c.as_os_str().to_str())
383 {
384 if container_dirs.contains(&comp) {
385 continue; // Skip container directories
386 }
387 current = current.children.entry(comp.to_string()).or_default();
388 }
389
390 current.file_count += 1;
391 }
392
393 // ========================================================================
394 // Per-File Module Detection
395 // ========================================================================
396
397 /// Find the module for a file by walking up from the file to the package root.
398 ///
399 /// Strategy: Find the first ancestor directory that represents a meaningful grouping.
400 /// A directory is "meaningful" if:
401 /// 1. It has siblings (alternatives at the same level), AND
402 /// 2. Those siblings collectively have significant file counts
403 ///
404 /// This naturally handles variable depths for different subtrees.
405 fn find_module_for_file<'a>(
406 &self,
407 components: &[&'a str],
408 pkg: &PackageInfo,
409 ) -> Option<(usize, &'a str)> {
410 if components.is_empty() {
411 return None;
412 }
413
414 // Walk the trie along the file's path, collecting nodes and checking siblings
415 let mut current = &pkg.trie;
416 let mut path_nodes: Vec<(&str, &TrieNode, usize)> = Vec::new(); // (name, node, sibling_files)
417
418 for comp in components.iter() {
419 if let Some(child) = current.children.get(*comp) {
420 // Calculate sibling file count (files in siblings, not in this subtree)
421 let sibling_files: usize = current
422 .children
423 .iter()
424 .filter(|(name, _)| *name != *comp)
425 .map(|(_, node)| node.total_files())
426 .sum();
427
428 path_nodes.push((*comp, child, sibling_files));
429 current = child;
430 } else {
431 // Component not in trie (shouldn't happen normally)
432 break;
433 }
434 }
435
436 // Walk from ROOT to LEAF looking for the best module boundary
437 // A good boundary has:
438 // 1. Significant siblings (not alone)
439 // 2. Balanced distribution (no sibling dominates >80%)
440 //
441 // If we find a significant but imbalanced level, keep looking deeper
442 let significance_threshold = (pkg.total_files as f64 * 0.05).max(1.0) as usize;
443 const DOMINANCE_THRESHOLD: f64 = 0.80;
444
445 for (i, (name, node, sibling_files)) in path_nodes.iter().enumerate() {
446 if *sibling_files >= significance_threshold {
447 // Significant siblings - check balance
448 let my_files = node.total_files();
449 let total = my_files + sibling_files;
450 let dominance = my_files as f64 / total as f64;
451
452 if dominance <= DOMINANCE_THRESHOLD {
453 // Balanced - use this level
454 return Some((i, *name));
455 }
456 // Imbalanced - we dominate, keep looking for a better split deeper
457 }
458 }
459
460 // No balanced split found - use the first component
461 path_nodes.first().map(|(name, _, _)| (0, *name))
462 }
463
464 // ========================================================================
465 // Module Info Lookup
466 // ========================================================================
467
468 fn compute_module_info(&self, file: &Path) -> UnitMeta {
469 let mut info = UnitMeta {
470 project_name: Some(self.project_name.clone()),
471 project_root: Some(self.project_root.clone()),
472 file_path: Some(file.to_path_buf()),
473 file_name: file.file_stem().and_then(|s| s.to_str()).map(String::from),
474 ..Default::default()
475 };
476
477 // Find package
478 let pkg = self
479 .packages
480 .iter()
481 .filter(|p| file.starts_with(&p.root))
482 .max_by_key(|p| p.root.components().count());
483
484 let Some(pkg) = pkg else {
485 return info;
486 };
487
488 // Only set package_name if this is a real package with a manifest file
489 // Synthetic fallback packages should not create a package cluster
490 if pkg.has_manifest {
491 info.package_name = Some(pkg.name.clone());
492 info.package_root = Some(pkg.root.clone());
493 }
494
495 // Get path components (excluding containers)
496 let rel_path = match file.strip_prefix(&pkg.root) {
497 Ok(p) => p,
498 Err(_) => return info,
499 };
500
501 let components: Vec<&str> = rel_path
502 .parent()
503 .into_iter()
504 .flat_map(|p| p.components())
505 .filter_map(|c| c.as_os_str().to_str())
506 .filter(|c| !self.is_container(c))
507 .collect();
508
509 // Find module using per-file bottom-up detection
510 if let Some((depth, module_name)) = self.find_module_for_file(&components, pkg) {
511 info.module_name = Some(module_name.to_string());
512
513 // Reconstruct module root path
514 let mut root = pkg.root.clone();
515 let mut non_container_count = 0;
516 for comp in rel_path
517 .parent()
518 .into_iter()
519 .flat_map(|p| p.components())
520 .filter_map(|c| c.as_os_str().to_str())
521 {
522 root = root.join(comp);
523 if !self.is_container(comp) {
524 if non_container_count == depth {
525 info.module_root = Some(root);
526 break;
527 }
528 non_container_count += 1;
529 }
530 }
531 }
532 // If no module found (file at package root), module_name stays None
533 // and the graph generator will use the file name
534
535 info
536 }
537}