// cadi_core/project_analyzer.rs

1//! Project Analyzer - Whole-project analysis for intelligent chunking
2//!
3//! This module analyzes entire projects to:
4//! - Identify the project structure and layout
5//! - Find shared utilities and common patterns
6//! - Detect composition relationships between files
7//! - Determine optimal chunking for maximum reuse
8
9use crate::atomic::{
10    AliasRegistry, AtomicChunk, ChunkCategory, ChunkComposition,
11    ChunkGranularity, ChunkReference,
12};
13use crate::smart_chunker::{ChunkingStrategy, FileAnalysis, SmartChunker, SmartChunkerConfig};
14use serde::{Deserialize, Serialize};
15use sha2::{Digest, Sha256};
16use std::collections::{HashMap, HashSet};
17use std::path::{Path, PathBuf};
18
/// Project type detection
///
/// Serialized in snake_case (e.g. `TypeScript` -> `"type_script"`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ProjectType {
    Rust,
    TypeScript,
    JavaScript,
    Python,
    Go,
    C,
    Cpp,
    Java,
    /// No project marker found and more than two languages detected.
    Mixed,
    /// Project type could not be determined.
    Unknown,
}
34
35impl ProjectType {
36    pub fn primary_language(&self) -> &str {
37        match self {
38            ProjectType::Rust => "rust",
39            ProjectType::TypeScript => "typescript",
40            ProjectType::JavaScript => "javascript",
41            ProjectType::Python => "python",
42            ProjectType::Go => "go",
43            ProjectType::C => "c",
44            ProjectType::Cpp => "cpp",
45            ProjectType::Java => "java",
46            ProjectType::Mixed => "mixed",
47            ProjectType::Unknown => "unknown",
48        }
49    }
50}
51
/// Configuration for project analysis
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProjectAnalyzerConfig {
    /// Directories to ignore (default: .git, node_modules, target, ...)
    #[serde(default = "default_ignore_dirs")]
    pub ignore_dirs: Vec<String>,

    /// File patterns to ignore; a leading '*' matches by suffix,
    /// otherwise the file name must match exactly
    #[serde(default = "default_ignore_patterns")]
    pub ignore_patterns: Vec<String>,

    /// Maximum file size to process (bytes, default 10 MiB)
    #[serde(default = "default_max_file_size")]
    pub max_file_size: usize,

    /// Smart chunker config
    #[serde(default)]
    pub chunker_config: SmartChunkerConfig,

    /// Whether to create composition chunks (default: true)
    #[serde(default = "default_true")]
    pub detect_compositions: bool,

    /// Whether to merge small related files (default: true)
    /// NOTE(review): not referenced anywhere in this module — confirm it is
    /// consumed elsewhere or remove it.
    #[serde(default = "default_true")]
    pub merge_small_files: bool,

    /// Minimum files for a composition chunk (default: 2)
    #[serde(default = "default_min_composition_files")]
    pub min_composition_files: usize,

    /// Namespace applied to every chunk alias during import, if set
    #[serde(skip_serializing_if = "Option::is_none")]
    pub namespace: Option<String>,
}
87
/// Serde default: directory names skipped during file collection.
fn default_ignore_dirs() -> Vec<String> {
    [
        ".git",
        "node_modules",
        "target",
        "__pycache__",
        ".venv",
        "venv",
        "dist",
        "build",
        ".next",
        ".cache",
        "coverage",
    ]
    .iter()
    .map(|s| s.to_string())
    .collect()
}
103
/// Serde default: file-name patterns skipped during file collection.
/// A leading '*' means "match by suffix".
fn default_ignore_patterns() -> Vec<String> {
    ["*.lock", "*.log", ".DS_Store", "*.min.js", "*.min.css", "*.map"]
        .iter()
        .map(|s| s.to_string())
        .collect()
}
114
/// Serde default: maximum processable file size (10 MiB).
fn default_max_file_size() -> usize {
    const MIB: usize = 1024 * 1024;
    10 * MIB
}
118
/// Serde default helper: boolean fields that default to `true`.
fn default_true() -> bool {
    true
}
122
/// Serde default: minimum number of files for a composition chunk.
fn default_min_composition_files() -> usize {
    2
}
126
127impl Default for ProjectAnalyzerConfig {
128    fn default() -> Self {
129        Self {
130            ignore_dirs: default_ignore_dirs(),
131            ignore_patterns: default_ignore_patterns(),
132            max_file_size: default_max_file_size(),
133            chunker_config: SmartChunkerConfig::default(),
134            detect_compositions: true,
135            merge_small_files: true,
136            min_composition_files: 2,
137            namespace: None,
138        }
139    }
140}
141
/// Result of project analysis
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProjectAnalysis {
    /// Root path of project
    pub root: PathBuf,

    /// Project name (the root directory's file name, or "project")
    pub name: String,

    /// Detected project type
    pub project_type: ProjectType,

    /// Primary language
    pub primary_language: String,

    /// All languages found (unordered)
    pub languages: Vec<String>,

    /// Total files analyzed
    pub total_files: usize,

    /// Total lines of code across the readable files
    pub total_lines: usize,

    /// File analyses
    pub files: Vec<FileAnalysis>,

    /// Detected entry points
    pub entrypoints: Vec<PathBuf>,

    /// Detected modules/packages (grouped by parent directory)
    pub modules: Vec<ModuleInfo>,

    /// Shared utilities (files whose stem matches an import used by 3+ files)
    pub shared_utilities: Vec<PathBuf>,

    /// Suggested compositions
    pub compositions: Vec<CompositionSuggestion>,
}
181
/// Information about a module/package
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModuleInfo {
    /// Module name (the directory's file name, or "root")
    pub name: String,
    /// Module path, relative to the project root when possible
    pub path: PathBuf,
    /// Source files belonging to this module
    pub files: Vec<PathBuf>,
    /// True if any file in the module is an entry point
    pub is_entrypoint: bool,
    /// Union of the exports of all files in the module
    pub exports: Vec<String>,
    /// Category assigned to the module (always `Logic` in this module)
    pub category: ChunkCategory,
}
192
/// Suggestion for composing multiple chunks
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompositionSuggestion {
    /// Suggested composition name (also used to derive its alias)
    pub name: String,
    /// Human-readable description
    pub description: String,
    /// Files whose chunks make up the composition
    pub files: Vec<PathBuf>,
    /// Category assigned to the resulting composition chunk
    pub category: ChunkCategory,
    /// Why this composition was suggested
    pub reason: String,
}
202
/// Import result containing all chunks
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportResult {
    /// Project analysis
    pub analysis: ProjectAnalysis,

    /// All atomic chunks created
    pub chunks: Vec<AtomicChunk>,

    /// Alias registry mapping unique aliases to chunk ids
    pub alias_registry: AliasRegistry,

    /// Composition chunks (chunks made of other chunks)
    pub compositions: Vec<AtomicChunk>,

    /// Import summary
    pub summary: ImportSummary,
}
221
/// Summary of import operation
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportSummary {
    /// Project name (root directory name)
    pub project_name: String,
    /// Debug-formatted `ProjectType` variant name
    pub project_type: String,
    /// Number of source files collected
    pub total_files: usize,
    /// Total lines across analyzed files
    pub total_lines: usize,
    /// Number of atomic chunks generated
    pub atomic_chunks: usize,
    /// Number of composition chunks generated
    pub composition_chunks: usize,
    /// Files skipped by the chunking decision
    pub skipped_files: usize,
    /// Chunk count per category (Debug-formatted category name)
    pub categories: HashMap<String, usize>,
    /// Total aliases registered
    pub aliases_created: usize,
    /// Wall-clock duration of the import in milliseconds
    pub duration_ms: u128,
}
236
/// The Project Analyzer
///
/// Walks a project tree, analyzes each file with the [`SmartChunker`], and
/// turns the result into atomic and composition chunks.
pub struct ProjectAnalyzer {
    /// Analysis configuration.
    config: ProjectAnalyzerConfig,
    /// File-level analyzer/chunker, built from `config.chunker_config`.
    chunker: SmartChunker,
}
242
243impl ProjectAnalyzer {
244    /// Create a new analyzer with configuration
245    pub fn new(config: ProjectAnalyzerConfig) -> Self {
246        let chunker = SmartChunker::new(config.chunker_config.clone());
247        Self { config, chunker }
248    }
249
    /// Create with default configuration
    ///
    /// NOTE(review): an inherent method named `default` shadows a potential
    /// `Default` trait impl for callers; consider implementing
    /// `Default for ProjectAnalyzer` instead (clippy `should_implement_trait`).
    pub fn default() -> Self {
        Self::new(ProjectAnalyzerConfig::default())
    }
254
255    /// Analyze an entire project
256    pub fn analyze_project(&self, root: &Path) -> std::io::Result<ProjectAnalysis> {
257        let name = root
258            .file_name()
259            .and_then(|n| n.to_str())
260            .unwrap_or("project")
261            .to_string();
262
263        // Collect all files
264        let files = self.collect_files(root)?;
265        let total_files = files.len();
266
267        // Analyze each file
268        let mut file_analyses = Vec::new();
269        let mut total_lines = 0;
270        let mut language_counts: HashMap<String, usize> = HashMap::new();
271        let mut entrypoints = Vec::new();
272
273        for file_path in &files {
274            if let Ok(content) = std::fs::read_to_string(file_path) {
275                let analysis = self.chunker.analyze_file(file_path, &content);
276                total_lines += analysis.total_lines;
277
278                *language_counts.entry(analysis.language.clone()).or_insert(0) += 1;
279
280                if analysis.is_entrypoint {
281                    entrypoints.push(file_path.clone());
282                }
283
284                file_analyses.push(analysis);
285            }
286        }
287
288        // Determine project type and primary language
289        let (project_type, primary_language) = self.detect_project_type(root, &language_counts);
290
291        let languages: Vec<String> = language_counts.keys().cloned().collect();
292
293        // Detect modules
294        let modules = self.detect_modules(root, &file_analyses);
295
296        // Find shared utilities
297        let shared_utilities = self.find_shared_utilities(&file_analyses);
298
299        // Detect composition opportunities
300        let compositions = if self.config.detect_compositions {
301            self.detect_compositions(&file_analyses, &modules)
302        } else {
303            Vec::new()
304        };
305
306        Ok(ProjectAnalysis {
307            root: root.to_path_buf(),
308            name,
309            project_type,
310            primary_language,
311            languages,
312            total_files,
313            total_lines,
314            files: file_analyses,
315            entrypoints,
316            modules,
317            shared_utilities,
318            compositions,
319        })
320    }
321
322    /// Import a project - analyze and create all chunks
323    pub fn import_project(&self, root: &Path) -> std::io::Result<ImportResult> {
324        let start = std::time::Instant::now();
325
326        // Analyze project
327        let analysis = self.analyze_project(root)?;
328
329        // Create chunks
330        let mut chunks = Vec::new();
331        let mut alias_registry = AliasRegistry::new();
332        let mut skipped_files = 0;
333        let mut categories: HashMap<String, usize> = HashMap::new();
334
335        for file_analysis in &analysis.files {
336            if let Ok(content) = std::fs::read_to_string(&file_analysis.path) {
337                let decision = self.chunker.decide_chunking(file_analysis);
338
339                if decision.strategy == ChunkingStrategy::Skip {
340                    skipped_files += 1;
341                    continue;
342                }
343
344                let _relative_path = file_analysis
345                    .path
346                    .clone();
347
348                let file_chunks =
349                    self.chunker
350                        .generate_chunks(&file_analysis.path, &content, &decision);
351
352                for mut chunk in file_chunks {
353                    // Add namespace if configured
354                    if let Some(ref ns) = self.config.namespace {
355                        for alias in &mut chunk.aliases {
356                            alias.namespace = Some(ns.clone());
357                        }
358                    }
359
360                    // Register aliases
361                    for alias in &chunk.aliases {
362                        let alias_path = alias.full_path();
363                        let unique_alias = alias_registry.generate_unique(&alias_path);
364                        alias_registry.register(&unique_alias, &chunk.chunk_id);
365                    }
366
367                    // Count categories
368                    for cat in &chunk.categories {
369                        let cat_str = format!("{:?}", cat);
370                        *categories.entry(cat_str).or_insert(0) += 1;
371                    }
372
373                    chunks.push(chunk);
374                }
375            }
376        }
377
378        // Create composition chunks
379        let mut compositions = Vec::new();
380        for suggestion in &analysis.compositions {
381            if let Some(comp_chunk) =
382                self.create_composition_chunk(suggestion, &chunks, &mut alias_registry)
383            {
384                compositions.push(comp_chunk);
385            }
386        }
387
388        let duration_ms = start.elapsed().as_millis();
389
390        let summary = ImportSummary {
391            project_name: analysis.name.clone(),
392            project_type: format!("{:?}", analysis.project_type),
393            total_files: analysis.total_files,
394            total_lines: analysis.total_lines,
395            atomic_chunks: chunks.len(),
396            composition_chunks: compositions.len(),
397            skipped_files,
398            categories,
399            aliases_created: alias_registry.aliases.len(),
400            duration_ms,
401        };
402
403        Ok(ImportResult {
404            analysis,
405            chunks,
406            alias_registry,
407            compositions,
408            summary,
409        })
410    }
411
412    // ========================================================================
413    // Private helpers
414    // ========================================================================
415
416    fn collect_files(&self, root: &Path) -> std::io::Result<Vec<PathBuf>> {
417        let mut files = Vec::new();
418        self.collect_files_recursive(root, &mut files)?;
419        Ok(files)
420    }
421
422    fn collect_files_recursive(&self, dir: &Path, files: &mut Vec<PathBuf>) -> std::io::Result<()> {
423        if !dir.is_dir() {
424            return Ok(());
425        }
426
427        for entry in std::fs::read_dir(dir)? {
428            let entry = entry?;
429            let path = entry.path();
430
431            if path.is_dir() {
432                let dir_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
433
434                // Skip ignored directories
435                if self.config.ignore_dirs.contains(&dir_name.to_string()) {
436                    continue;
437                }
438                if dir_name.starts_with('.') {
439                    continue;
440                }
441
442                self.collect_files_recursive(&path, files)?;
443            } else if path.is_file() {
444                let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
445
446                // Check against ignore patterns
447                let should_ignore = self.config.ignore_patterns.iter().any(|pattern| {
448                    if pattern.starts_with('*') {
449                        file_name.ends_with(&pattern[1..])
450                    } else {
451                        file_name == pattern
452                    }
453                });
454
455                if should_ignore {
456                    continue;
457                }
458
459                // Check file size
460                if let Ok(metadata) = path.metadata() {
461                    if metadata.len() as usize > self.config.max_file_size {
462                        continue;
463                    }
464                }
465
466                // Check if it's a source file
467                if self.is_source_file(&path) {
468                    files.push(path);
469                }
470            }
471        }
472
473        Ok(())
474    }
475
476    fn is_source_file(&self, path: &Path) -> bool {
477        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
478        matches!(
479            ext,
480            "rs"
481                | "ts"
482                | "tsx"
483                | "js"
484                | "jsx"
485                | "mjs"
486                | "cjs"
487                | "py"
488                | "pyi"
489                | "go"
490                | "c"
491                | "h"
492                | "cpp"
493                | "hpp"
494                | "cc"
495                | "cxx"
496                | "java"
497                | "kt"
498                | "swift"
499                | "rb"
500                | "php"
501                | "scala"
502                | "cs"
503                | "zig"
504                | "md"
505                | "json"
506                | "yaml"
507                | "yml"
508                | "toml"
509                | "html"
510                | "css"
511                | "scss"
512                | "sql"
513                | "sh"
514        )
515    }
516
517    fn detect_project_type(
518        &self,
519        root: &Path,
520        language_counts: &HashMap<String, usize>,
521    ) -> (ProjectType, String) {
522        // Check for project markers
523        if root.join("Cargo.toml").exists() {
524            return (ProjectType::Rust, "rust".to_string());
525        }
526        if root.join("tsconfig.json").exists() {
527            return (ProjectType::TypeScript, "typescript".to_string());
528        }
529        if root.join("package.json").exists() {
530            // Could be TS or JS
531            if language_counts.get("typescript").unwrap_or(&0)
532                > language_counts.get("javascript").unwrap_or(&0)
533            {
534                return (ProjectType::TypeScript, "typescript".to_string());
535            }
536            return (ProjectType::JavaScript, "javascript".to_string());
537        }
538        if root.join("pyproject.toml").exists() || root.join("setup.py").exists() {
539            return (ProjectType::Python, "python".to_string());
540        }
541        if root.join("go.mod").exists() {
542            return (ProjectType::Go, "go".to_string());
543        }
544        if root.join("CMakeLists.txt").exists() || root.join("Makefile").exists() {
545            let cpp_count = language_counts.get("cpp").unwrap_or(&0);
546            let c_count = language_counts.get("c").unwrap_or(&0);
547            if cpp_count > c_count {
548                return (ProjectType::Cpp, "cpp".to_string());
549            }
550            return (ProjectType::C, "c".to_string());
551        }
552        if root.join("pom.xml").exists() || root.join("build.gradle").exists() {
553            return (ProjectType::Java, "java".to_string());
554        }
555
556        // Fallback to most common language
557        let primary = language_counts
558            .iter()
559            .max_by_key(|(_, count)| *count)
560            .map(|(lang, _)| lang.clone())
561            .unwrap_or_else(|| "unknown".to_string());
562
563        if language_counts.len() > 2 {
564            (ProjectType::Mixed, primary)
565        } else {
566            (ProjectType::Unknown, primary)
567        }
568    }
569
570    fn detect_modules(&self, root: &Path, files: &[FileAnalysis]) -> Vec<ModuleInfo> {
571        let mut modules: HashMap<PathBuf, ModuleInfo> = HashMap::new();
572
573        for file in files {
574            // Get the parent directory as the module
575            let module_path = file
576                .path
577                .parent()
578                .unwrap_or(&file.path)
579                .strip_prefix(root)
580                .unwrap_or(file.path.parent().unwrap_or(&file.path));
581
582            let module_name = module_path
583                .file_name()
584                .and_then(|n| n.to_str())
585                .unwrap_or("root")
586                .to_string();
587
588            let entry = modules
589                .entry(module_path.to_path_buf())
590                .or_insert_with(|| ModuleInfo {
591                    name: module_name,
592                    path: module_path.to_path_buf(),
593                    files: Vec::new(),
594                    is_entrypoint: false,
595                    exports: Vec::new(),
596                    category: ChunkCategory::Logic,
597                });
598
599            entry.files.push(file.path.clone());
600            entry.exports.extend(file.exports.clone());
601
602            if file.is_entrypoint {
603                entry.is_entrypoint = true;
604            }
605        }
606
607        modules.into_values().collect()
608    }
609
610    fn find_shared_utilities(&self, files: &[FileAnalysis]) -> Vec<PathBuf> {
611        // Find files that are imported by many others
612        let mut import_counts: HashMap<String, usize> = HashMap::new();
613
614        for file in files {
615            for import in &file.imports {
616                *import_counts.entry(import.clone()).or_insert(0) += 1;
617            }
618        }
619
620        // Files imported by 3+ others are considered shared
621        let shared_imports: HashSet<String> = import_counts
622            .into_iter()
623            .filter(|(_, count)| *count >= 3)
624            .map(|(import, _)| import)
625            .collect();
626
627        // Match imports to actual files (simplified)
628        files
629            .iter()
630            .filter(|f| {
631                let file_stem = f.path.file_stem().and_then(|n| n.to_str()).unwrap_or("");
632                shared_imports.iter().any(|imp| imp.contains(file_stem))
633            })
634            .map(|f| f.path.clone())
635            .collect()
636    }
637
638    fn detect_compositions(
639        &self,
640        files: &[FileAnalysis],
641        modules: &[ModuleInfo],
642    ) -> Vec<CompositionSuggestion> {
643        let mut suggestions = Vec::new();
644
645        // Suggest module-level compositions
646        for module in modules {
647            if module.files.len() >= self.config.min_composition_files {
648                suggestions.push(CompositionSuggestion {
649                    name: format!("{}-module", module.name),
650                    description: format!("Complete {} module", module.name),
651                    files: module.files.clone(),
652                    category: module.category.clone(),
653                    reason: format!(
654                        "Module contains {} related files",
655                        module.files.len()
656                    ),
657                });
658            }
659        }
660
661        // Suggest test compositions
662        let test_files: Vec<PathBuf> = files
663            .iter()
664            .filter(|f| f.is_test)
665            .map(|f| f.path.clone())
666            .collect();
667
668        if test_files.len() >= 2 {
669            suggestions.push(CompositionSuggestion {
670                name: "test-suite".to_string(),
671                description: "Complete test suite".to_string(),
672                files: test_files,
673                category: ChunkCategory::Test,
674                reason: "Grouped all test files together".to_string(),
675            });
676        }
677
678        // Suggest utility compositions
679        let util_files: Vec<PathBuf> = files
680            .iter()
681            .filter(|f| matches!(f.category, ChunkCategory::Utility))
682            .map(|f| f.path.clone())
683            .collect();
684
685        if util_files.len() >= 2 {
686            suggestions.push(CompositionSuggestion {
687                name: "utilities".to_string(),
688                description: "Shared utility functions".to_string(),
689                files: util_files,
690                category: ChunkCategory::Utility,
691                reason: "Grouped utility files together".to_string(),
692            });
693        }
694
695        suggestions
696    }
697
698    fn create_composition_chunk(
699        &self,
700        suggestion: &CompositionSuggestion,
701        chunks: &[AtomicChunk],
702        registry: &mut AliasRegistry,
703    ) -> Option<AtomicChunk> {
704        // Find chunks for the files in this composition
705        let mut component_chunks: Vec<&AtomicChunk> = Vec::new();
706
707        for file in &suggestion.files {
708            let file_str = file.to_string_lossy();
709            for chunk in chunks {
710                if chunk.sources.iter().any(|s| file_str.contains(&s.file)) {
711                    component_chunks.push(chunk);
712                }
713            }
714        }
715
716        if component_chunks.len() < self.config.min_composition_files {
717            return None;
718        }
719
720        // Create composition content hash from component hashes
721        let mut hasher = Sha256::new();
722        for chunk in &component_chunks {
723            hasher.update(chunk.chunk_id.as_bytes());
724        }
725        let content_hash = hex::encode(hasher.finalize());
726        let chunk_id = format!("chunk:sha256:{}", content_hash);
727
728        // Create references to component chunks
729        let composed_of: Vec<ChunkReference> = component_chunks
730            .iter()
731            .map(|c| ChunkReference {
732                chunk_id: c.chunk_id.clone(),
733                alias: c.primary_alias().map(|a| a.full_path()),
734                required: true,
735                imports: c.provides.clone(),
736            })
737            .collect();
738
739        let total_size: usize = component_chunks.iter().map(|c| c.size).sum();
740
741        let alias = registry.generate_unique(&suggestion.name);
742        registry.register(&alias, &chunk_id);
743
744        let mut chunk = AtomicChunk::new(
745            chunk_id,
746            suggestion.name.clone(),
747            "composition".to_string(),
748            content_hash,
749            total_size,
750        )
751        .with_alias(&alias)
752        .with_granularity(ChunkGranularity::Package)
753        .with_categories(vec![suggestion.category.clone()]);
754
755        chunk.description = Some(suggestion.description.clone());
756        chunk.composition = ChunkComposition {
757            composed_of,
758            composed_by: Vec::new(),
759            is_atomic: false,
760            composition_strategy: Some("aggregate".to_string()),
761        };
762
763        // Aggregate concepts from components
764        chunk.concepts = component_chunks
765            .iter()
766            .flat_map(|c| c.concepts.clone())
767            .collect::<HashSet<_>>()
768            .into_iter()
769            .collect();
770
771        chunk.provides = component_chunks
772            .iter()
773            .flat_map(|c| c.provides.clone())
774            .collect::<HashSet<_>>()
775            .into_iter()
776            .collect();
777
778        Some(chunk)
779    }
780}
781
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_project_type_detection() {
        let analyzer = ProjectAnalyzer::default();

        let counts: HashMap<String, usize> = HashMap::from([
            ("rust".to_string(), 10),
            ("toml".to_string(), 2),
        ]);

        // No marker files exist at the fake path, so detection falls back
        // to the most common language.
        let (_project_type, lang) =
            analyzer.detect_project_type(Path::new("/fake/path"), &counts);
        assert_eq!(lang, "rust");
    }

    #[test]
    fn test_is_source_file() {
        let analyzer = ProjectAnalyzer::default();

        for accepted in ["test.rs", "test.ts", "test.py"] {
            assert!(analyzer.is_source_file(Path::new(accepted)));
        }
        for rejected in ["test.exe", "test.bin"] {
            assert!(!analyzer.is_source_file(Path::new(rejected)));
        }
    }
}