use crate::atomic::{
    AliasRegistry, AtomicChunk, ChunkCategory, ChunkComposition,
    ChunkGranularity, ChunkReference,
};
use crate::smart_chunker::{ChunkingStrategy, FileAnalysis, SmartChunker, SmartChunkerConfig};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};

/// Kind of project detected from manifest files and language statistics.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ProjectType {
    Rust,
    TypeScript,
    JavaScript,
    Python,
    Go,
    C,
    Cpp,
    Java,
    Mixed,
    Unknown,
}

impl ProjectType {
    /// Canonical language identifier for this project type.
    pub fn primary_language(&self) -> &str {
        match self {
            ProjectType::Rust => "rust",
            ProjectType::TypeScript => "typescript",
            ProjectType::JavaScript => "javascript",
            ProjectType::Python => "python",
            ProjectType::Go => "go",
            ProjectType::C => "c",
            ProjectType::Cpp => "cpp",
            ProjectType::Java => "java",
            ProjectType::Mixed => "mixed",
            ProjectType::Unknown => "unknown",
        }
    }
}

/// Configuration for [`ProjectAnalyzer`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProjectAnalyzerConfig {
    /// Directory names that are skipped entirely during traversal.
    #[serde(default = "default_ignore_dirs")]
    pub ignore_dirs: Vec<String>,

    /// File name patterns (exact names or `*`-prefixed suffixes) to skip.
    #[serde(default = "default_ignore_patterns")]
    pub ignore_patterns: Vec<String>,

    /// Maximum file size in bytes; larger files are skipped.
    #[serde(default = "default_max_file_size")]
    pub max_file_size: usize,

    /// Configuration passed through to the underlying [`SmartChunker`].
    #[serde(default)]
    pub chunker_config: SmartChunkerConfig,

    /// Whether to suggest composition chunks (modules, test suites, utilities).
    #[serde(default = "default_true")]
    pub detect_compositions: bool,

    /// Whether small files may be merged into larger chunks.
    #[serde(default = "default_true")]
    pub merge_small_files: bool,

    /// Minimum number of files required before a composition is suggested.
    #[serde(default = "default_min_composition_files")]
    pub min_composition_files: usize,

    /// Optional namespace applied to every generated alias.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub namespace: Option<String>,
}

fn default_ignore_dirs() -> Vec<String> {
    vec![
        ".git".to_string(),
        "node_modules".to_string(),
        "target".to_string(),
        "__pycache__".to_string(),
        ".venv".to_string(),
        "venv".to_string(),
        "dist".to_string(),
        "build".to_string(),
        ".next".to_string(),
        ".cache".to_string(),
        "coverage".to_string(),
    ]
}

fn default_ignore_patterns() -> Vec<String> {
    vec![
        "*.lock".to_string(),
        "*.log".to_string(),
        ".DS_Store".to_string(),
        "*.min.js".to_string(),
        "*.min.css".to_string(),
        "*.map".to_string(),
    ]
}

fn default_max_file_size() -> usize {
    10 * 1024 * 1024 // 10 MiB
}

fn default_true() -> bool {
    true
}

fn default_min_composition_files() -> usize {
    2
}

impl Default for ProjectAnalyzerConfig {
    fn default() -> Self {
        Self {
            ignore_dirs: default_ignore_dirs(),
            ignore_patterns: default_ignore_patterns(),
            max_file_size: default_max_file_size(),
            chunker_config: SmartChunkerConfig::default(),
            detect_compositions: true,
            merge_small_files: true,
            min_composition_files: 2,
            namespace: None,
        }
    }
}

/// Result of analyzing a project directory without generating chunks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProjectAnalysis {
    /// Root directory of the project.
    pub root: PathBuf,

    /// Project name, derived from the root directory name.
    pub name: String,

    /// Detected project type.
    pub project_type: ProjectType,

    /// Primary language of the project.
    pub primary_language: String,

    /// All languages detected across the project's files.
    pub languages: Vec<String>,

    /// Number of source files analyzed.
    pub total_files: usize,

    /// Total number of lines across analyzed files.
    pub total_lines: usize,

    /// Per-file analysis results.
    pub files: Vec<FileAnalysis>,

    /// Files identified as entrypoints.
    pub entrypoints: Vec<PathBuf>,

    /// Modules detected from the directory structure.
    pub modules: Vec<ModuleInfo>,

    /// Files that appear to be shared utilities (imported by many files).
    pub shared_utilities: Vec<PathBuf>,

    /// Suggested composition chunks.
    pub compositions: Vec<CompositionSuggestion>,
}

/// A module detected from the directory layout.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModuleInfo {
    pub name: String,
    pub path: PathBuf,
    pub files: Vec<PathBuf>,
    pub is_entrypoint: bool,
    pub exports: Vec<String>,
    pub category: ChunkCategory,
}

/// A suggested grouping of files into a composition chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompositionSuggestion {
    pub name: String,
    pub description: String,
    pub files: Vec<PathBuf>,
    pub category: ChunkCategory,
    pub reason: String,
}

/// Full result of importing a project as atomic and composition chunks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportResult {
    /// The project analysis the import was based on.
    pub analysis: ProjectAnalysis,

    /// Atomic chunks generated from individual files.
    pub chunks: Vec<AtomicChunk>,

    /// Registry of aliases created during the import.
    pub alias_registry: AliasRegistry,

    /// Composition chunks built from the suggestions.
    pub compositions: Vec<AtomicChunk>,

    /// Summary statistics for the import.
    pub summary: ImportSummary,
}

/// Summary statistics produced by [`ProjectAnalyzer::import_project`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportSummary {
    pub project_name: String,
    pub project_type: String,
    pub total_files: usize,
    pub total_lines: usize,
    pub atomic_chunks: usize,
    pub composition_chunks: usize,
    pub skipped_files: usize,
    pub categories: HashMap<String, usize>,
    pub aliases_created: usize,
    pub duration_ms: u128,
}

/// Analyzes a project directory and turns its files into chunks.
pub struct ProjectAnalyzer {
    config: ProjectAnalyzerConfig,
    chunker: SmartChunker,
}

impl ProjectAnalyzer {
    /// Create an analyzer with the given configuration.
    pub fn new(config: ProjectAnalyzerConfig) -> Self {
        let chunker = SmartChunker::new(config.chunker_config.clone());
        Self { config, chunker }
    }

    /// Create an analyzer with the default configuration.
    pub fn default() -> Self {
        Self::new(ProjectAnalyzerConfig::default())
    }

    /// Analyze a project directory without generating chunks.
    pub fn analyze_project(&self, root: &Path) -> std::io::Result<ProjectAnalysis> {
        let name = root
            .file_name()
            .and_then(|n| n.to_str())
            .unwrap_or("project")
            .to_string();

        let files = self.collect_files(root)?;
        let total_files = files.len();

        let mut file_analyses = Vec::new();
        let mut total_lines = 0;
        let mut language_counts: HashMap<String, usize> = HashMap::new();
        let mut entrypoints = Vec::new();

        for file_path in &files {
            if let Ok(content) = std::fs::read_to_string(file_path) {
                let analysis = self.chunker.analyze_file(file_path, &content);
                total_lines += analysis.total_lines;

                *language_counts.entry(analysis.language.clone()).or_insert(0) += 1;

                if analysis.is_entrypoint {
                    entrypoints.push(file_path.clone());
                }

                file_analyses.push(analysis);
            }
        }

        let (project_type, primary_language) = self.detect_project_type(root, &language_counts);

        let languages: Vec<String> = language_counts.keys().cloned().collect();

        let modules = self.detect_modules(root, &file_analyses);

        let shared_utilities = self.find_shared_utilities(&file_analyses);

        let compositions = if self.config.detect_compositions {
            self.detect_compositions(&file_analyses, &modules)
        } else {
            Vec::new()
        };

        Ok(ProjectAnalysis {
            root: root.to_path_buf(),
            name,
            project_type,
            primary_language,
            languages,
            total_files,
            total_lines,
            files: file_analyses,
            entrypoints,
            modules,
            shared_utilities,
            compositions,
        })
    }

    /// Analyze a project and generate atomic chunks, aliases, and composition chunks.
    pub fn import_project(&self, root: &Path) -> std::io::Result<ImportResult> {
        let start = std::time::Instant::now();

        let analysis = self.analyze_project(root)?;

        let mut chunks = Vec::new();
        let mut alias_registry = AliasRegistry::new();
        let mut skipped_files = 0;
        let mut categories: HashMap<String, usize> = HashMap::new();

        for file_analysis in &analysis.files {
            if let Ok(content) = std::fs::read_to_string(&file_analysis.path) {
                let decision = self.chunker.decide_chunking(file_analysis);

                if decision.strategy == ChunkingStrategy::Skip {
                    skipped_files += 1;
                    continue;
                }

                let file_chunks =
                    self.chunker
                        .generate_chunks(&file_analysis.path, &content, &decision);

                for mut chunk in file_chunks {
                    // Apply the configured namespace to every alias on the chunk.
                    if let Some(ref ns) = self.config.namespace {
                        for alias in &mut chunk.aliases {
                            alias.namespace = Some(ns.clone());
                        }
                    }

                    // Register each alias, de-duplicating against the registry.
                    for alias in &chunk.aliases {
                        let alias_path = alias.full_path();
                        let unique_alias = alias_registry.generate_unique(&alias_path);
                        alias_registry.register(&unique_alias, &chunk.chunk_id);
                    }

                    for cat in &chunk.categories {
                        let cat_str = format!("{:?}", cat);
                        *categories.entry(cat_str).or_insert(0) += 1;
                    }

                    chunks.push(chunk);
                }
            }
        }

        // Build composition chunks from the analysis suggestions.
        let mut compositions = Vec::new();
        for suggestion in &analysis.compositions {
            if let Some(comp_chunk) =
                self.create_composition_chunk(suggestion, &chunks, &mut alias_registry)
            {
                compositions.push(comp_chunk);
            }
        }

        let duration_ms = start.elapsed().as_millis();

        let summary = ImportSummary {
            project_name: analysis.name.clone(),
            project_type: format!("{:?}", analysis.project_type),
            total_files: analysis.total_files,
            total_lines: analysis.total_lines,
            atomic_chunks: chunks.len(),
            composition_chunks: compositions.len(),
            skipped_files,
            categories,
            aliases_created: alias_registry.aliases.len(),
            duration_ms,
        };

        Ok(ImportResult {
            analysis,
            chunks,
            alias_registry,
            compositions,
            summary,
        })
    }

    /// Collect all source files under `root`, honoring the ignore rules.
    fn collect_files(&self, root: &Path) -> std::io::Result<Vec<PathBuf>> {
        let mut files = Vec::new();
        self.collect_files_recursive(root, &mut files)?;
        Ok(files)
    }

    fn collect_files_recursive(&self, dir: &Path, files: &mut Vec<PathBuf>) -> std::io::Result<()> {
        if !dir.is_dir() {
            return Ok(());
        }

        for entry in std::fs::read_dir(dir)? {
            let entry = entry?;
            let path = entry.path();

            if path.is_dir() {
                let dir_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");

                // Skip ignored and hidden directories.
                if self.config.ignore_dirs.contains(&dir_name.to_string()) {
                    continue;
                }
                if dir_name.starts_with('.') {
                    continue;
                }

                self.collect_files_recursive(&path, files)?;
            } else if path.is_file() {
                let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");

                // Patterns are either exact file names or `*`-prefixed suffixes.
                let should_ignore = self.config.ignore_patterns.iter().any(|pattern| {
                    if pattern.starts_with('*') {
                        file_name.ends_with(&pattern[1..])
                    } else {
                        file_name == pattern
                    }
                });

                if should_ignore {
                    continue;
                }

                // Skip files that exceed the configured size limit.
                if let Ok(metadata) = path.metadata() {
                    if metadata.len() as usize > self.config.max_file_size {
                        continue;
                    }
                }

                if self.is_source_file(&path) {
                    files.push(path);
                }
            }
        }

        Ok(())
    }

    fn is_source_file(&self, path: &Path) -> bool {
        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
        matches!(
            ext,
            "rs"
                | "ts"
                | "tsx"
                | "js"
                | "jsx"
                | "mjs"
                | "cjs"
                | "py"
                | "pyi"
                | "go"
                | "c"
                | "h"
                | "cpp"
                | "hpp"
                | "cc"
                | "cxx"
                | "java"
                | "kt"
                | "swift"
                | "rb"
                | "php"
                | "scala"
                | "cs"
                | "zig"
                | "md"
                | "json"
                | "yaml"
                | "yml"
                | "toml"
                | "html"
                | "css"
                | "scss"
                | "sql"
                | "sh"
        )
    }

    /// Detect the project type from manifest files, falling back to language counts.
    fn detect_project_type(
        &self,
        root: &Path,
        language_counts: &HashMap<String, usize>,
    ) -> (ProjectType, String) {
        // Manifest files take precedence over language statistics.
        if root.join("Cargo.toml").exists() {
            return (ProjectType::Rust, "rust".to_string());
        }
        if root.join("tsconfig.json").exists() {
            return (ProjectType::TypeScript, "typescript".to_string());
        }
        if root.join("package.json").exists() {
            // Prefer TypeScript when it outnumbers JavaScript files.
            if language_counts.get("typescript").unwrap_or(&0)
                > language_counts.get("javascript").unwrap_or(&0)
            {
                return (ProjectType::TypeScript, "typescript".to_string());
            }
            return (ProjectType::JavaScript, "javascript".to_string());
        }
        if root.join("pyproject.toml").exists() || root.join("setup.py").exists() {
            return (ProjectType::Python, "python".to_string());
        }
        if root.join("go.mod").exists() {
            return (ProjectType::Go, "go".to_string());
        }
        if root.join("CMakeLists.txt").exists() || root.join("Makefile").exists() {
            let cpp_count = language_counts.get("cpp").unwrap_or(&0);
            let c_count = language_counts.get("c").unwrap_or(&0);
            if cpp_count > c_count {
                return (ProjectType::Cpp, "cpp".to_string());
            }
            return (ProjectType::C, "c".to_string());
        }
        if root.join("pom.xml").exists() || root.join("build.gradle").exists() {
            return (ProjectType::Java, "java".to_string());
        }

        // No manifest found: fall back to the most common language.
        let primary = language_counts
            .iter()
            .max_by_key(|(_, count)| *count)
            .map(|(lang, _)| lang.clone())
            .unwrap_or_else(|| "unknown".to_string());

        if language_counts.len() > 2 {
            (ProjectType::Mixed, primary)
        } else {
            (ProjectType::Unknown, primary)
        }
    }

    /// Group files into modules keyed by their parent directory (relative to `root`).
    fn detect_modules(&self, root: &Path, files: &[FileAnalysis]) -> Vec<ModuleInfo> {
        let mut modules: HashMap<PathBuf, ModuleInfo> = HashMap::new();

        for file in files {
            let module_path = file
                .path
                .parent()
                .unwrap_or(&file.path)
                .strip_prefix(root)
                .unwrap_or(file.path.parent().unwrap_or(&file.path));

            let module_name = module_path
                .file_name()
                .and_then(|n| n.to_str())
                .unwrap_or("root")
                .to_string();

            let entry = modules
                .entry(module_path.to_path_buf())
                .or_insert_with(|| ModuleInfo {
                    name: module_name,
                    path: module_path.to_path_buf(),
                    files: Vec::new(),
                    is_entrypoint: false,
                    exports: Vec::new(),
                    category: ChunkCategory::Logic,
                });

            entry.files.push(file.path.clone());
            entry.exports.extend(file.exports.clone());

            if file.is_entrypoint {
                entry.is_entrypoint = true;
            }
        }

        modules.into_values().collect()
    }

    /// Find files that appear to be shared utilities, i.e. imported by many files.
    fn find_shared_utilities(&self, files: &[FileAnalysis]) -> Vec<PathBuf> {
        // Count how often each import path appears across the project.
        let mut import_counts: HashMap<String, usize> = HashMap::new();

        for file in files {
            for import in &file.imports {
                *import_counts.entry(import.clone()).or_insert(0) += 1;
            }
        }

        // Imports referenced at least three times are considered shared.
        let shared_imports: HashSet<String> = import_counts
            .into_iter()
            .filter(|(_, count)| *count >= 3)
            .map(|(import, _)| import)
            .collect();

        // A file is a shared utility if its stem appears in one of those imports.
        files
            .iter()
            .filter(|f| {
                let file_stem = f.path.file_stem().and_then(|n| n.to_str()).unwrap_or("");
                shared_imports.iter().any(|imp| imp.contains(file_stem))
            })
            .map(|f| f.path.clone())
            .collect()
    }

    /// Suggest composition chunks: one per sufficiently large module, plus
    /// project-wide test and utility groupings.
    fn detect_compositions(
        &self,
        files: &[FileAnalysis],
        modules: &[ModuleInfo],
    ) -> Vec<CompositionSuggestion> {
        let mut suggestions = Vec::new();

        // One composition per module that has enough files.
        for module in modules {
            if module.files.len() >= self.config.min_composition_files {
                suggestions.push(CompositionSuggestion {
                    name: format!("{}-module", module.name),
                    description: format!("Complete {} module", module.name),
                    files: module.files.clone(),
                    category: module.category.clone(),
                    reason: format!("Module contains {} related files", module.files.len()),
                });
            }
        }

        // Group all test files into a single test-suite composition.
        let test_files: Vec<PathBuf> = files
            .iter()
            .filter(|f| f.is_test)
            .map(|f| f.path.clone())
            .collect();

        if test_files.len() >= 2 {
            suggestions.push(CompositionSuggestion {
                name: "test-suite".to_string(),
                description: "Complete test suite".to_string(),
                files: test_files,
                category: ChunkCategory::Test,
                reason: "Grouped all test files together".to_string(),
            });
        }

        // Group utility files into a single utilities composition.
        let util_files: Vec<PathBuf> = files
            .iter()
            .filter(|f| matches!(f.category, ChunkCategory::Utility))
            .map(|f| f.path.clone())
            .collect();

        if util_files.len() >= 2 {
            suggestions.push(CompositionSuggestion {
                name: "utilities".to_string(),
                description: "Shared utility functions".to_string(),
                files: util_files,
                category: ChunkCategory::Utility,
                reason: "Grouped utility files together".to_string(),
            });
        }

        suggestions
    }

    /// Build a composition chunk from a suggestion, referencing already-generated
    /// atomic chunks. Returns `None` if too few component chunks are found.
    fn create_composition_chunk(
        &self,
        suggestion: &CompositionSuggestion,
        chunks: &[AtomicChunk],
        registry: &mut AliasRegistry,
    ) -> Option<AtomicChunk> {
        // Find the atomic chunks whose sources belong to the suggested files.
        let mut component_chunks: Vec<&AtomicChunk> = Vec::new();

        for file in &suggestion.files {
            let file_str = file.to_string_lossy();
            for chunk in chunks {
                if chunk.sources.iter().any(|s| file_str.contains(&s.file)) {
                    component_chunks.push(chunk);
                }
            }
        }

        if component_chunks.len() < self.config.min_composition_files {
            return None;
        }

        // Derive a stable chunk id from the component chunk ids.
        let mut hasher = Sha256::new();
        for chunk in &component_chunks {
            hasher.update(chunk.chunk_id.as_bytes());
        }
        let content_hash = hex::encode(hasher.finalize());
        let chunk_id = format!("chunk:sha256:{}", content_hash);

        // Reference every component chunk from the composition.
        let composed_of: Vec<ChunkReference> = component_chunks
            .iter()
            .map(|c| ChunkReference {
                chunk_id: c.chunk_id.clone(),
                alias: c.primary_alias().map(|a| a.full_path()),
                required: true,
                imports: c.provides.clone(),
            })
            .collect();

        let total_size: usize = component_chunks.iter().map(|c| c.size).sum();

        let alias = registry.generate_unique(&suggestion.name);
        registry.register(&alias, &chunk_id);

        let mut chunk = AtomicChunk::new(
            chunk_id,
            suggestion.name.clone(),
            "composition".to_string(),
            content_hash,
            total_size,
        )
        .with_alias(&alias)
        .with_granularity(ChunkGranularity::Package)
        .with_categories(vec![suggestion.category.clone()]);

        chunk.description = Some(suggestion.description.clone());
        chunk.composition = ChunkComposition {
            composed_of,
            composed_by: Vec::new(),
            is_atomic: false,
            composition_strategy: Some("aggregate".to_string()),
        };

        // Union the concepts and provided symbols of all components.
        chunk.concepts = component_chunks
            .iter()
            .flat_map(|c| c.concepts.clone())
            .collect::<HashSet<_>>()
            .into_iter()
            .collect();

        chunk.provides = component_chunks
            .iter()
            .flat_map(|c| c.provides.clone())
            .collect::<HashSet<_>>()
            .into_iter()
            .collect();

        Some(chunk)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_project_type_detection() {
        let analyzer = ProjectAnalyzer::default();
        let mut counts = HashMap::new();
        counts.insert("rust".to_string(), 10);
        counts.insert("toml".to_string(), 2);

        let (_project_type, lang) =
            analyzer.detect_project_type(Path::new("/fake/path"), &counts);
        assert_eq!(lang, "rust");
    }

    #[test]
    fn test_is_source_file() {
        let analyzer = ProjectAnalyzer::default();
        assert!(analyzer.is_source_file(Path::new("test.rs")));
        assert!(analyzer.is_source_file(Path::new("test.ts")));
        assert!(analyzer.is_source_file(Path::new("test.py")));
        assert!(!analyzer.is_source_file(Path::new("test.exe")));
        assert!(!analyzer.is_source_file(Path::new("test.bin")));
    }
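
    // A minimal sketch of a test for `collect_files`, exercising the ignore rules
    // defined in this module. It assumes the test process may create files under the
    // system temp directory; the directory name `project_analyzer_collect_test` is
    // arbitrary and is removed afterwards.
    #[test]
    fn test_collect_files_respects_ignore_rules() {
        let root = std::env::temp_dir().join("project_analyzer_collect_test");
        let _ = std::fs::remove_dir_all(&root);
        std::fs::create_dir_all(root.join("src")).unwrap();
        std::fs::create_dir_all(root.join("node_modules")).unwrap();
        std::fs::write(root.join("src").join("lib.rs"), "pub fn x() {}\n").unwrap();
        std::fs::write(root.join("node_modules").join("dep.js"), "x\n").unwrap();
        std::fs::write(root.join("app.min.js"), "var a=1;\n").unwrap();

        let analyzer = ProjectAnalyzer::default();
        let files = analyzer.collect_files(&root).unwrap();

        // Source files are collected; ignored directories and patterns are skipped.
        assert!(files.iter().any(|p| p.ends_with("lib.rs")));
        assert!(!files.iter().any(|p| p.to_string_lossy().contains("node_modules")));
        assert!(!files.iter().any(|p| p.ends_with("app.min.js")));

        let _ = std::fs::remove_dir_all(&root);
    }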
}