use once_cell::sync::Lazy;
use regex::Regex;
use scribe_core::{Language, Result, ScribeError};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::Path;
use tree_sitter::{Language as TsLanguage, Node, Parser};

/// Strategy used when resolving the language of a file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum DetectionStrategy {
    /// Use only the file extension.
    ExtensionOnly,
    /// Combine extension and filename matching with lightweight content analysis.
    ExtensionWithContent,
    /// Run the full pipeline, including AST-based statistical analysis.
    FullAnalysis,
    /// Apply caller-supplied rules, falling back to content analysis.
    Custom(CustomDetectionRules),
}

/// Caller-supplied overrides used by [`DetectionStrategy::Custom`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CustomDetectionRules {
    pub extension_overrides: HashMap<String, Language>,
    pub filename_patterns: HashMap<String, Language>,
    pub content_signatures: Vec<ContentSignatureConfig>,
    pub priority_languages: Vec<Language>,
}

/// Compiled regex patterns that indicate a particular language.
#[derive(Debug, Clone)]
pub struct ContentSignature {
    pub language: Language,
    pub patterns: Vec<Regex>,
    pub weight: f32,
    pub required_matches: usize,
}

/// Serializable counterpart of [`ContentSignature`] with patterns as strings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentSignatureConfig {
    pub language: Language,
    pub patterns: Vec<String>,
    pub weight: f32,
    pub required_matches: usize,
}

/// Project-level context that can bias detection results.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageHints {
    pub project_type: Option<ProjectType>,
    pub build_files: Vec<String>,
    pub directory_structure: Vec<String>,
    pub dominant_languages: Vec<Language>,
    pub framework_indicators: Vec<String>,
}

/// Broad category of project a file belongs to.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ProjectType {
    WebFrontend,
    WebBackend,
    MobileApp,
    DesktopApp,
    SystemsProgram,
    DataScience,
    GameDevelopment,
    EmbeddedSystem,
    Library,
    Documentation,
    Configuration,
    Unknown,
}

/// Outcome of a detection run: the best guess plus supporting evidence.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetectionResult {
    pub language: Language,
    pub confidence: f32,
    pub detection_method: DetectionMethod,
    pub alternatives: Vec<(Language, f32)>,
    pub evidence: Vec<DetectionEvidence>,
}

/// How the winning language was determined.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum DetectionMethod {
    FileExtension,
    Filename,
    Shebang,
    ContentSignature,
    StatisticalAnalysis,
    Hybrid,
}

/// A single signal that contributed to a detection result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetectionEvidence {
    pub evidence_type: EvidenceType,
    pub description: String,
    pub weight: f32,
}

/// Kind of signal a piece of evidence is based on.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum EvidenceType {
    Extension,
    Filename,
    Shebang,
    Keyword,
    Syntax,
    Import,
    Framework,
    BuildSystem,
}

/// Multi-stage language detector combining extension, filename, shebang,
/// content-signature, and AST-based analysis.
pub struct LanguageDetector {
    strategy: DetectionStrategy,
    extension_map: HashMap<String, Vec<(Language, f32)>>,
    filename_patterns: HashMap<String, Language>,
    content_signatures: HashMap<Language, Vec<ContentSignature>>,
    shebang_patterns: HashMap<String, Language>,
    ast_parsers: HashMap<Language, Parser>,
    syntax_analyzers: HashMap<Language, SyntaxAnalyzer>,
}

/// Per-language keywords and AST node heuristics used for statistical analysis.
#[derive(Debug, Clone)]
struct SyntaxAnalyzer {
    language: Language,
    keywords: Vec<String>,
    structural_patterns: Vec<String>,
    confidence_weights: HashMap<String, f32>,
}

/// Tree-sitter grammar constructors for the languages with AST support.
static TS_LANGUAGES: Lazy<HashMap<Language, fn() -> TsLanguage>> = Lazy::new(|| {
    let mut languages = HashMap::new();
    languages.insert(
        Language::Python,
        tree_sitter_python::language as fn() -> TsLanguage,
    );
    languages.insert(
        Language::JavaScript,
        tree_sitter_javascript::language as fn() -> TsLanguage,
    );
    languages.insert(
        Language::TypeScript,
        tree_sitter_typescript::language_typescript as fn() -> TsLanguage,
    );
    languages.insert(
        Language::Rust,
        tree_sitter_rust::language as fn() -> TsLanguage,
    );
    languages.insert(Language::Go, tree_sitter_go::language as fn() -> TsLanguage);
    languages
});

impl Default for DetectionStrategy {
    fn default() -> Self {
        DetectionStrategy::ExtensionWithContent
    }
}

impl Default for LanguageHints {
    fn default() -> Self {
        Self {
            project_type: None,
            build_files: Vec::new(),
            directory_structure: Vec::new(),
            dominant_languages: Vec::new(),
            framework_indicators: Vec::new(),
        }
    }
}

impl LanguageDetector {
    pub fn new() -> Self {
        let mut detector = Self {
            strategy: DetectionStrategy::default(),
            extension_map: HashMap::new(),
            filename_patterns: HashMap::new(),
            content_signatures: HashMap::new(),
            shebang_patterns: HashMap::new(),
            ast_parsers: HashMap::new(),
            syntax_analyzers: HashMap::new(),
        };

        detector.initialize_detection_rules();
        detector
    }

    pub fn with_strategy(strategy: DetectionStrategy) -> Self {
        let mut detector = Self::new();
        detector.strategy = strategy;
        detector
    }

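    /// Resolves a language from the path alone: with `ExtensionOnly` only the
    /// extension map is consulted, otherwise well-known filenames are checked
    /// first. File contents are never read.
    ///
    /// Example (illustrative):
    ///
    /// ```ignore
    /// let detector = LanguageDetector::new();
    /// assert_eq!(detector.detect_language(Path::new("src/main.rs")), Language::Rust);
    /// assert_eq!(detector.detect_language(Path::new("Cargo.toml")), Language::TOML);
    /// ```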
    pub fn detect_language(&self, path: &Path) -> Language {
        match self.strategy {
            DetectionStrategy::ExtensionOnly => self.detect_by_extension(path),
            _ => self.detect_by_extension_and_filename(path),
        }
    }

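    /// Detects a language from both the path and the file contents, following
    /// the configured [`DetectionStrategy`].
    ///
    /// Example (illustrative):
    ///
    /// ```ignore
    /// let mut detector = LanguageDetector::new();
    /// let script = "#!/usr/bin/env python3\nprint('hi')";
    /// let result = detector.detect_language_with_content(Path::new("script"), script);
    /// assert_eq!(result.language, Language::Python);
    /// assert_eq!(result.detection_method, DetectionMethod::Shebang);
    /// ```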
    pub fn detect_language_with_content(&mut self, path: &Path, content: &str) -> DetectionResult {
        match self.strategy {
            DetectionStrategy::ExtensionOnly => {
                let language = self.detect_by_extension(path);
                DetectionResult {
                    language: language.clone(),
                    confidence: if language == Language::Unknown {
                        0.1
                    } else {
                        0.9
                    },
                    detection_method: DetectionMethod::FileExtension,
                    alternatives: vec![],
                    evidence: vec![DetectionEvidence {
                        evidence_type: EvidenceType::Extension,
                        description: format!("File extension: {:?}", path.extension()),
                        weight: 0.9,
                    }],
                }
            }
            DetectionStrategy::ExtensionWithContent => {
                self.detect_with_content_analysis(path, content)
            }
            DetectionStrategy::FullAnalysis => self.detect_with_full_analysis(path, content),
            DetectionStrategy::Custom(ref rules) => {
                let rules = rules.clone();
                self.detect_with_custom_rules(path, content, &rules)
            }
        }
    }

    pub fn detect_with_hints(
        &mut self,
        path: &Path,
        content: &str,
        hints: &LanguageHints,
    ) -> DetectionResult {
        let mut base_result = self.detect_language_with_content(path, content);

        if let Some(project_type) = &hints.project_type {
            base_result = self.apply_project_type_bias(base_result, project_type);
        }

        if !hints.dominant_languages.is_empty() {
            base_result =
                self.apply_dominant_language_bias(base_result, &hints.dominant_languages);
        }

        if !hints.framework_indicators.is_empty() {
            base_result = self.apply_framework_bias(base_result, &hints.framework_indicators);
        }

        base_result
    }

    fn initialize_detection_rules(&mut self) {
        self.initialize_extension_map();
        self.initialize_filename_patterns();
        self.initialize_shebang_patterns();
        self.initialize_content_signatures();
        self.initialize_ast_parsers();
        self.initialize_syntax_analyzers();
    }

    fn initialize_extension_map(&mut self) {
        let extensions = vec![
            ("rs", vec![(Language::Rust, 1.0)]),
            ("py", vec![(Language::Python, 0.95)]),
            ("pyw", vec![(Language::Python, 1.0)]),
            ("pyi", vec![(Language::Python, 1.0)]),
            ("js", vec![(Language::JavaScript, 0.9)]),
            ("jsx", vec![(Language::JavaScript, 1.0)]),
            ("mjs", vec![(Language::JavaScript, 1.0)]),
            ("ts", vec![(Language::TypeScript, 1.0)]),
            ("tsx", vec![(Language::TypeScript, 1.0)]),
            ("java", vec![(Language::Java, 1.0)]),
            ("kt", vec![(Language::Kotlin, 1.0)]),
            ("kts", vec![(Language::Kotlin, 1.0)]),
            ("scala", vec![(Language::Scala, 1.0)]),
            ("sc", vec![(Language::Scala, 0.8)]),
            ("c", vec![(Language::C, 0.9)]),
            ("h", vec![(Language::C, 0.7), (Language::Cpp, 0.3)]),
            ("cpp", vec![(Language::Cpp, 1.0)]),
            ("cxx", vec![(Language::Cpp, 1.0)]),
            ("cc", vec![(Language::Cpp, 1.0)]),
            ("hpp", vec![(Language::Cpp, 1.0)]),
            ("hxx", vec![(Language::Cpp, 1.0)]),
            ("cs", vec![(Language::CSharp, 1.0)]),
            ("go", vec![(Language::Go, 1.0)]),
            ("rb", vec![(Language::Ruby, 1.0)]),
            ("rbw", vec![(Language::Ruby, 1.0)]),
            ("php", vec![(Language::PHP, 1.0)]),
            ("phtml", vec![(Language::PHP, 1.0)]),
            ("swift", vec![(Language::Swift, 1.0)]),
            ("dart", vec![(Language::Dart, 1.0)]),
            ("sh", vec![(Language::Bash, 1.0)]),
            ("bash", vec![(Language::Bash, 1.0)]),
            ("zsh", vec![(Language::Bash, 1.0)]),
            ("fish", vec![(Language::Bash, 1.0)]),
            ("html", vec![(Language::HTML, 1.0)]),
            ("htm", vec![(Language::HTML, 1.0)]),
            ("css", vec![(Language::CSS, 1.0)]),
            ("scss", vec![(Language::SCSS, 1.0)]),
            ("sass", vec![(Language::SASS, 1.0)]),
            ("md", vec![(Language::Markdown, 1.0)]),
            ("markdown", vec![(Language::Markdown, 1.0)]),
            ("xml", vec![(Language::XML, 1.0)]),
            ("json", vec![(Language::JSON, 1.0)]),
            ("yaml", vec![(Language::YAML, 1.0)]),
            ("yml", vec![(Language::YAML, 1.0)]),
            ("toml", vec![(Language::TOML, 1.0)]),
            ("ini", vec![(Language::Unknown, 1.0)]),
            ("cfg", vec![(Language::Unknown, 0.8)]),
            ("conf", vec![(Language::Unknown, 0.7)]),
            ("sql", vec![(Language::SQL, 1.0)]),
            ("rst", vec![(Language::Unknown, 1.0)]),
            ("tex", vec![(Language::Unknown, 1.0)]),
            ("r", vec![(Language::R, 1.0)]),
            ("R", vec![(Language::R, 1.0)]),
            (
                "m",
                vec![(Language::ObjectiveC, 0.6), (Language::Matlab, 0.4)],
            ),
            ("mm", vec![(Language::ObjectiveC, 1.0)]),
            ("pl", vec![(Language::Unknown, 0.8)]),
            ("pm", vec![(Language::Unknown, 1.0)]),
            ("lua", vec![(Language::Unknown, 1.0)]),
            ("vim", vec![(Language::Unknown, 1.0)]),
            ("hs", vec![(Language::Haskell, 1.0)]),
            ("lhs", vec![(Language::Haskell, 1.0)]),
        ];

        for (ext, languages) in extensions {
            self.extension_map.insert(ext.to_string(), languages);
        }
    }

    fn initialize_filename_patterns(&mut self) {
        let patterns = vec![
            ("Makefile", Language::Unknown),
            ("makefile", Language::Unknown),
            ("Dockerfile", Language::Unknown),
            ("dockerfile", Language::Unknown),
            ("Cargo.toml", Language::TOML),
            ("Cargo.lock", Language::TOML),
            ("package.json", Language::JSON),
            ("tsconfig.json", Language::JSON),
            ("pyproject.toml", Language::TOML),
            ("setup.py", Language::Python),
            ("requirements.txt", Language::Unknown),
            ("README", Language::Unknown),
            ("LICENSE", Language::Unknown),
            ("CHANGELOG", Language::Unknown),
            ("CMakeLists.txt", Language::Unknown),
            (".gitignore", Language::Unknown),
            (".dockerignore", Language::Unknown),
            ("Jenkinsfile", Language::Unknown),
            ("build.gradle", Language::Unknown),
            ("pom.xml", Language::XML),
        ];

        for (filename, language) in patterns {
            self.filename_patterns
                .insert(filename.to_string(), language);
        }
    }

    fn initialize_shebang_patterns(&mut self) {
        let patterns = vec![
            ("python", Language::Python),
            ("python3", Language::Python),
            ("python2", Language::Python),
            ("node", Language::JavaScript),
            ("bash", Language::Bash),
            ("sh", Language::Bash),
            ("zsh", Language::Bash),
            ("fish", Language::Bash),
            ("ruby", Language::Ruby),
            ("php", Language::PHP),
            ("env python", Language::Python),
            ("env node", Language::JavaScript),
            ("env bash", Language::Bash),
            ("env ruby", Language::Ruby),
        ];

        for (pattern, language) in patterns {
            self.shebang_patterns.insert(pattern.to_string(), language);
        }
    }

    fn initialize_content_signatures(&mut self) {
        let python_patterns = vec![
            r"def\s+\w+\s*\(",
            r"import\s+\w+",
            r"from\s+\w+\s+import",
            r"class\s+\w+\s*\(",
            r"__\w+__",
        ];
        if let Ok(compiled_patterns) = self.compile_patterns(python_patterns) {
            let python_sigs = vec![ContentSignature {
                language: Language::Python,
                patterns: compiled_patterns,
                weight: 0.9,
                required_matches: 2,
            }];
            self.content_signatures
                .insert(Language::Python, python_sigs);
        }

        let js_patterns = vec![
            r"function\s+\w+\s*\(",
            r"const\s+\w+\s*=",
            r"let\s+\w+\s*=",
            r"=>\s*\{",
            r"require\s*\(",
            r"console\.log\s*\(",
        ];
        if let Ok(compiled_patterns) = self.compile_patterns(js_patterns) {
            let js_sigs = vec![ContentSignature {
                language: Language::JavaScript,
                patterns: compiled_patterns,
                weight: 0.8,
                required_matches: 2,
            }];
            self.content_signatures
                .insert(Language::JavaScript, js_sigs);
        }

        let rust_patterns = vec![
            r"fn\s+\w+\s*\(",
            r"use\s+[\w:]+",
            r"struct\s+\w+",
            r"impl\s+[\w<>]+",
            r"let\s+mut\s+\w+",
            r"match\s+\w+\s*\{",
        ];
        if let Ok(compiled_patterns) = self.compile_patterns(rust_patterns) {
            let rust_sigs = vec![ContentSignature {
                language: Language::Rust,
                patterns: compiled_patterns,
                weight: 0.95,
                required_matches: 2,
            }];
            self.content_signatures.insert(Language::Rust, rust_sigs);
        }
    }

    fn compile_patterns(&self, patterns: Vec<&str>) -> Result<Vec<Regex>> {
        let mut compiled = Vec::new();
        for pattern in patterns {
            match Regex::new(pattern) {
                Ok(regex) => compiled.push(regex),
                Err(e) => {
                    log::warn!("Failed to compile regex pattern '{}': {}", pattern, e);
                    return Err(ScribeError::pattern(
                        format!("Failed to compile regex pattern: {}", e),
                        pattern.to_string(),
                    ));
                }
            }
        }
        Ok(compiled)
    }

    fn initialize_ast_parsers(&mut self) {
        for (language, ts_lang_fn) in TS_LANGUAGES.iter() {
            let mut parser = Parser::new();
            if parser.set_language(ts_lang_fn()).is_ok() {
                self.ast_parsers.insert(language.clone(), parser);
            }
        }
    }

    fn initialize_syntax_analyzers(&mut self) {
        let python_analyzer = SyntaxAnalyzer {
            language: Language::Python,
            keywords: vec![
                "def".to_string(),
                "class".to_string(),
                "import".to_string(),
                "from".to_string(),
                "if".to_string(),
                "elif".to_string(),
            ],
            structural_patterns: vec![
                "function_definition".to_string(),
                "class_definition".to_string(),
                "import_statement".to_string(),
                "import_from_statement".to_string(),
            ],
            confidence_weights: HashMap::from([
                ("function_definition".to_string(), 0.9),
                ("class_definition".to_string(), 0.9),
                ("import_statement".to_string(), 0.8),
            ]),
        };
        self.syntax_analyzers
            .insert(Language::Python, python_analyzer);

        let js_analyzer = SyntaxAnalyzer {
            language: Language::JavaScript,
            keywords: vec![
                "function".to_string(),
                "class".to_string(),
                "import".to_string(),
                "const".to_string(),
                "let".to_string(),
                "var".to_string(),
            ],
            structural_patterns: vec![
                "function_declaration".to_string(),
                "class_declaration".to_string(),
                "import_statement".to_string(),
                "variable_declaration".to_string(),
            ],
            confidence_weights: HashMap::from([
                ("function_declaration".to_string(), 0.9),
                ("class_declaration".to_string(), 0.9),
                ("import_statement".to_string(), 0.8),
            ]),
        };
        self.syntax_analyzers
            .insert(Language::JavaScript, js_analyzer);

        let rust_analyzer = SyntaxAnalyzer {
            language: Language::Rust,
            keywords: vec![
                "fn".to_string(),
                "struct".to_string(),
                "enum".to_string(),
                "impl".to_string(),
                "use".to_string(),
                "mod".to_string(),
            ],
            structural_patterns: vec![
                "function_item".to_string(),
                "struct_item".to_string(),
                "enum_item".to_string(),
                "use_declaration".to_string(),
            ],
            confidence_weights: HashMap::from([
                ("function_item".to_string(), 0.9),
                ("struct_item".to_string(), 0.9),
                ("use_declaration".to_string(), 0.8),
            ]),
        };
        self.syntax_analyzers.insert(Language::Rust, rust_analyzer);
    }

    fn detect_by_extension(&self, path: &Path) -> Language {
        if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
            if let Some(languages) = self.extension_map.get(&extension.to_lowercase()) {
                // Entries are ordered by weight, so the first one is the best match.
                return languages[0].0.clone();
            }
        }

        Language::Unknown
    }

    fn detect_by_extension_and_filename(&self, path: &Path) -> Language {
        if let Some(filename) = path.file_name().and_then(|name| name.to_str()) {
            if let Some(language) = self.filename_patterns.get(filename) {
                return language.clone();
            }
        }

        self.detect_by_extension(path)
    }

    fn detect_with_content_analysis(&mut self, path: &Path, content: &str) -> DetectionResult {
        let mut candidates = Vec::new();
        let mut evidence = Vec::new();

        // Extension and well-known filename signals.
        let extension_lang = self.detect_by_extension_and_filename(path);
        if extension_lang != Language::Unknown {
            candidates.push((extension_lang.clone(), 0.8));
            evidence.push(DetectionEvidence {
                evidence_type: EvidenceType::Extension,
                description: format!("File extension suggests: {:?}", extension_lang),
                weight: 0.8,
            });

            // Fast path: trust unambiguous extensions when the content looks plausible.
            if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
                let confident_extensions = ["rs", "py", "js", "ts", "go", "java", "cpp", "c"];
                if confident_extensions.contains(&ext) {
                    if self.quick_content_validation(&extension_lang, content) {
                        return DetectionResult {
                            language: extension_lang,
                            confidence: 0.95,
                            detection_method: DetectionMethod::FileExtension,
                            alternatives: vec![],
                            evidence,
                        };
                    }
                }
            }
        }

        // Shebang line.
        if let Some(shebang_lang) = self.detect_by_shebang(content) {
            candidates.push((shebang_lang.clone(), 0.95));
            evidence.push(DetectionEvidence {
                evidence_type: EvidenceType::Shebang,
                description: format!("Shebang indicates: {:?}", shebang_lang),
                weight: 0.95,
            });
        }

        // Regex-based content signatures.
        let signature_results =
            self.analyze_content_signatures_optimized(content, &extension_lang);
        for (lang, confidence) in signature_results {
            candidates.push((lang.clone(), confidence));
            evidence.push(DetectionEvidence {
                evidence_type: EvidenceType::Syntax,
                description: format!("Content signatures match: {:?}", lang),
                weight: confidence,
            });
        }

        // Only fall back to AST-based import analysis when confidence is still low.
        let max_confidence = candidates.iter().map(|(_, c)| *c).fold(0.0f32, f32::max);
        if max_confidence < 0.8 {
            let import_results = self.analyze_import_patterns(content);
            for (lang, confidence) in import_results {
                candidates.push((lang.clone(), confidence));
                evidence.push(DetectionEvidence {
                    evidence_type: EvidenceType::Import,
                    description: format!("Import patterns match: {:?}", lang),
                    weight: confidence,
                });
            }
        }

        self.aggregate_detection_results(candidates, evidence)
    }

    fn quick_content_validation(&self, language: &Language, content: &str) -> bool {
        match language {
            Language::Rust => {
                content.contains("fn ") || content.contains("use ") || content.contains("struct ")
            }
            Language::Python => {
                content.contains("def ")
                    || content.contains("import ")
                    || content.contains("class ")
            }
            Language::JavaScript => {
                content.contains("function ")
                    || content.contains("const ")
                    || content.contains("var ")
            }
            Language::TypeScript => {
                content.contains("interface ")
                    || content.contains("type ")
                    || content.contains(": ")
            }
            Language::Go => {
                content.contains("func ")
                    || content.contains("package ")
                    || content.contains("import ")
            }
            Language::Java => {
                content.contains("class ")
                    || content.contains("public ")
                    || content.contains("import ")
            }
            Language::C => {
                content.contains("#include")
                    || content.contains("int main")
                    || content.contains("void ")
            }
            Language::Cpp => {
                content.contains("#include")
                    || content.contains("class ")
                    || content.contains("namespace ")
            }
            // For languages without quick checks, trust the extension.
            _ => true,
        }
    }

    fn analyze_content_signatures_optimized(
        &self,
        content: &str,
        extension_lang: &Language,
    ) -> Vec<(Language, f32)> {
        let mut results = Vec::new();

        // Check the extension-suggested language first.
        if *extension_lang != Language::Unknown {
            if let Some(signatures) = self.content_signatures.get(extension_lang) {
                for signature in signatures {
                    let matches = self.count_signature_matches(signature, content);
                    if matches >= signature.required_matches {
                        let confidence =
                            (matches as f32 / signature.patterns.len() as f32) * signature.weight;
                        results.push((extension_lang.clone(), confidence));

                        // Strong agreement with the extension: skip the other languages.
                        if confidence > 0.7 {
                            return results;
                        }
                    }
                }
            }
        }

        for (language, signatures) in &self.content_signatures {
            if *language == *extension_lang {
                continue; // Already checked above.
            }

            for signature in signatures {
                let matches = self.count_signature_matches(signature, content);
                if matches >= signature.required_matches {
                    let confidence =
                        (matches as f32 / signature.patterns.len() as f32) * signature.weight;
                    results.push((language.clone(), confidence));
                }
            }
        }

        results
    }

    fn count_signature_matches(&self, signature: &ContentSignature, content: &str) -> usize {
        signature
            .patterns
            .iter()
            .map(|regex| regex.find_iter(content).count())
            .sum::<usize>()
    }

    fn detect_with_full_analysis(&mut self, path: &Path, content: &str) -> DetectionResult {
        let mut base_result = self.detect_with_content_analysis(path, content);

        let statistical_results = self.statistical_analysis(content);
        for (lang, confidence) in statistical_results {
            base_result.alternatives.push((lang, confidence));
        }

        base_result
            .alternatives
            .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());

        base_result
    }

    fn detect_with_custom_rules(
        &mut self,
        path: &Path,
        content: &str,
        rules: &CustomDetectionRules,
    ) -> DetectionResult {
        let mut candidates = Vec::new();
        let mut evidence = Vec::new();

        if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
            if let Some(language) = rules.extension_overrides.get(&extension.to_lowercase()) {
                candidates.push((language.clone(), 1.0));
                evidence.push(DetectionEvidence {
                    evidence_type: EvidenceType::Extension,
                    description: format!("Custom extension rule: {} -> {:?}", extension, language),
                    weight: 1.0,
                });
            }
        }

        if let Some(filename) = path.file_name().and_then(|name| name.to_str()) {
            if let Some(language) = rules.filename_patterns.get(filename) {
                candidates.push((language.clone(), 1.0));
                evidence.push(DetectionEvidence {
                    evidence_type: EvidenceType::Filename,
                    description: format!("Custom filename rule: {} -> {:?}", filename, language),
                    weight: 1.0,
                });
            }
        }

        for signature_config in &rules.content_signatures {
            let matches = signature_config
                .patterns
                .iter()
                .map(|pattern| {
                    // Fall back to a literal substring count if the pattern is not a valid regex.
                    match Regex::new(pattern) {
                        Ok(regex) => regex.find_iter(content).count(),
                        Err(_) => content.matches(pattern).count(),
                    }
                })
                .sum::<usize>();

            if matches >= signature_config.required_matches {
                candidates.push((signature_config.language.clone(), signature_config.weight));
                evidence.push(DetectionEvidence {
                    evidence_type: EvidenceType::Syntax,
                    description: format!(
                        "Custom signature matches for {:?}: {}",
                        signature_config.language, matches
                    ),
                    weight: signature_config.weight,
                });
            }
        }

        // No custom rule matched: fall back to the standard content analysis.
        if candidates.is_empty() {
            return self.detect_with_content_analysis(path, content);
        }

        self.aggregate_detection_results(candidates, evidence)
    }

    fn detect_by_shebang(&self, content: &str) -> Option<Language> {
        let lines: Vec<&str> = content.lines().collect();
        if lines.is_empty() {
            return None;
        }

        let first_line = lines[0];
        if first_line.starts_with("#!") {
            let shebang_path = first_line[2..].trim();

            for (pattern, language) in &self.shebang_patterns {
                if shebang_path.contains(pattern) {
                    return Some(language.clone());
                }
            }
        }

        None
    }

    fn analyze_content_signatures(&self, content: &str) -> Vec<(Language, f32)> {
        let mut results = Vec::new();

        for (language, signatures) in &self.content_signatures {
            for signature in signatures {
                let matches = signature
                    .patterns
                    .iter()
                    .map(|pattern| pattern.find_iter(content).count())
                    .sum::<usize>();

                if matches >= signature.required_matches {
                    let confidence =
                        (matches as f32 / signature.patterns.len() as f32) * signature.weight;
                    results.push((language.clone(), confidence));
                }
            }
        }

        results
    }

    fn analyze_import_patterns(&mut self, content: &str) -> Vec<(Language, f32)> {
        let mut results = Vec::new();

        // Only parse the languages the content plausibly matches.
        let likely_languages = self.get_likely_languages_from_content(content);

        for language in likely_languages {
            if let Some(parser) = self.ast_parsers.get_mut(&language) {
                if let Some(tree) = parser.parse(content, None) {
                    let root_node = tree.root_node();
                    let import_count = self.count_import_nodes(&root_node, &language);

                    if import_count > 0 {
                        // Cap the contribution of import counts at 0.9.
                        let confidence = (import_count as f32 / 10.0).min(0.9);
                        results.push((language, confidence));

                        // Stop early once a strong match is found.
                        if confidence > 0.7 {
                            break;
                        }
                    }
                }
            }
        }

        results
    }

    fn get_likely_languages_from_content(&self, content: &str) -> Vec<Language> {
        let mut likely_languages = Vec::new();

        if content.contains("def ") || content.contains("import ") || content.contains("from ") {
            likely_languages.push(Language::Python);
        }
        if content.contains("fn ") || content.contains("use ") || content.contains("struct ") {
            likely_languages.push(Language::Rust);
        }
        if content.contains("function ") || content.contains("const ") || content.contains("let ") {
            likely_languages.push(Language::JavaScript);
        }
        if content.contains("interface ")
            || content.contains("type ")
            || content.contains(": string")
        {
            likely_languages.push(Language::TypeScript);
        }
        if content.contains("func ") || content.contains("package ") {
            likely_languages.push(Language::Go);
        }

        // No keyword hits: try the most common languages instead.
        if likely_languages.is_empty() {
            likely_languages = vec![
                Language::JavaScript,
                Language::Python,
                Language::TypeScript,
                Language::Rust,
                Language::Go,
            ];
        }

        likely_languages
    }

    fn statistical_analysis(&mut self, content: &str) -> Vec<(Language, f32)> {
        let mut results = Vec::new();

        let likely_languages = self.get_likely_languages_from_content(content);

        for language in likely_languages {
            if let Some(analyzer) = self.syntax_analyzers.get(&language) {
                if let Some(parser) = self.ast_parsers.get_mut(&language) {
                    if let Some(tree) = parser.parse(content, None) {
                        let root_node = tree.root_node();
                        let structural_score =
                            self.calculate_structural_score(&root_node, analyzer);

                        if structural_score > 0.0 {
                            results.push((language, structural_score));

                            if structural_score > 0.8 {
                                break;
                            }
                        }
                    }
                }
            }
        }

        results
    }

    fn count_import_nodes(&self, node: &Node, language: &Language) -> usize {
        let mut count = 0;
        let import_types: &[&str] = match language {
            Language::Python => &["import_statement", "import_from_statement"],
            Language::JavaScript | Language::TypeScript => {
                &["import_statement", "import_declaration"]
            }
            Language::Rust => &["use_declaration"],
            Language::Go => &["import_spec", "import_declaration"],
            Language::Java => &["import_declaration"],
            _ => &[],
        };

        self.count_nodes_recursive(node, import_types, &mut count);
        count
    }

    fn calculate_structural_score(&self, node: &Node, analyzer: &SyntaxAnalyzer) -> f32 {
        let mut score = 0.0;

        for pattern in &analyzer.structural_patterns {
            let count = self.count_specific_nodes(node, pattern);
            if count > 0 {
                let weight = analyzer.confidence_weights.get(pattern).unwrap_or(&0.5);
                score += (count as f32) * weight;
            }
        }

        // Normalize the raw score into the 0.0..=1.0 range.
        (score / 10.0).min(1.0)
    }

    fn count_nodes_recursive(&self, node: &Node, target_types: &[&str], count: &mut usize) {
        if target_types.contains(&node.kind()) {
            *count += 1;
        }

        for i in 0..node.child_count() {
            if let Some(child) = node.child(i) {
                self.count_nodes_recursive(&child, target_types, count);
            }
        }
    }

    fn count_specific_nodes(&self, node: &Node, target_type: &str) -> usize {
        let mut count = 0;
        self.count_nodes_recursive(node, &[target_type], &mut count);
        count
    }

    fn aggregate_detection_results(
        &self,
        candidates: Vec<(Language, f32)>,
        evidence: Vec<DetectionEvidence>,
    ) -> DetectionResult {
        if candidates.is_empty() {
            return DetectionResult {
                language: Language::Unknown,
                confidence: 0.0,
                detection_method: DetectionMethod::FileExtension,
                alternatives: vec![],
                evidence,
            };
        }

        // Sum the confidence contributed by each candidate per language.
        let mut language_scores: HashMap<Language, f32> = HashMap::new();
        for (lang, confidence) in &candidates {
            *language_scores.entry(lang.clone()).or_insert(0.0) += confidence;
        }

        let (best_language, best_confidence) = language_scores
            .iter()
            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
            .map(|(lang, conf)| (lang.clone(), *conf))
            .unwrap_or((Language::Unknown, 0.0));

        let normalized_confidence = best_confidence.min(1.0);

        let mut alternatives: Vec<(Language, f32)> = language_scores
            .into_iter()
            .filter(|(lang, _)| *lang != best_language)
            .collect();
        alternatives.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());

        // Report the strongest kind of evidence as the detection method.
        let detection_method = if evidence
            .iter()
            .any(|e| e.evidence_type == EvidenceType::Shebang)
        {
            DetectionMethod::Shebang
        } else if evidence
            .iter()
            .any(|e| e.evidence_type == EvidenceType::Syntax)
        {
            DetectionMethod::ContentSignature
        } else if evidence
            .iter()
            .any(|e| e.evidence_type == EvidenceType::Extension)
        {
            DetectionMethod::FileExtension
        } else {
            DetectionMethod::Hybrid
        };

        DetectionResult {
            language: best_language,
            confidence: normalized_confidence,
            detection_method,
            alternatives,
            evidence,
        }
    }

    fn apply_project_type_bias(
        &self,
        mut result: DetectionResult,
        project_type: &ProjectType,
    ) -> DetectionResult {
        let bias_factor = 0.25;

        match project_type {
            ProjectType::WebFrontend => {
                if matches!(
                    result.language,
                    Language::JavaScript | Language::TypeScript | Language::HTML | Language::CSS
                ) {
                    result.confidence += bias_factor;
                }
            }
            ProjectType::WebBackend => {
                if matches!(
                    result.language,
                    Language::Python
                        | Language::JavaScript
                        | Language::TypeScript
                        | Language::Java
                        | Language::Go
                        | Language::Rust
                ) {
                    result.confidence += bias_factor;
                }
            }
            ProjectType::SystemsProgram => {
                if matches!(
                    result.language,
                    Language::Rust | Language::C | Language::Cpp | Language::Go
                ) {
                    result.confidence += bias_factor;
                }
            }
            ProjectType::DataScience => {
                if matches!(
                    result.language,
                    Language::Python | Language::R | Language::SQL
                ) {
                    result.confidence += bias_factor;
                }
            }
            _ => {}
        }

        result.confidence = result.confidence.min(1.0);
        result
    }

    fn apply_dominant_language_bias(
        &self,
        mut result: DetectionResult,
        dominant_languages: &[Language],
    ) -> DetectionResult {
        if dominant_languages.contains(&result.language) {
            result.confidence += 0.15;
            result.confidence = result.confidence.min(1.0);
        }
        result
    }

    fn apply_framework_bias(
        &self,
        mut result: DetectionResult,
        framework_indicators: &[String],
    ) -> DetectionResult {
        for indicator in framework_indicators {
            match indicator.as_str() {
                "package.json" | "node_modules" => {
                    if matches!(result.language, Language::JavaScript | Language::TypeScript) {
                        result.confidence += 0.1;
                    }
                }
                "Cargo.toml" | "Cargo.lock" => {
                    if result.language == Language::Rust {
                        result.confidence += 0.1;
                    }
                }
                "requirements.txt" | "__pycache__" | ".pyc" => {
                    if result.language == Language::Python {
                        result.confidence += 0.1;
                    }
                }
                _ => {}
            }
        }

        result.confidence = result.confidence.min(1.0);
        result
    }
}

impl Default for LanguageDetector {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extension_detection() {
        let detector = LanguageDetector::new();

        assert_eq!(
            detector.detect_language(Path::new("test.rs")),
            Language::Rust
        );
        assert_eq!(
            detector.detect_language(Path::new("test.py")),
            Language::Python
        );
        assert_eq!(
            detector.detect_language(Path::new("test.js")),
            Language::JavaScript
        );
        assert_eq!(
            detector.detect_language(Path::new("test.ts")),
            Language::TypeScript
        );
        assert_eq!(
            detector.detect_language(Path::new("test.java")),
            Language::Java
        );
        assert_eq!(detector.detect_language(Path::new("test.go")), Language::Go);
        assert_eq!(
            detector.detect_language(Path::new("test.cpp")),
            Language::Cpp
        );
        assert_eq!(detector.detect_language(Path::new("test.c")), Language::C);
    }

    #[test]
    fn test_rust_files_are_programming() {
        let detector = LanguageDetector::new();

        let rust_files = [
            "src/lib.rs",
            "scribe-rs/src/lib.rs",
            "scribe-rs/scribe-core/src/lib.rs",
            "main.rs",
            "mod.rs",
        ];

        for file_path in &rust_files {
            let language = detector.detect_language(Path::new(file_path));
            assert_eq!(language, Language::Rust, "Failed for file: {}", file_path);
            assert!(
                language.is_programming(),
                "Rust should be a programming language for file: {}",
                file_path
            );
        }
    }

    #[test]
    fn test_filename_patterns() {
        let detector = LanguageDetector::new();

        assert_eq!(
            detector.detect_language(Path::new("Makefile")),
            Language::Unknown
        );
        assert_eq!(
            detector.detect_language(Path::new("Dockerfile")),
            Language::Unknown
        );
        assert_eq!(
            detector.detect_language(Path::new("Cargo.toml")),
            Language::TOML
        );
        assert_eq!(
            detector.detect_language(Path::new("package.json")),
            Language::JSON
        );
    }
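
    // Illustrative addition: ambiguous extensions fall back to the highest-weighted
    // entry in the extension map, so ".h" resolves to C rather than C++.
    #[test]
    fn test_ambiguous_header_extension_defaults_to_c() {
        let detector = LanguageDetector::new();

        assert_eq!(detector.detect_language(Path::new("test.h")), Language::C);
        assert_eq!(
            detector.detect_language(Path::new("test.hpp")),
            Language::Cpp
        );
    }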

    #[test]
    fn test_shebang_detection() {
        let mut detector = LanguageDetector::new();

        let python_script = "#!/usr/bin/env python3\nprint('Hello, world!')";
        let result = detector.detect_language_with_content(Path::new("script"), python_script);
        assert_eq!(result.language, Language::Python);
        assert!(result.confidence > 0.9);
        assert_eq!(result.detection_method, DetectionMethod::Shebang);

        let bash_script = "#!/bin/bash\necho 'Hello, world!'";
        let result = detector.detect_language_with_content(Path::new("script"), bash_script);
        assert_eq!(result.language, Language::Bash);
        assert!(result.confidence > 0.9);
    }

    #[test]
    fn test_content_signature_detection() {
        let mut detector = LanguageDetector::new();

        let python_code = r#"
def hello_world():
    print("Hello, world!")

class MyClass:
    def __init__(self):
        pass

import sys
from collections import defaultdict
        "#;

        let result = detector.detect_language_with_content(Path::new("unknown"), python_code);
        assert_eq!(result.language, Language::Python);
        assert!(result.confidence > 0.5);

        let rust_code = r#"
fn main() {
    println!("Hello, world!");
}

struct MyStruct {
    field: i32,
}

impl MyStruct {
    fn new() -> Self {
        MyStruct { field: 0 }
    }
}

use std::collections::HashMap;
        "#;

        let result = detector.detect_language_with_content(Path::new("unknown"), rust_code);
        assert_eq!(result.language, Language::Rust);
        assert!(result.confidence > 0.5);
    }

    #[test]
    fn test_import_pattern_detection() {
        let mut detector = LanguageDetector::new();

        let js_code = r#"
import React from 'react';
import { useState } from 'react';
const fs = require('fs');
        "#;

        let result = detector.detect_language_with_content(Path::new("unknown"), js_code);
        assert_eq!(result.language, Language::JavaScript);

        let python_code = r#"
import os
import sys
from collections import defaultdict, Counter
        "#;

        let result = detector.detect_language_with_content(Path::new("unknown"), python_code);
        assert_eq!(result.language, Language::Python);
    }

    #[test]
    fn test_hybrid_detection() {
        let mut detector = LanguageDetector::new();

        // Extension, shebang, and content all agree on Python.
        let python_code = "#!/usr/bin/env python\nprint('Hello')\n# Python comment";
        let result = detector.detect_language_with_content(Path::new("test.py"), python_code);
        assert_eq!(result.language, Language::Python);
        assert!(result.confidence > 0.6);
        assert!(result.evidence.len() > 1);

        // Conflicting signals: Python code in a .js file; either answer is acceptable.
        let python_code = "def hello(): print('Hello')";
        let result = detector.detect_language_with_content(Path::new("test.js"), python_code);
        assert!(result.language == Language::Python || result.language == Language::JavaScript);
    }

    #[test]
    fn test_detection_with_hints() {
        let mut detector = LanguageDetector::new();

        let hints = LanguageHints {
            project_type: Some(ProjectType::WebFrontend),
            dominant_languages: vec![Language::TypeScript],
            framework_indicators: vec!["package.json".to_string()],
            ..Default::default()
        };

        let ts_code = "const hello = () => console.log('Hello');";
        let result = detector.detect_with_hints(Path::new("unknown"), ts_code, &hints);

        // Content signatures classify the snippet as JavaScript; the hints only
        // adjust confidence, they never override the detected language.
        assert_eq!(result.language, Language::JavaScript);
        assert!(result.confidence > 0.5);
    }

    #[test]
    fn test_custom_detection_rules() {
        let mut custom_rules = CustomDetectionRules {
            extension_overrides: HashMap::new(),
            filename_patterns: HashMap::new(),
            content_signatures: vec![],
            priority_languages: vec![],
        };

        custom_rules
            .extension_overrides
            .insert("myext".to_string(), Language::Rust);

        let mut detector =
            LanguageDetector::with_strategy(DetectionStrategy::Custom(custom_rules));

        let result =
            detector.detect_language_with_content(Path::new("test.myext"), "some content");
        assert_eq!(result.language, Language::Rust);
        assert_eq!(result.confidence, 1.0);
    }
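
    // Illustrative addition: the ExtensionOnly strategy skips content analysis
    // entirely and reports a fixed 0.9 confidence for any known extension.
    #[test]
    fn test_extension_only_strategy() {
        let mut detector = LanguageDetector::with_strategy(DetectionStrategy::ExtensionOnly);

        let result =
            detector.detect_language_with_content(Path::new("test.rs"), "no rust code here");
        assert_eq!(result.language, Language::Rust);
        assert_eq!(result.detection_method, DetectionMethod::FileExtension);
        assert!((result.confidence - 0.9).abs() < f32::EPSILON);
        assert!(result.alternatives.is_empty());
    }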

    #[test]
    fn test_detection_evidence() {
        let mut detector = LanguageDetector::new();

        let python_code = "#!/usr/bin/env python\nprint('Hello World')";
        let result = detector.detect_language_with_content(Path::new("test.py"), python_code);

        // Both the extension and the shebang should be recorded as evidence.
        assert!(result.evidence.len() >= 2);
        assert!(result
            .evidence
            .iter()
            .any(|e| e.evidence_type == EvidenceType::Shebang));
        assert!(result
            .evidence
            .iter()
            .any(|e| e.evidence_type == EvidenceType::Extension));
    }

    #[test]
    fn test_confidence_scoring() {
        let mut detector = LanguageDetector::new();

        // Strong indicators: matching extension, shebang, and content.
        let strong_python = "#!/usr/bin/env python3\nimport os\ndef main(): pass\nclass Test: pass";
        let result = detector.detect_language_with_content(Path::new("test.py"), strong_python);
        assert!(result.confidence > 0.8);

        // Weak indicators: unknown extension and no recognizable content.
        let weak_indicators = "hello world";
        let result = detector.detect_language_with_content(Path::new("test.txt"), weak_indicators);
        assert!(result.confidence < 0.8);
    }
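
    // Illustrative addition: FullAnalysis runs the content pipeline and then
    // appends AST-based statistical scores as ranked alternatives.
    #[test]
    fn test_full_analysis_strategy() {
        let mut detector = LanguageDetector::with_strategy(DetectionStrategy::FullAnalysis);

        let rust_code = "use std::fmt;\n\nfn main() {\n    println!(\"hi\");\n}\n";
        let result = detector.detect_language_with_content(Path::new("main.rs"), rust_code);
        assert_eq!(result.language, Language::Rust);
        assert!(result.confidence > 0.8);
    }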

    #[test]
    fn test_alternatives_ranking() {
        let mut detector = LanguageDetector::new();

        let ambiguous_code = "print hello";
        let result = detector.detect_language_with_content(Path::new("unknown"), ambiguous_code);

        // Alternatives, when present, must be sorted by confidence, highest first.
        if result.alternatives.len() > 1 {
            assert!(result.alternatives[0].1 >= result.alternatives[1].1);
        }
    }
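
    // Illustrative addition: an empty `LanguageHints::default()` applies no bias,
    // so hinted detection matches the unhinted result.
    #[test]
    fn test_default_hints_are_neutral() {
        let mut detector = LanguageDetector::new();
        let code = "fn main() {}\n";

        let unhinted = detector.detect_language_with_content(Path::new("main.rs"), code);
        let hinted =
            detector.detect_with_hints(Path::new("main.rs"), code, &LanguageHints::default());

        assert_eq!(hinted.language, unhinted.language);
        assert!((hinted.confidence - unhinted.confidence).abs() < f32::EPSILON);
    }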
}