use scribe_core::{Language, Result};
use std::path::Path;
use std::collections::HashMap;
use once_cell::sync::Lazy;
use serde::{Serialize, Deserialize};
use tree_sitter::{Parser, Language as TsLanguage, Node};
use regex::Regex;

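/// Strategy controlling how much work the detector does per file, from a cheap
/// extension lookup up to full content and AST analysis, or fully custom rules.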
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum DetectionStrategy {
    ExtensionOnly,
    ExtensionWithContent,
    FullAnalysis,
    Custom(CustomDetectionRules),
}

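/// User-supplied overrides applied when `DetectionStrategy::Custom` is selected:
/// extension and filename mappings plus weighted content signatures.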
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CustomDetectionRules {
    pub extension_overrides: HashMap<String, Language>,
    pub filename_patterns: HashMap<String, Language>,
    pub content_signatures: Vec<ContentSignature>,
    pub priority_languages: Vec<Language>,
}

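/// A set of regex patterns associated with a language. The signature fires once
/// the total number of pattern hits reaches `required_matches`, contributing a
/// score scaled by `weight`.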
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentSignature {
    pub language: Language,
    pub patterns: Vec<String>,
    pub weight: f32,
    pub required_matches: usize,
}

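/// Project-level context (build files, directory layout, dominant languages,
/// framework markers) used to bias per-file detection results.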
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LanguageHints {
    pub project_type: Option<ProjectType>,
    pub build_files: Vec<String>,
    pub directory_structure: Vec<String>,
    pub dominant_languages: Vec<Language>,
    pub framework_indicators: Vec<String>,
}

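/// Broad project categories used by `LanguageHints` to bias detection toward
/// languages typical of that kind of project.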
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ProjectType {
    WebFrontend,
    WebBackend,
    MobileApp,
    DesktopApp,
    SystemsProgram,
    DataScience,
    GameDevelopment,
    EmbeddedSystem,
    Library,
    Documentation,
    Configuration,
    Unknown,
}

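/// Outcome of a detection run: the best language guess, a confidence score in
/// `0.0..=1.0`, the method that decided it, ranked alternatives, and the
/// evidence collected along the way.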
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetectionResult {
    pub language: Language,
    pub confidence: f32,
    pub detection_method: DetectionMethod,
    pub alternatives: Vec<(Language, f32)>,
    pub evidence: Vec<DetectionEvidence>,
}

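/// Which kind of signal ultimately determined the detected language.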
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum DetectionMethod {
    FileExtension,
    Filename,
    Shebang,
    ContentSignature,
    StatisticalAnalysis,
    Hybrid,
}

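/// A single piece of evidence gathered during detection, with a weight
/// reflecting how strongly it points at the chosen language.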
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetectionEvidence {
    pub evidence_type: EvidenceType,
    pub description: String,
    pub weight: f32,
}

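/// Category of a piece of detection evidence.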
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum EvidenceType {
    Extension,
    Filename,
    Shebang,
    Keyword,
    Syntax,
    Import,
    Framework,
    BuildSystem,
}

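/// Multi-stage language detector combining extension and filename lookup,
/// shebang inspection, regex content signatures, and tree-sitter based analysis.
///
/// A minimal usage sketch (assumes `LanguageDetector`, `DetectionStrategy`,
/// `Language`, and `Path` are in scope; exact paths depend on the crate layout):
///
/// ```ignore
/// let mut detector = LanguageDetector::with_strategy(DetectionStrategy::FullAnalysis);
/// let result = detector.detect_language_with_content(
///     Path::new("script"),
///     "#!/usr/bin/env python3\nimport os\n",
/// );
/// assert_eq!(result.language, Language::Python);
/// ```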
pub struct LanguageDetector {
    strategy: DetectionStrategy,
    extension_map: HashMap<String, Vec<(Language, f32)>>,
    filename_patterns: HashMap<String, Language>,
    content_signatures: HashMap<Language, Vec<ContentSignature>>,
    shebang_patterns: HashMap<String, Language>,
    ast_parsers: HashMap<Language, Parser>,
    syntax_analyzers: HashMap<Language, SyntaxAnalyzer>,
}

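/// Per-language keyword and tree-sitter node-kind lists used for statistical
/// scoring during full analysis.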
#[derive(Debug, Clone)]
struct SyntaxAnalyzer {
    language: Language,
    keywords: Vec<String>,
    structural_patterns: Vec<String>,
    confidence_weights: HashMap<String, f32>,
}

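/// Lazily-built table of tree-sitter grammar constructors for the languages
/// that support AST-based analysis.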
static TS_LANGUAGES: Lazy<HashMap<Language, fn() -> TsLanguage>> = Lazy::new(|| {
    let mut languages = HashMap::new();
    languages.insert(Language::Python, tree_sitter_python::language as fn() -> TsLanguage);
    languages.insert(Language::JavaScript, tree_sitter_javascript::language as fn() -> TsLanguage);
    languages.insert(Language::TypeScript, tree_sitter_typescript::language_typescript as fn() -> TsLanguage);
    languages.insert(Language::Rust, tree_sitter_rust::language as fn() -> TsLanguage);
    languages.insert(Language::Go, tree_sitter_go::language as fn() -> TsLanguage);
    languages
});

impl Default for DetectionStrategy {
    fn default() -> Self {
        DetectionStrategy::ExtensionWithContent
    }
}

impl Default for LanguageHints {
    fn default() -> Self {
        Self {
            project_type: None,
            build_files: Vec::new(),
            directory_structure: Vec::new(),
            dominant_languages: Vec::new(),
            framework_indicators: Vec::new(),
        }
    }
}

impl LanguageDetector {
    pub fn new() -> Self {
        let mut detector = Self {
            strategy: DetectionStrategy::default(),
            extension_map: HashMap::new(),
            filename_patterns: HashMap::new(),
            content_signatures: HashMap::new(),
            shebang_patterns: HashMap::new(),
            ast_parsers: HashMap::new(),
            syntax_analyzers: HashMap::new(),
        };

        detector.initialize_detection_rules();
        detector
    }

    pub fn with_strategy(strategy: DetectionStrategy) -> Self {
        let mut detector = Self::new();
        detector.strategy = strategy;
        detector
    }

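    /// Fast, path-only detection. Content is never read; strategies other than
    /// `ExtensionOnly` fall back to filename-pattern plus extension lookup here.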
    pub fn detect_language(&self, path: &Path) -> Language {
        match self.strategy {
            DetectionStrategy::ExtensionOnly => {
                self.detect_by_extension(path)
            }
            _ => {
                self.detect_by_extension_and_filename(path)
            }
        }
    }

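    /// Detect the language of `path` using both the path and the file `content`,
    /// dispatching to the configured `DetectionStrategy`.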
    pub fn detect_language_with_content(&mut self, path: &Path, content: &str) -> DetectionResult {
        match self.strategy {
            DetectionStrategy::ExtensionOnly => {
                let language = self.detect_by_extension(path);
                DetectionResult {
                    language: language.clone(),
                    confidence: if language == Language::Unknown { 0.1 } else { 0.9 },
                    detection_method: DetectionMethod::FileExtension,
                    alternatives: vec![],
                    evidence: vec![DetectionEvidence {
                        evidence_type: EvidenceType::Extension,
                        description: format!("File extension: {:?}", path.extension()),
                        weight: 0.9,
                    }],
                }
            }
            DetectionStrategy::ExtensionWithContent => {
                self.detect_with_content_analysis(path, content)
            }
            DetectionStrategy::FullAnalysis => {
                self.detect_with_full_analysis(path, content)
            }
            DetectionStrategy::Custom(ref rules) => {
                let rules = rules.clone();
                self.detect_with_custom_rules(path, content, &rules)
            }
        }
    }

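    /// Run content-based detection and then nudge the confidence using
    /// project-level hints (project type, dominant languages, framework markers).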
    pub fn detect_with_hints(&mut self, path: &Path, content: &str, hints: &LanguageHints) -> DetectionResult {
        let mut base_result = self.detect_language_with_content(path, content);

        if let Some(project_type) = &hints.project_type {
            base_result = self.apply_project_type_bias(base_result, project_type);
        }

        if !hints.dominant_languages.is_empty() {
            base_result = self.apply_dominant_language_bias(base_result, &hints.dominant_languages);
        }

        if !hints.framework_indicators.is_empty() {
            base_result = self.apply_framework_bias(base_result, &hints.framework_indicators);
        }

        base_result
    }

    fn initialize_detection_rules(&mut self) {
        self.initialize_extension_map();
        self.initialize_filename_patterns();
        self.initialize_shebang_patterns();
        self.initialize_content_signatures();
        self.initialize_ast_parsers();
        self.initialize_syntax_analyzers();
    }

    fn initialize_extension_map(&mut self) {
        let extensions = vec![
            // Rust
            ("rs", vec![(Language::Rust, 1.0)]),

            // Python
            ("py", vec![(Language::Python, 0.95)]),
            ("pyw", vec![(Language::Python, 1.0)]),
            ("pyi", vec![(Language::Python, 1.0)]),

            // JavaScript / TypeScript
            ("js", vec![(Language::JavaScript, 0.9)]),
            ("jsx", vec![(Language::JavaScript, 1.0)]),
            ("mjs", vec![(Language::JavaScript, 1.0)]),
            ("ts", vec![(Language::TypeScript, 1.0)]),
            ("tsx", vec![(Language::TypeScript, 1.0)]),

            // JVM languages
            ("java", vec![(Language::Java, 1.0)]),
            ("kt", vec![(Language::Kotlin, 1.0)]),
            ("kts", vec![(Language::Kotlin, 1.0)]),
            ("scala", vec![(Language::Scala, 1.0)]),
            ("sc", vec![(Language::Scala, 0.8)]),

            // C / C++
            ("c", vec![(Language::C, 0.9)]),
            ("h", vec![(Language::C, 0.7), (Language::Cpp, 0.3)]),
            ("cpp", vec![(Language::Cpp, 1.0)]),
            ("cxx", vec![(Language::Cpp, 1.0)]),
            ("cc", vec![(Language::Cpp, 1.0)]),
            ("hpp", vec![(Language::Cpp, 1.0)]),
            ("hxx", vec![(Language::Cpp, 1.0)]),

            // C#
            ("cs", vec![(Language::CSharp, 1.0)]),

            // Go
            ("go", vec![(Language::Go, 1.0)]),

            // Ruby
            ("rb", vec![(Language::Ruby, 1.0)]),
            ("rbw", vec![(Language::Ruby, 1.0)]),

            // PHP
            ("php", vec![(Language::PHP, 1.0)]),
            ("phtml", vec![(Language::PHP, 1.0)]),

            // Swift
            ("swift", vec![(Language::Swift, 1.0)]),

            // Dart
            ("dart", vec![(Language::Dart, 1.0)]),

            // Shell
            ("sh", vec![(Language::Bash, 1.0)]),
            ("bash", vec![(Language::Bash, 1.0)]),
            ("zsh", vec![(Language::Bash, 1.0)]),
            ("fish", vec![(Language::Bash, 1.0)]),

            // Web markup and styles
            ("html", vec![(Language::HTML, 1.0)]),
            ("htm", vec![(Language::HTML, 1.0)]),
            ("css", vec![(Language::CSS, 1.0)]),
            ("scss", vec![(Language::SCSS, 1.0)]),
            ("sass", vec![(Language::SASS, 1.0)]),

            // Markup and data formats
            ("md", vec![(Language::Markdown, 1.0)]),
            ("markdown", vec![(Language::Markdown, 1.0)]),
            ("xml", vec![(Language::XML, 1.0)]),
            ("json", vec![(Language::JSON, 1.0)]),
            ("yaml", vec![(Language::YAML, 1.0)]),
            ("yml", vec![(Language::YAML, 1.0)]),
            ("toml", vec![(Language::TOML, 1.0)]),

            // Generic configuration
            ("ini", vec![(Language::Unknown, 1.0)]),
            ("cfg", vec![(Language::Unknown, 0.8)]),
            ("conf", vec![(Language::Unknown, 0.7)]),

            // SQL
            ("sql", vec![(Language::SQL, 1.0)]),

            // Documentation formats without a dedicated variant
            ("rst", vec![(Language::Unknown, 1.0)]),
            ("tex", vec![(Language::Unknown, 1.0)]),

            // Other languages
            ("r", vec![(Language::R, 1.0)]),
            ("R", vec![(Language::R, 1.0)]),
            ("m", vec![(Language::ObjectiveC, 0.6), (Language::Matlab, 0.4)]),
            ("mm", vec![(Language::ObjectiveC, 1.0)]),
            ("pl", vec![(Language::Unknown, 0.8)]),
            ("pm", vec![(Language::Unknown, 1.0)]),
            ("lua", vec![(Language::Unknown, 1.0)]),
            ("vim", vec![(Language::Unknown, 1.0)]),
            ("hs", vec![(Language::Haskell, 1.0)]),
            ("lhs", vec![(Language::Haskell, 1.0)]),
        ];

        for (ext, languages) in extensions {
            self.extension_map.insert(ext.to_string(), languages);
        }
    }

    fn initialize_filename_patterns(&mut self) {
        let patterns = vec![
            ("Makefile", Language::Unknown),
            ("makefile", Language::Unknown),
            ("Dockerfile", Language::Unknown),
            ("dockerfile", Language::Unknown),
            ("Cargo.toml", Language::TOML),
            ("Cargo.lock", Language::TOML),
            ("package.json", Language::JSON),
            ("tsconfig.json", Language::JSON),
            ("pyproject.toml", Language::TOML),
            ("setup.py", Language::Python),
            ("requirements.txt", Language::Unknown),
            ("README", Language::Unknown),
            ("LICENSE", Language::Unknown),
            ("CHANGELOG", Language::Unknown),
            ("CMakeLists.txt", Language::Unknown),
            (".gitignore", Language::Unknown),
            (".dockerignore", Language::Unknown),
            ("Jenkinsfile", Language::Unknown),
            ("build.gradle", Language::Unknown),
            ("pom.xml", Language::XML),
        ];

        for (filename, language) in patterns {
            self.filename_patterns.insert(filename.to_string(), language);
        }
    }

    fn initialize_shebang_patterns(&mut self) {
        let patterns = vec![
            ("python", Language::Python),
            ("python3", Language::Python),
            ("python2", Language::Python),
            ("node", Language::JavaScript),
            ("bash", Language::Bash),
            ("sh", Language::Bash),
            ("zsh", Language::Bash),
            ("fish", Language::Bash),
            ("ruby", Language::Ruby),
            ("php", Language::PHP),
            ("env python", Language::Python),
            ("env node", Language::JavaScript),
            ("env bash", Language::Bash),
            ("env ruby", Language::Ruby),
        ];

        for (pattern, language) in patterns {
            self.shebang_patterns.insert(pattern.to_string(), language);
        }
    }

    fn initialize_content_signatures(&mut self) {
        let python_sigs = vec![
            ContentSignature {
                language: Language::Python,
                patterns: vec![
                    r"def\s+\w+\s*\(".to_string(),
                    r"import\s+\w+".to_string(),
                    r"from\s+\w+\s+import".to_string(),
                    r"class\s+\w+\s*\(".to_string(),
                    r"__\w+__".to_string(),
                ],
                weight: 0.9,
                required_matches: 2,
            }
        ];
        self.content_signatures.insert(Language::Python, python_sigs);

        let js_sigs = vec![
            ContentSignature {
                language: Language::JavaScript,
                patterns: vec![
                    r"function\s+\w+\s*\(".to_string(),
                    r"const\s+\w+\s*=".to_string(),
                    r"let\s+\w+\s*=".to_string(),
                    r"=>\s*\{".to_string(),
                    r"require\s*\(".to_string(),
                    r"console\.log\s*\(".to_string(),
                ],
                weight: 0.8,
                required_matches: 2,
            }
        ];
        self.content_signatures.insert(Language::JavaScript, js_sigs);

        let rust_sigs = vec![
            ContentSignature {
                language: Language::Rust,
                patterns: vec![
                    r"fn\s+\w+\s*\(".to_string(),
                    r"use\s+[\w:]+".to_string(),
                    r"struct\s+\w+".to_string(),
                    r"impl\s+[\w<>]+".to_string(),
                    r"let\s+mut\s+\w+".to_string(),
                    r"match\s+\w+\s*\{".to_string(),
                ],
                weight: 0.95,
                required_matches: 2,
            }
        ];
        self.content_signatures.insert(Language::Rust, rust_sigs);
    }

    fn initialize_ast_parsers(&mut self) {
        for (language, ts_lang_fn) in TS_LANGUAGES.iter() {
            let mut parser = Parser::new();
            if parser.set_language(ts_lang_fn()).is_ok() {
                self.ast_parsers.insert(language.clone(), parser);
            }
        }
    }

    fn initialize_syntax_analyzers(&mut self) {
        let python_analyzer = SyntaxAnalyzer {
            language: Language::Python,
            keywords: vec![
                "def".to_string(), "class".to_string(), "import".to_string(),
                "from".to_string(), "if".to_string(), "elif".to_string(),
            ],
            structural_patterns: vec![
                "function_definition".to_string(),
                "class_definition".to_string(),
                "import_statement".to_string(),
                "import_from_statement".to_string(),
            ],
            confidence_weights: HashMap::from([
                ("function_definition".to_string(), 0.9),
                ("class_definition".to_string(), 0.9),
                ("import_statement".to_string(), 0.8),
            ]),
        };
        self.syntax_analyzers.insert(Language::Python, python_analyzer);

        let js_analyzer = SyntaxAnalyzer {
            language: Language::JavaScript,
            keywords: vec![
                "function".to_string(), "class".to_string(), "import".to_string(),
                "const".to_string(), "let".to_string(), "var".to_string(),
            ],
            structural_patterns: vec![
                "function_declaration".to_string(),
                "class_declaration".to_string(),
                "import_statement".to_string(),
                "variable_declaration".to_string(),
            ],
            confidence_weights: HashMap::from([
                ("function_declaration".to_string(), 0.9),
                ("class_declaration".to_string(), 0.9),
                ("import_statement".to_string(), 0.8),
            ]),
        };
        self.syntax_analyzers.insert(Language::JavaScript, js_analyzer);

        let rust_analyzer = SyntaxAnalyzer {
            language: Language::Rust,
            keywords: vec![
                "fn".to_string(), "struct".to_string(), "enum".to_string(),
                "impl".to_string(), "use".to_string(), "mod".to_string(),
            ],
            structural_patterns: vec![
                "function_item".to_string(),
                "struct_item".to_string(),
                "enum_item".to_string(),
                "use_declaration".to_string(),
            ],
            confidence_weights: HashMap::from([
                ("function_item".to_string(), 0.9),
                ("struct_item".to_string(), 0.9),
                ("use_declaration".to_string(), 0.8),
            ]),
        };
        self.syntax_analyzers.insert(Language::Rust, rust_analyzer);
    }

    fn detect_by_extension(&self, path: &Path) -> Language {
        if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
            if let Some(languages) = self.extension_map.get(&extension.to_lowercase()) {
                return languages[0].0.clone();
            }
        }

        Language::Unknown
    }

    fn detect_by_extension_and_filename(&self, path: &Path) -> Language {
        if let Some(filename) = path.file_name().and_then(|name| name.to_str()) {
            if let Some(language) = self.filename_patterns.get(filename) {
                return language.clone();
            }
        }

        self.detect_by_extension(path)
    }

    fn detect_with_content_analysis(&mut self, path: &Path, content: &str) -> DetectionResult {
        let mut candidates = Vec::new();
        let mut evidence = Vec::new();

        let extension_lang = self.detect_by_extension_and_filename(path);
        if extension_lang != Language::Unknown {
            candidates.push((extension_lang.clone(), 0.7));
            evidence.push(DetectionEvidence {
                evidence_type: EvidenceType::Extension,
                description: format!("File extension suggests: {:?}", extension_lang),
                weight: 0.7,
            });
        }

        if let Some(shebang_lang) = self.detect_by_shebang(content) {
            candidates.push((shebang_lang.clone(), 0.95));
            evidence.push(DetectionEvidence {
                evidence_type: EvidenceType::Shebang,
                description: format!("Shebang indicates: {:?}", shebang_lang),
                weight: 0.95,
            });
        }

        let signature_results = self.analyze_content_signatures(content);
        for (lang, confidence) in signature_results {
            candidates.push((lang.clone(), confidence));
            evidence.push(DetectionEvidence {
                evidence_type: EvidenceType::Syntax,
                description: format!("Content signatures match: {:?}", lang),
                weight: confidence,
            });
        }

        let import_results = self.analyze_import_patterns(content);
        for (lang, confidence) in import_results {
            candidates.push((lang.clone(), confidence));
            evidence.push(DetectionEvidence {
                evidence_type: EvidenceType::Import,
                description: format!("Import patterns match: {:?}", lang),
                weight: confidence,
            });
        }

        self.aggregate_detection_results(candidates, evidence)
    }

    fn detect_with_full_analysis(&mut self, path: &Path, content: &str) -> DetectionResult {
        let mut base_result = self.detect_with_content_analysis(path, content);

        let statistical_results = self.statistical_analysis(content);
        for (lang, confidence) in statistical_results {
            base_result.alternatives.push((lang, confidence));
        }

        base_result.alternatives.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());

        base_result
    }

    fn detect_with_custom_rules(&mut self, path: &Path, content: &str, rules: &CustomDetectionRules) -> DetectionResult {
        let mut candidates = Vec::new();
        let mut evidence = Vec::new();

        if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
            if let Some(language) = rules.extension_overrides.get(&extension.to_lowercase()) {
                candidates.push((language.clone(), 1.0));
                evidence.push(DetectionEvidence {
                    evidence_type: EvidenceType::Extension,
                    description: format!("Custom extension rule: {} -> {:?}", extension, language),
                    weight: 1.0,
                });
            }
        }

        if let Some(filename) = path.file_name().and_then(|name| name.to_str()) {
            if let Some(language) = rules.filename_patterns.get(filename) {
                candidates.push((language.clone(), 1.0));
                evidence.push(DetectionEvidence {
                    evidence_type: EvidenceType::Filename,
                    description: format!("Custom filename rule: {} -> {:?}", filename, language),
                    weight: 1.0,
                });
            }
        }

        for signature in &rules.content_signatures {
            let matches = signature.patterns.iter()
                .map(|pattern| content.matches(pattern).count())
                .sum::<usize>();

            if matches >= signature.required_matches {
                candidates.push((signature.language.clone(), signature.weight));
                evidence.push(DetectionEvidence {
                    evidence_type: EvidenceType::Syntax,
                    description: format!("Custom signature matches for {:?}: {}", signature.language, matches),
                    weight: signature.weight,
                });
            }
        }

        if candidates.is_empty() {
            return self.detect_with_content_analysis(path, content);
        }

        self.aggregate_detection_results(candidates, evidence)
    }

    fn detect_by_shebang(&self, content: &str) -> Option<Language> {
        let lines: Vec<&str> = content.lines().collect();
        if lines.is_empty() {
            return None;
        }

        let first_line = lines[0];
        if first_line.starts_with("#!") {
            let shebang_path = first_line[2..].trim();

            for (pattern, language) in &self.shebang_patterns {
                if shebang_path.contains(pattern) {
                    return Some(language.clone());
                }
            }
        }

        None
    }

    fn analyze_content_signatures(&self, content: &str) -> Vec<(Language, f32)> {
        let mut results = Vec::new();

        for (language, signatures) in &self.content_signatures {
            for signature in signatures {
                let matches = signature.patterns.iter()
                    .map(|pattern| {
                        match Regex::new(pattern) {
                            Ok(regex) => regex.find_iter(content).count(),
                            // Fall back to literal substring matching if the pattern is not valid regex.
                            Err(_) => content.matches(pattern).count(),
                        }
                    })
                    .sum::<usize>();

                if matches >= signature.required_matches {
                    let confidence = (matches as f32 / signature.patterns.len() as f32) * signature.weight;
                    results.push((language.clone(), confidence));
                }
            }
        }

        results
    }

    fn analyze_import_patterns(&mut self, content: &str) -> Vec<(Language, f32)> {
        let mut results = Vec::new();
        let mut trees: Vec<(Language, tree_sitter::Tree)> = Vec::new();

        for (language, parser) in &mut self.ast_parsers {
            if let Some(tree) = parser.parse(content, None) {
                trees.push((language.clone(), tree));
            }
        }

        for (language, tree) in trees {
            let root_node = tree.root_node();
            let import_count = self.count_import_nodes(&root_node, &language);

            if import_count > 0 {
                let confidence = (import_count as f32 / 10.0).min(0.9);
                results.push((language, confidence));
            }
        }

        results
    }

    fn statistical_analysis(&mut self, content: &str) -> Vec<(Language, f32)> {
        let mut results = Vec::new();
        let mut analysis_data: Vec<(Language, tree_sitter::Tree, SyntaxAnalyzer)> = Vec::new();

        for (language, analyzer) in &self.syntax_analyzers {
            if let Some(parser) = self.ast_parsers.get_mut(language) {
                if let Some(tree) = parser.parse(content, None) {
                    analysis_data.push((language.clone(), tree, analyzer.clone()));
                }
            }
        }

        for (language, tree, analyzer) in analysis_data {
            let root_node = tree.root_node();
            let structural_score = self.calculate_structural_score(&root_node, &analyzer);

            if structural_score > 0.0 {
                results.push((language, structural_score));
            }
        }

        results
    }

    fn count_import_nodes(&self, node: &Node, language: &Language) -> usize {
        let mut count = 0;
        let import_types: &[&str] = match language {
            Language::Python => &["import_statement", "import_from_statement"],
            Language::JavaScript | Language::TypeScript => &["import_statement", "import_declaration"],
            Language::Rust => &["use_declaration"],
            Language::Go => &["import_spec", "import_declaration"],
            Language::Java => &["import_declaration"],
            _ => &[],
        };

        self.count_nodes_recursive(node, import_types, &mut count);
        count
    }

    fn calculate_structural_score(&self, node: &Node, analyzer: &SyntaxAnalyzer) -> f32 {
        let mut score = 0.0;

        for pattern in &analyzer.structural_patterns {
            let count = self.count_specific_nodes(node, pattern);
            if count > 0 {
                let weight = analyzer.confidence_weights.get(pattern).unwrap_or(&0.5);
                score += (count as f32) * weight;
            }
        }

        (score / 10.0).min(1.0)
    }

    fn count_nodes_recursive(&self, node: &Node, target_types: &[&str], count: &mut usize) {
        if target_types.contains(&node.kind()) {
            *count += 1;
        }

        for i in 0..node.child_count() {
            if let Some(child) = node.child(i) {
                self.count_nodes_recursive(&child, target_types, count);
            }
        }
    }

    fn count_specific_nodes(&self, node: &Node, target_type: &str) -> usize {
        let mut count = 0;
        self.count_nodes_recursive(node, &[target_type], &mut count);
        count
    }

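    /// Sum per-language candidate scores, pick the highest as the final answer
    /// (clamped to 1.0), keep the rest as ranked alternatives, and report the
    /// detection method by evidence priority: shebang, then content signatures,
    /// then extension, otherwise hybrid.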
    fn aggregate_detection_results(&self, candidates: Vec<(Language, f32)>, evidence: Vec<DetectionEvidence>) -> DetectionResult {
        if candidates.is_empty() {
            return DetectionResult {
                language: Language::Unknown,
                confidence: 0.0,
                detection_method: DetectionMethod::FileExtension,
                alternatives: vec![],
                evidence,
            };
        }

        let mut language_scores: HashMap<Language, f32> = HashMap::new();

        for (lang, confidence) in &candidates {
            *language_scores.entry(lang.clone()).or_insert(0.0) += confidence;
        }

        let (best_language, best_confidence) = language_scores.iter()
            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
            .map(|(lang, conf)| (lang.clone(), *conf))
            .unwrap_or((Language::Unknown, 0.0));

        let normalized_confidence = best_confidence.min(1.0);

        let mut alternatives: Vec<(Language, f32)> = language_scores
            .into_iter()
            .filter(|(lang, _)| *lang != best_language)
            .collect();
        alternatives.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());

        let detection_method = if evidence.iter().any(|e| e.evidence_type == EvidenceType::Shebang) {
            DetectionMethod::Shebang
        } else if evidence.iter().any(|e| e.evidence_type == EvidenceType::Syntax) {
            DetectionMethod::ContentSignature
        } else if evidence.iter().any(|e| e.evidence_type == EvidenceType::Extension) {
            DetectionMethod::FileExtension
        } else {
            DetectionMethod::Hybrid
        };

        DetectionResult {
            language: best_language,
            confidence: normalized_confidence,
            detection_method,
            alternatives,
            evidence,
        }
    }

    fn apply_project_type_bias(&self, mut result: DetectionResult, project_type: &ProjectType) -> DetectionResult {
        let bias_factor = 0.25;

        match project_type {
            ProjectType::WebFrontend => {
                if matches!(result.language, Language::JavaScript | Language::TypeScript | Language::HTML | Language::CSS) {
                    result.confidence += bias_factor;
                }
            }
            ProjectType::WebBackend => {
                if matches!(result.language, Language::Python | Language::JavaScript | Language::TypeScript | Language::Java | Language::Go | Language::Rust) {
                    result.confidence += bias_factor;
                }
            }
            ProjectType::SystemsProgram => {
                if matches!(result.language, Language::Rust | Language::C | Language::Cpp | Language::Go) {
                    result.confidence += bias_factor;
                }
            }
            ProjectType::DataScience => {
                if matches!(result.language, Language::Python | Language::R | Language::SQL) {
                    result.confidence += bias_factor;
                }
            }
            _ => {}
        }

        result.confidence = result.confidence.min(1.0);
        result
    }

    fn apply_dominant_language_bias(&self, mut result: DetectionResult, dominant_languages: &[Language]) -> DetectionResult {
        if dominant_languages.contains(&result.language) {
            result.confidence += 0.15;
            result.confidence = result.confidence.min(1.0);
        }
        result
    }

    fn apply_framework_bias(&self, mut result: DetectionResult, framework_indicators: &[String]) -> DetectionResult {
        for indicator in framework_indicators {
            match indicator.as_str() {
                "package.json" | "node_modules" => {
                    if matches!(result.language, Language::JavaScript | Language::TypeScript) {
                        result.confidence += 0.1;
                    }
                }
                "Cargo.toml" | "Cargo.lock" => {
                    if result.language == Language::Rust {
                        result.confidence += 0.1;
                    }
                }
                "requirements.txt" | "__pycache__" | ".pyc" => {
                    if result.language == Language::Python {
                        result.confidence += 0.1;
                    }
                }
                _ => {}
            }
        }

        result.confidence = result.confidence.min(1.0);
        result
    }
}

impl Default for LanguageDetector {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extension_detection() {
        let detector = LanguageDetector::new();

        assert_eq!(detector.detect_language(Path::new("test.rs")), Language::Rust);
        assert_eq!(detector.detect_language(Path::new("test.py")), Language::Python);
        assert_eq!(detector.detect_language(Path::new("test.js")), Language::JavaScript);
        assert_eq!(detector.detect_language(Path::new("test.ts")), Language::TypeScript);
        assert_eq!(detector.detect_language(Path::new("test.java")), Language::Java);
        assert_eq!(detector.detect_language(Path::new("test.go")), Language::Go);
        assert_eq!(detector.detect_language(Path::new("test.cpp")), Language::Cpp);
        assert_eq!(detector.detect_language(Path::new("test.c")), Language::C);
    }

    #[test]
    fn test_filename_patterns() {
        let detector = LanguageDetector::new();

        assert_eq!(detector.detect_language(Path::new("Makefile")), Language::Unknown);
        assert_eq!(detector.detect_language(Path::new("Dockerfile")), Language::Unknown);
        assert_eq!(detector.detect_language(Path::new("Cargo.toml")), Language::TOML);
        assert_eq!(detector.detect_language(Path::new("package.json")), Language::JSON);
    }

    #[test]
    fn test_shebang_detection() {
        let mut detector = LanguageDetector::new();

        let python_script = "#!/usr/bin/env python3\nprint('Hello, world!')";
        let result = detector.detect_language_with_content(Path::new("script"), python_script);
        assert_eq!(result.language, Language::Python);
        assert!(result.confidence > 0.9);
        assert_eq!(result.detection_method, DetectionMethod::Shebang);

        let bash_script = "#!/bin/bash\necho 'Hello, world!'";
        let result = detector.detect_language_with_content(Path::new("script"), bash_script);
        assert_eq!(result.language, Language::Bash);
        assert!(result.confidence > 0.9);
    }

    #[test]
    fn test_content_signature_detection() {
        let mut detector = LanguageDetector::new();

        let python_code = r#"
def hello_world():
    print("Hello, world!")

class MyClass:
    def __init__(self):
        pass

import sys
from collections import defaultdict
        "#;

        let result = detector.detect_language_with_content(Path::new("unknown"), python_code);
        assert_eq!(result.language, Language::Python);
        assert!(result.confidence > 0.5);

        let rust_code = r#"
fn main() {
    println!("Hello, world!");
}

struct MyStruct {
    field: i32,
}

impl MyStruct {
    fn new() -> Self {
        MyStruct { field: 0 }
    }
}

use std::collections::HashMap;
        "#;

        let result = detector.detect_language_with_content(Path::new("unknown"), rust_code);
        assert_eq!(result.language, Language::Rust);
        assert!(result.confidence > 0.5);
    }

    #[test]
    fn test_import_pattern_detection() {
        let mut detector = LanguageDetector::new();

        let js_code = r#"
import React from 'react';
import { useState } from 'react';
const fs = require('fs');
        "#;

        let result = detector.detect_language_with_content(Path::new("unknown"), js_code);
        assert_eq!(result.language, Language::JavaScript);

        let python_code = r#"
import os
import sys
from collections import defaultdict, Counter
        "#;

        let result = detector.detect_language_with_content(Path::new("unknown"), python_code);
        assert_eq!(result.language, Language::Python);
    }

    #[test]
    fn test_hybrid_detection() {
        let mut detector = LanguageDetector::new();

        let python_code = "def hello():\n import sys\n print('Hello')";
        let result = detector.detect_language_with_content(Path::new("test.py"), python_code);
        assert_eq!(result.language, Language::Python);
        assert!(result.confidence > 0.6);
        assert!(result.evidence.len() > 1);

        let python_code = "def hello(): print('Hello')";
        let result = detector.detect_language_with_content(Path::new("test.js"), python_code);
        assert!(result.language == Language::Python || result.language == Language::JavaScript);
    }

    #[test]
    fn test_detection_with_hints() {
        let mut detector = LanguageDetector::new();

        let hints = LanguageHints {
            project_type: Some(ProjectType::WebFrontend),
            dominant_languages: vec![Language::TypeScript],
            framework_indicators: vec!["package.json".to_string()],
            ..Default::default()
        };

        let ts_code = "const hello = () => console.log('Hello');";
        let result = detector.detect_with_hints(Path::new("unknown"), ts_code, &hints);

        assert_eq!(result.language, Language::JavaScript);
        assert!(result.confidence > 0.5);
    }

    #[test]
    fn test_custom_detection_rules() {
        let mut custom_rules = CustomDetectionRules {
            extension_overrides: HashMap::new(),
            filename_patterns: HashMap::new(),
            content_signatures: vec![],
            priority_languages: vec![],
        };

        custom_rules.extension_overrides.insert("myext".to_string(), Language::Rust);

        let mut detector = LanguageDetector::with_strategy(DetectionStrategy::Custom(custom_rules));

        let result = detector.detect_language_with_content(Path::new("test.myext"), "some content");
        assert_eq!(result.language, Language::Rust);
        assert_eq!(result.confidence, 1.0);
    }

    #[test]
    fn test_detection_evidence() {
        let mut detector = LanguageDetector::new();

        let python_code = "#!/usr/bin/env python\ndef hello(): print('Hello')";
        let result = detector.detect_language_with_content(Path::new("test.py"), python_code);

        assert!(result.evidence.len() >= 2);
        assert!(result.evidence.iter().any(|e| e.evidence_type == EvidenceType::Shebang));
        assert!(result.evidence.iter().any(|e| e.evidence_type == EvidenceType::Extension));
    }

    #[test]
    fn test_confidence_scoring() {
        let mut detector = LanguageDetector::new();

        let strong_python = "#!/usr/bin/env python3\nimport os\ndef main(): pass\nclass Test: pass";
        let result = detector.detect_language_with_content(Path::new("test.py"), strong_python);
        assert!(result.confidence > 0.8);

        let weak_indicators = "hello world";
        let result = detector.detect_language_with_content(Path::new("test.py"), weak_indicators);
        assert!(result.confidence < 0.8);
    }

    #[test]
    fn test_alternatives_ranking() {
        let mut detector = LanguageDetector::new();

        let ambiguous_code = "print hello";
        let result = detector.detect_language_with_content(Path::new("unknown"), ambiguous_code);

        if result.alternatives.len() > 1 {
            assert!(result.alternatives[0].1 >= result.alternatives[1].1);
        }
    }
}