scribe_scanner/
language_detection.rs

1//! Advanced programming language detection for 25+ languages.
2//!
3//! This module provides sophisticated language detection capabilities using:
4//! - File extension analysis with priority mapping
5//! - Content-based detection using language signatures
6//! - Shebang line analysis for scripts
7//! - Filename pattern matching (e.g., Makefile, Dockerfile)
8//! - Statistical content analysis for ambiguous cases
9
10use scribe_core::{Language, Result};
11use std::path::Path;
12use std::collections::HashMap;
13use once_cell::sync::Lazy;
14use serde::{Serialize, Deserialize};
15use tree_sitter::{Parser, Language as TsLanguage, Node};
16use regex::Regex;
17
18/// Language detection strategy configuration
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub enum DetectionStrategy {
21    /// Extension-only detection (fastest)
22    ExtensionOnly,
23    /// Extension + content analysis (default)
24    ExtensionWithContent,
25    /// Full analysis including statistical detection (most accurate)
26    FullAnalysis,
27    /// Custom detection with user-defined rules
28    Custom(CustomDetectionRules),
29}
30
31/// Custom detection rules for specialized cases
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct CustomDetectionRules {
34    pub extension_overrides: HashMap<String, Language>,
35    pub filename_patterns: HashMap<String, Language>,
36    pub content_signatures: Vec<ContentSignature>,
37    pub priority_languages: Vec<Language>,
38}
39
40/// Content signature for language detection
41#[derive(Debug, Clone, Serialize, Deserialize)]
42pub struct ContentSignature {
43    pub language: Language,
44    pub patterns: Vec<String>,
45    pub weight: f32,
46    pub required_matches: usize,
47}
48
49/// Language detection hints for improved accuracy
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct LanguageHints {
52    pub project_type: Option<ProjectType>,
53    pub build_files: Vec<String>,
54    pub directory_structure: Vec<String>,
55    pub dominant_languages: Vec<Language>,
56    pub framework_indicators: Vec<String>,
57}
58
59/// Project type classification
60#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
61pub enum ProjectType {
62    WebFrontend,
63    WebBackend,
64    MobileApp,
65    DesktopApp,
66    SystemsProgram,
67    DataScience,
68    GameDevelopment,
69    EmbeddedSystem,
70    Library,
71    Documentation,
72    Configuration,
73    Unknown,
74}
75
76/// Language detection results with confidence scores
77#[derive(Debug, Clone, Serialize, Deserialize)]
78pub struct DetectionResult {
79    pub language: Language,
80    pub confidence: f32,
81    pub detection_method: DetectionMethod,
82    pub alternatives: Vec<(Language, f32)>,
83    pub evidence: Vec<DetectionEvidence>,
84}
85
86/// Method used for language detection
87#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
88pub enum DetectionMethod {
89    FileExtension,
90    Filename,
91    Shebang,
92    ContentSignature,
93    StatisticalAnalysis,
94    Hybrid,
95}
96
97/// Evidence supporting language detection
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct DetectionEvidence {
100    pub evidence_type: EvidenceType,
101    pub description: String,
102    pub weight: f32,
103}
104
105/// Type of detection evidence
106#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
107pub enum EvidenceType {
108    Extension,
109    Filename,
110    Shebang,
111    Keyword,
112    Syntax,
113    Import,
114    Framework,
115    BuildSystem,
116}
117
118/// High-performance language detector with multiple strategies
119pub struct LanguageDetector {
120    strategy: DetectionStrategy,
121    extension_map: HashMap<String, Vec<(Language, f32)>>, // extension -> (language, confidence)
122    filename_patterns: HashMap<String, Language>,
123    content_signatures: HashMap<Language, Vec<ContentSignature>>,
124    shebang_patterns: HashMap<String, Language>,
125    ast_parsers: HashMap<Language, Parser>,
126    syntax_analyzers: HashMap<Language, SyntaxAnalyzer>,
127}
128
129/// AST-based syntax analyzer for content analysis
130#[derive(Debug, Clone)]
131struct SyntaxAnalyzer {
132    language: Language,
133    keywords: Vec<String>,
134    structural_patterns: Vec<String>, // AST node types to look for
135    confidence_weights: HashMap<String, f32>,
136}
137
138// Tree-sitter language mapping for AST analysis
139static TS_LANGUAGES: Lazy<HashMap<Language, fn() -> TsLanguage>> = Lazy::new(|| {
140    let mut languages = HashMap::new();
141    languages.insert(Language::Python, tree_sitter_python::language as fn() -> TsLanguage);
142    languages.insert(Language::JavaScript, tree_sitter_javascript::language as fn() -> TsLanguage);
143    languages.insert(Language::TypeScript, tree_sitter_typescript::language_typescript as fn() -> TsLanguage);
144    languages.insert(Language::Rust, tree_sitter_rust::language as fn() -> TsLanguage);
145    languages.insert(Language::Go, tree_sitter_go::language as fn() -> TsLanguage);
146    languages
147});
148
149impl Default for DetectionStrategy {
150    fn default() -> Self {
151        DetectionStrategy::ExtensionWithContent
152    }
153}
154
155impl Default for LanguageHints {
156    fn default() -> Self {
157        Self {
158            project_type: None,
159            build_files: Vec::new(),
160            directory_structure: Vec::new(),
161            dominant_languages: Vec::new(),
162            framework_indicators: Vec::new(),
163        }
164    }
165}
166
167impl LanguageDetector {
168    /// Create a new language detector with default configuration
169    pub fn new() -> Self {
170        let mut detector = Self {
171            strategy: DetectionStrategy::default(),
172            extension_map: HashMap::new(),
173            filename_patterns: HashMap::new(),
174            content_signatures: HashMap::new(),
175            shebang_patterns: HashMap::new(),
176            ast_parsers: HashMap::new(),
177            syntax_analyzers: HashMap::new(),
178        };
179        
180        detector.initialize_detection_rules();
181        detector
182    }
183
184    /// Create a language detector with custom strategy
185    pub fn with_strategy(strategy: DetectionStrategy) -> Self {
186        let mut detector = Self::new();
187        detector.strategy = strategy;
188        detector
189    }
190
191    /// Detect language for a file path (extension-based)
192    pub fn detect_language(&self, path: &Path) -> Language {
193        match self.strategy {
194            DetectionStrategy::ExtensionOnly => {
195                self.detect_by_extension(path)
196            }
197            _ => {
198                // For more complex strategies, we'd need file content
199                // This is a fallback for when only path is available
200                self.detect_by_extension_and_filename(path)
201            }
202        }
203    }
204
205    /// Detect language with full content analysis
206    pub fn detect_language_with_content(&mut self, path: &Path, content: &str) -> DetectionResult {
207        match self.strategy {
208            DetectionStrategy::ExtensionOnly => {
209                let language = self.detect_by_extension(path);
210                DetectionResult {
211                    language: language.clone(),
212                    confidence: if language == Language::Unknown { 0.1 } else { 0.9 },
213                    detection_method: DetectionMethod::FileExtension,
214                    alternatives: vec![],
215                    evidence: vec![DetectionEvidence {
216                        evidence_type: EvidenceType::Extension,
217                        description: format!("File extension: {:?}", path.extension()),
218                        weight: 0.9,
219                    }],
220                }
221            }
222            DetectionStrategy::ExtensionWithContent => {
223                self.detect_with_content_analysis(path, content)
224            }
225            DetectionStrategy::FullAnalysis => {
226                self.detect_with_full_analysis(path, content)
227            }
228            DetectionStrategy::Custom(ref rules) => {
229                let rules = rules.clone();
230                self.detect_with_custom_rules(path, content, &rules)
231            }
232        }
233    }
234
235    /// Detect language with project context hints
236    pub fn detect_with_hints(&mut self, path: &Path, content: &str, hints: &LanguageHints) -> DetectionResult {
237        let mut base_result = self.detect_language_with_content(path, content);
238        
239        // Apply hints to improve detection accuracy
240        if let Some(project_type) = &hints.project_type {
241            base_result = self.apply_project_type_bias(base_result, project_type);
242        }
243        
244        if !hints.dominant_languages.is_empty() {
245            base_result = self.apply_dominant_language_bias(base_result, &hints.dominant_languages);
246        }
247        
248        if !hints.framework_indicators.is_empty() {
249            base_result = self.apply_framework_bias(base_result, &hints.framework_indicators);
250        }
251        
252        base_result
253    }
254
255    /// Initialize all detection rules and patterns
256    fn initialize_detection_rules(&mut self) {
257        self.initialize_extension_map();
258        self.initialize_filename_patterns();
259        self.initialize_shebang_patterns();
260        self.initialize_content_signatures();
261        self.initialize_ast_parsers();
262        self.initialize_syntax_analyzers();
263    }
264
265    /// Initialize file extension to language mapping
266    fn initialize_extension_map(&mut self) {
267        let extensions = vec![
268            // Rust
269            ("rs", vec![(Language::Rust, 1.0)]),
270            
271            // Python
272            ("py", vec![(Language::Python, 0.95)]),
273            ("pyw", vec![(Language::Python, 1.0)]),
274            ("pyi", vec![(Language::Python, 1.0)]),
275            
276            // JavaScript/TypeScript
277            ("js", vec![(Language::JavaScript, 0.9)]),
278            ("jsx", vec![(Language::JavaScript, 1.0)]),
279            ("mjs", vec![(Language::JavaScript, 1.0)]),
280            ("ts", vec![(Language::TypeScript, 1.0)]),
281            ("tsx", vec![(Language::TypeScript, 1.0)]),
282            
283            // Java/Kotlin/Scala
284            ("java", vec![(Language::Java, 1.0)]),
285            ("kt", vec![(Language::Kotlin, 1.0)]),
286            ("kts", vec![(Language::Kotlin, 1.0)]),
287            ("scala", vec![(Language::Scala, 1.0)]),
288            ("sc", vec![(Language::Scala, 0.8)]),
289            
290            // C/C++
291            ("c", vec![(Language::C, 0.9)]),
292            ("h", vec![(Language::C, 0.7), (Language::Cpp, 0.3)]),
293            ("cpp", vec![(Language::Cpp, 1.0)]),
294            ("cxx", vec![(Language::Cpp, 1.0)]),
295            ("cc", vec![(Language::Cpp, 1.0)]),
296            ("hpp", vec![(Language::Cpp, 1.0)]),
297            ("hxx", vec![(Language::Cpp, 1.0)]),
298            
299            // C#
300            ("cs", vec![(Language::CSharp, 1.0)]),
301            
302            // Go
303            ("go", vec![(Language::Go, 1.0)]),
304            
305            // Ruby
306            ("rb", vec![(Language::Ruby, 1.0)]),
307            ("rbw", vec![(Language::Ruby, 1.0)]),
308            
309            // PHP
310            ("php", vec![(Language::PHP, 1.0)]),
311            ("phtml", vec![(Language::PHP, 1.0)]),
312            
313            // Swift
314            ("swift", vec![(Language::Swift, 1.0)]),
315            
316            // Dart
317            ("dart", vec![(Language::Dart, 1.0)]),
318            
319            // Shell scripts
320            ("sh", vec![(Language::Bash, 1.0)]),
321            ("bash", vec![(Language::Bash, 1.0)]),
322            ("zsh", vec![(Language::Bash, 1.0)]),
323            ("fish", vec![(Language::Bash, 1.0)]),
324            
325            // Web technologies
326            ("html", vec![(Language::HTML, 1.0)]),
327            ("htm", vec![(Language::HTML, 1.0)]),
328            ("css", vec![(Language::CSS, 1.0)]),
329            ("scss", vec![(Language::SCSS, 1.0)]),
330            ("sass", vec![(Language::SASS, 1.0)]),
331            
332            // Markup and data formats
333            ("md", vec![(Language::Markdown, 1.0)]),
334            ("markdown", vec![(Language::Markdown, 1.0)]),
335            ("xml", vec![(Language::XML, 1.0)]),
336            ("json", vec![(Language::JSON, 1.0)]),
337            ("yaml", vec![(Language::YAML, 1.0)]),
338            ("yml", vec![(Language::YAML, 1.0)]),
339            ("toml", vec![(Language::TOML, 1.0)]),
340            
341            // Configuration
342            ("ini", vec![(Language::Unknown, 1.0)]),
343            ("cfg", vec![(Language::Unknown, 0.8)]),
344            ("conf", vec![(Language::Unknown, 0.7)]),
345            
346            // SQL
347            ("sql", vec![(Language::SQL, 1.0)]),
348            
349            // Documentation
350            ("rst", vec![(Language::Unknown, 1.0)]),
351            ("tex", vec![(Language::Unknown, 1.0)]),
352            
353            // Other languages
354            ("r", vec![(Language::R, 1.0)]),
355            ("R", vec![(Language::R, 1.0)]),
356            ("m", vec![(Language::ObjectiveC, 0.6), (Language::Matlab, 0.4)]),
357            ("mm", vec![(Language::ObjectiveC, 1.0)]),
358            ("pl", vec![(Language::Unknown, 0.8)]),
359            ("pm", vec![(Language::Unknown, 1.0)]),
360            ("lua", vec![(Language::Unknown, 1.0)]),
361            ("vim", vec![(Language::Unknown, 1.0)]),
362            ("hs", vec![(Language::Haskell, 1.0)]),
363            ("lhs", vec![(Language::Haskell, 1.0)]),
364        ];
365
366        for (ext, languages) in extensions {
367            self.extension_map.insert(ext.to_string(), languages);
368        }
369    }
370
371    /// Initialize filename patterns for special files
372    fn initialize_filename_patterns(&mut self) {
373        let patterns = vec![
374            ("Makefile", Language::Unknown),
375            ("makefile", Language::Unknown),
376            ("Dockerfile", Language::Unknown),
377            ("dockerfile", Language::Unknown),
378            ("Cargo.toml", Language::TOML),
379            ("Cargo.lock", Language::TOML),
380            ("package.json", Language::JSON),
381            ("tsconfig.json", Language::JSON),
382            ("pyproject.toml", Language::TOML),
383            ("setup.py", Language::Python),
384            ("requirements.txt", Language::Unknown),
385            ("README", Language::Unknown),
386            ("LICENSE", Language::Unknown),
387            ("CHANGELOG", Language::Unknown),
388            ("CMakeLists.txt", Language::Unknown),
389            (".gitignore", Language::Unknown),
390            (".dockerignore", Language::Unknown),
391            ("Jenkinsfile", Language::Unknown),
392            ("build.gradle", Language::Unknown),
393            ("pom.xml", Language::XML),
394        ];
395
396        for (filename, language) in patterns {
397            self.filename_patterns.insert(filename.to_string(), language);
398        }
399    }
400
401    /// Initialize shebang patterns
402    fn initialize_shebang_patterns(&mut self) {
403        let patterns = vec![
404            ("python", Language::Python),
405            ("python3", Language::Python),
406            ("python2", Language::Python),
407            ("node", Language::JavaScript),
408            ("bash", Language::Bash),
409            ("sh", Language::Bash),
410            ("zsh", Language::Bash),
411            ("fish", Language::Bash),
412            ("ruby", Language::Ruby),
413            ("php", Language::PHP),
414            ("env python", Language::Python),
415            ("env node", Language::JavaScript),
416            ("env bash", Language::Bash),
417            ("env ruby", Language::Ruby),
418        ];
419
420        for (pattern, language) in patterns {
421            self.shebang_patterns.insert(pattern.to_string(), language);
422        }
423    }
424
425    /// Initialize content signatures for language detection
426    fn initialize_content_signatures(&mut self) {
427        // Python signatures
428        let python_sigs = vec![
429            ContentSignature {
430                language: Language::Python,
431                patterns: vec![
432                    r"def\s+\w+\s*\(".to_string(),
433                    r"import\s+\w+".to_string(),
434                    r"from\s+\w+\s+import".to_string(),
435                    r"class\s+\w+\s*\(".to_string(),
436                    r"__\w+__".to_string(),
437                ],
438                weight: 0.9,
439                required_matches: 2,
440            }
441        ];
442        self.content_signatures.insert(Language::Python, python_sigs);
443
444        // JavaScript signatures
445        let js_sigs = vec![
446            ContentSignature {
447                language: Language::JavaScript,
448                patterns: vec![
449                    r"function\s+\w+\s*\(".to_string(),
450                    r"const\s+\w+\s*=".to_string(),
451                    r"let\s+\w+\s*=".to_string(),
452                    r"=>\s*\{".to_string(),
453                    r"require\s*\(".to_string(),
454                    r"console\.log\s*\(".to_string(),
455                ],
456                weight: 0.8,
457                required_matches: 2,
458            }
459        ];
460        self.content_signatures.insert(Language::JavaScript, js_sigs);
461
462        // Rust signatures
463        let rust_sigs = vec![
464            ContentSignature {
465                language: Language::Rust,
466                patterns: vec![
467                    r"fn\s+\w+\s*\(".to_string(),
468                    r"use\s+[\w:]+".to_string(),
469                    r"struct\s+\w+".to_string(),
470                    r"impl\s+[\w<>]+".to_string(),
471                    r"let\s+mut\s+\w+".to_string(),
472                    r"match\s+\w+\s*\{".to_string(),
473                ],
474                weight: 0.95,
475                required_matches: 2,
476            }
477        ];
478        self.content_signatures.insert(Language::Rust, rust_sigs);
479
480        // Add more signatures for other languages...
481    }
482
483    /// Initialize AST parsers for content analysis
484    fn initialize_ast_parsers(&mut self) {
485        for (language, ts_lang_fn) in TS_LANGUAGES.iter() {
486            let mut parser = Parser::new();
487            if parser.set_language(ts_lang_fn()).is_ok() {
488                self.ast_parsers.insert(language.clone(), parser);
489            }
490        }
491    }
492
493    /// Initialize syntax analyzers for AST-based content analysis
494    fn initialize_syntax_analyzers(&mut self) {
495        // Python syntax analyzer
496        let python_analyzer = SyntaxAnalyzer {
497            language: Language::Python,
498            keywords: vec![
499                "def".to_string(), "class".to_string(), "import".to_string(),
500                "from".to_string(), "if".to_string(), "elif".to_string(),
501            ],
502            structural_patterns: vec![
503                "function_definition".to_string(),
504                "class_definition".to_string(),
505                "import_statement".to_string(),
506                "import_from_statement".to_string(),
507            ],
508            confidence_weights: HashMap::from([
509                ("function_definition".to_string(), 0.9),
510                ("class_definition".to_string(), 0.9),
511                ("import_statement".to_string(), 0.8),
512            ]),
513        };
514        self.syntax_analyzers.insert(Language::Python, python_analyzer);
515
516        // JavaScript/TypeScript syntax analyzer
517        let js_analyzer = SyntaxAnalyzer {
518            language: Language::JavaScript,
519            keywords: vec![
520                "function".to_string(), "class".to_string(), "import".to_string(),
521                "const".to_string(), "let".to_string(), "var".to_string(),
522            ],
523            structural_patterns: vec![
524                "function_declaration".to_string(),
525                "class_declaration".to_string(),
526                "import_statement".to_string(),
527                "variable_declaration".to_string(),
528            ],
529            confidence_weights: HashMap::from([
530                ("function_declaration".to_string(), 0.9),
531                ("class_declaration".to_string(), 0.9),
532                ("import_statement".to_string(), 0.8),
533            ]),
534        };
535        self.syntax_analyzers.insert(Language::JavaScript, js_analyzer);
536
537        // Rust syntax analyzer
538        let rust_analyzer = SyntaxAnalyzer {
539            language: Language::Rust,
540            keywords: vec![
541                "fn".to_string(), "struct".to_string(), "enum".to_string(),
542                "impl".to_string(), "use".to_string(), "mod".to_string(),
543            ],
544            structural_patterns: vec![
545                "function_item".to_string(),
546                "struct_item".to_string(),
547                "enum_item".to_string(),
548                "use_declaration".to_string(),
549            ],
550            confidence_weights: HashMap::from([
551                ("function_item".to_string(), 0.9),
552                ("struct_item".to_string(), 0.9),
553                ("use_declaration".to_string(), 0.8),
554            ]),
555        };
556        self.syntax_analyzers.insert(Language::Rust, rust_analyzer);
557    }
558
559    /// Detect language by extension only
560    fn detect_by_extension(&self, path: &Path) -> Language {
561        if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
562            if let Some(languages) = self.extension_map.get(&extension.to_lowercase()) {
563                // Return the language with highest confidence
564                return languages[0].0.clone();
565            }
566        }
567        
568        Language::Unknown
569    }
570
571    /// Detect language by extension and filename patterns
572    fn detect_by_extension_and_filename(&self, path: &Path) -> Language {
573        // Check filename patterns first
574        if let Some(filename) = path.file_name().and_then(|name| name.to_str()) {
575            if let Some(language) = self.filename_patterns.get(filename) {
576                return language.clone();
577            }
578        }
579        
580        // Fall back to extension
581        self.detect_by_extension(path)
582    }
583
584    /// Detect language with content analysis
585    fn detect_with_content_analysis(&mut self, path: &Path, content: &str) -> DetectionResult {
586        let mut candidates = Vec::new();
587        let mut evidence = Vec::new();
588
589        // Start with extension-based detection
590        let extension_lang = self.detect_by_extension_and_filename(path);
591        if extension_lang != Language::Unknown {
592            candidates.push((extension_lang.clone(), 0.7));
593            evidence.push(DetectionEvidence {
594                evidence_type: EvidenceType::Extension,
595                description: format!("File extension suggests: {:?}", extension_lang),
596                weight: 0.7,
597            });
598        }
599
600        // Check shebang
601        if let Some(shebang_lang) = self.detect_by_shebang(content) {
602            candidates.push((shebang_lang.clone(), 0.95));
603            evidence.push(DetectionEvidence {
604                evidence_type: EvidenceType::Shebang,
605                description: format!("Shebang indicates: {:?}", shebang_lang),
606                weight: 0.95,
607            });
608        }
609
610        // Check content signatures
611        let signature_results = self.analyze_content_signatures(content);
612        for (lang, confidence) in signature_results {
613            candidates.push((lang.clone(), confidence));
614            evidence.push(DetectionEvidence {
615                evidence_type: EvidenceType::Syntax,
616                description: format!("Content signatures match: {:?}", lang),
617                weight: confidence,
618            });
619        }
620
621        // Check import patterns
622        let import_results = self.analyze_import_patterns(content);
623        for (lang, confidence) in import_results {
624            candidates.push((lang.clone(), confidence));
625            evidence.push(DetectionEvidence {
626                evidence_type: EvidenceType::Import,
627                description: format!("Import patterns match: {:?}", lang),
628                weight: confidence,
629            });
630        }
631
632        // Aggregate results
633        self.aggregate_detection_results(candidates, evidence)
634    }
635
636    /// Detect language with full statistical analysis
637    fn detect_with_full_analysis(&mut self, path: &Path, content: &str) -> DetectionResult {
638        let mut base_result = self.detect_with_content_analysis(path, content);
639        
640        // Add statistical analysis
641        let statistical_results = self.statistical_analysis(content);
642        for (lang, confidence) in statistical_results {
643            base_result.alternatives.push((lang, confidence));
644        }
645
646        // Sort alternatives by confidence
647        base_result.alternatives.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
648        
649        base_result
650    }
651
652    /// Detect language with custom rules
653    fn detect_with_custom_rules(&mut self, path: &Path, content: &str, rules: &CustomDetectionRules) -> DetectionResult {
654        let mut candidates = Vec::new();
655        let mut evidence = Vec::new();
656
657        // Check custom extension overrides
658        if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
659            if let Some(language) = rules.extension_overrides.get(&extension.to_lowercase()) {
660                candidates.push((language.clone(), 1.0));
661                evidence.push(DetectionEvidence {
662                    evidence_type: EvidenceType::Extension,
663                    description: format!("Custom extension rule: {} -> {:?}", extension, language),
664                    weight: 1.0,
665                });
666            }
667        }
668
669        // Check custom filename patterns
670        if let Some(filename) = path.file_name().and_then(|name| name.to_str()) {
671            if let Some(language) = rules.filename_patterns.get(filename) {
672                candidates.push((language.clone(), 1.0));
673                evidence.push(DetectionEvidence {
674                    evidence_type: EvidenceType::Filename,
675                    description: format!("Custom filename rule: {} -> {:?}", filename, language),
676                    weight: 1.0,
677                });
678            }
679        }
680
681        // Check custom content signatures (using simple string matching instead of regex)
682        for signature in &rules.content_signatures {
683            let matches = signature.patterns.iter()
684                .map(|pattern| {
685                    // Use simple string matching for custom patterns
686                    content.matches(pattern).count()
687                })
688                .sum::<usize>();
689            
690            if matches >= signature.required_matches {
691                candidates.push((signature.language.clone(), signature.weight));
692                evidence.push(DetectionEvidence {
693                    evidence_type: EvidenceType::Syntax,
694                    description: format!("Custom signature matches for {:?}: {}", signature.language, matches),
695                    weight: signature.weight,
696                });
697            }
698        }
699
700        // Fall back to regular detection if no custom rules matched
701        if candidates.is_empty() {
702            return self.detect_with_content_analysis(path, content);
703        }
704
705        self.aggregate_detection_results(candidates, evidence)
706    }
707
708    /// Detect language from shebang line
709    fn detect_by_shebang(&self, content: &str) -> Option<Language> {
710        let lines: Vec<&str> = content.lines().collect();
711        if lines.is_empty() {
712            return None;
713        }
714
715        let first_line = lines[0];
716        if first_line.starts_with("#!") {
717            let shebang_path = &first_line[2..].trim();
718            
719            for (pattern, language) in &self.shebang_patterns {
720                if shebang_path.contains(pattern) {
721                    return Some(language.clone());
722                }
723            }
724        }
725
726        None
727    }
728
729    /// Analyze content signatures
730    fn analyze_content_signatures(&self, content: &str) -> Vec<(Language, f32)> {
731        let mut results = Vec::new();
732
733        for (language, signatures) in &self.content_signatures {
734            for signature in signatures {
735                let matches = signature.patterns.iter()
736                    .map(|pattern| {
737                        // Use regex matching for content signatures
738                        match Regex::new(pattern) {
739                            Ok(regex) => regex.find_iter(content).count(),
740                            Err(_) => {
741                                // Fallback to simple string matching if regex fails
742                                content.matches(pattern).count()
743                            }
744                        }
745                    })
746                    .sum::<usize>();
747
748                if matches >= signature.required_matches {
749                    let confidence = (matches as f32 / signature.patterns.len() as f32) * signature.weight;
750                    results.push((language.clone(), confidence));
751                }
752            }
753        }
754
755        results
756    }
757
758    /// Analyze import patterns using AST parsing
759    fn analyze_import_patterns(&mut self, content: &str) -> Vec<(Language, f32)> {
760        let mut results = Vec::new();
761        let mut trees: Vec<(Language, tree_sitter::Tree)> = Vec::new();
762
763        // First collect all parsed trees
764        for (language, parser) in &mut self.ast_parsers {
765            if let Some(tree) = parser.parse(content, None) {
766                trees.push((language.clone(), tree));
767            }
768        }
769
770        // Then analyze them without borrowing conflicts
771        for (language, tree) in trees {
772            let root_node = tree.root_node();
773            let import_count = self.count_import_nodes(&root_node, &language);
774            
775            if import_count > 0 {
776                // Higher confidence for more import statements
777                let confidence = (import_count as f32 / 10.0).min(0.9);
778                results.push((language, confidence));
779            }
780        }
781
782        results
783    }
784
785    /// Perform AST-based structural analysis of content
786    fn statistical_analysis(&mut self, content: &str) -> Vec<(Language, f32)> {
787        let mut results = Vec::new();
788        let mut analysis_data: Vec<(Language, tree_sitter::Tree, SyntaxAnalyzer)> = Vec::new();
789
790        // First collect all parsed trees and their analyzers
791        for (language, analyzer) in &self.syntax_analyzers {
792            if let Some(parser) = self.ast_parsers.get_mut(language) {
793                if let Some(tree) = parser.parse(content, None) {
794                    analysis_data.push((language.clone(), tree, analyzer.clone()));
795                }
796            }
797        }
798
799        // Then analyze them without borrowing conflicts
800        for (language, tree, analyzer) in analysis_data {
801            let root_node = tree.root_node();
802            let structural_score = self.calculate_structural_score(&root_node, &analyzer);
803            
804            if structural_score > 0.0 {
805                results.push((language, structural_score));
806            }
807        }
808
809        results
810    }
811
812    /// Count import-related AST nodes for a specific language
813    fn count_import_nodes(&self, node: &Node, language: &Language) -> usize {
814        let mut count = 0;
815        let import_types: &[&str] = match language {
816            Language::Python => &["import_statement", "import_from_statement"],
817            Language::JavaScript | Language::TypeScript => &["import_statement", "import_declaration"],
818            Language::Rust => &["use_declaration"],
819            Language::Go => &["import_spec", "import_declaration"],
820            Language::Java => &["import_declaration"],
821            _ => &[],
822        };
823
824        self.count_nodes_recursive(node, import_types, &mut count);
825        count
826    }
827
828    /// Calculate structural score based on AST node patterns
829    fn calculate_structural_score(&self, node: &Node, analyzer: &SyntaxAnalyzer) -> f32 {
830        let mut score = 0.0;
831        
832        for pattern in &analyzer.structural_patterns {
833            let count = self.count_specific_nodes(node, pattern);
834            if count > 0 {
835                let weight = analyzer.confidence_weights.get(pattern).unwrap_or(&0.5);
836                score += (count as f32) * weight;
837            }
838        }
839        
840        // Normalize score to [0, 1] range
841        (score / 10.0).min(1.0)
842    }
843
844    /// Recursively count nodes of specific types
845    fn count_nodes_recursive(&self, node: &Node, target_types: &[&str], count: &mut usize) {
846        if target_types.contains(&node.kind()) {
847            *count += 1;
848        }
849        
850        for i in 0..node.child_count() {
851            if let Some(child) = node.child(i) {
852                self.count_nodes_recursive(&child, target_types, count);
853            }
854        }
855    }
856
857    /// Count specific node types in AST
858    fn count_specific_nodes(&self, node: &Node, target_type: &str) -> usize {
859        let mut count = 0;
860        self.count_nodes_recursive(node, &[target_type], &mut count);
861        count
862    }
863
864    /// Aggregate detection results from multiple sources
865    fn aggregate_detection_results(&self, candidates: Vec<(Language, f32)>, evidence: Vec<DetectionEvidence>) -> DetectionResult {
866        if candidates.is_empty() {
867            return DetectionResult {
868                language: Language::Unknown,
869                confidence: 0.0,
870                detection_method: DetectionMethod::FileExtension,
871                alternatives: vec![],
872                evidence,
873            };
874        }
875
876        // Group by language and sum confidence scores
877        let mut language_scores: HashMap<Language, f32> = HashMap::new();
878        let mut methods_used: Vec<DetectionMethod> = Vec::new();
879
880        for (lang, confidence) in &candidates {
881            *language_scores.entry(lang.clone()).or_insert(0.0) += confidence;
882        }
883
884        // Find the language with highest aggregated confidence
885        let (best_language, best_confidence) = language_scores.iter()
886            .max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
887            .map(|(lang, conf)| (lang.clone(), *conf))
888            .unwrap_or((Language::Unknown, 0.0));
889
890        // Normalize confidence to [0, 1] range
891        let normalized_confidence = best_confidence.min(1.0);
892
893        // Create alternatives list
894        let mut alternatives: Vec<(Language, f32)> = language_scores
895            .into_iter()
896            .filter(|(lang, _)| *lang != best_language)
897            .collect();
898        alternatives.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
899
900        // Determine primary detection method
901        let detection_method = if evidence.iter().any(|e| e.evidence_type == EvidenceType::Shebang) {
902            DetectionMethod::Shebang
903        } else if evidence.iter().any(|e| e.evidence_type == EvidenceType::Syntax) {
904            DetectionMethod::ContentSignature
905        } else if evidence.iter().any(|e| e.evidence_type == EvidenceType::Extension) {
906            DetectionMethod::FileExtension
907        } else {
908            DetectionMethod::Hybrid
909        };
910
911        DetectionResult {
912            language: best_language,
913            confidence: normalized_confidence,
914            detection_method,
915            alternatives,
916            evidence,
917        }
918    }
919
920    /// Apply project type bias to detection results
921    fn apply_project_type_bias(&self, mut result: DetectionResult, project_type: &ProjectType) -> DetectionResult {
922        let bias_factor = 0.25;
923        
924        match project_type {
925            ProjectType::WebFrontend => {
926                if matches!(result.language, Language::JavaScript | Language::TypeScript | Language::HTML | Language::CSS) {
927                    result.confidence += bias_factor;
928                }
929            }
930            ProjectType::WebBackend => {
931                if matches!(result.language, Language::Python | Language::JavaScript | Language::TypeScript | Language::Java | Language::Go | Language::Rust) {
932                    result.confidence += bias_factor;
933                }
934            }
935            ProjectType::SystemsProgram => {
936                if matches!(result.language, Language::Rust | Language::C | Language::Cpp | Language::Go) {
937                    result.confidence += bias_factor;
938                }
939            }
940            ProjectType::DataScience => {
941                if matches!(result.language, Language::Python | Language::R | Language::SQL) {
942                    result.confidence += bias_factor;
943                }
944            }
945            _ => {}
946        }
947        
948        result.confidence = result.confidence.min(1.0);
949        result
950    }
951
952    /// Apply dominant language bias
953    fn apply_dominant_language_bias(&self, mut result: DetectionResult, dominant_languages: &[Language]) -> DetectionResult {
954        if dominant_languages.contains(&result.language) {
955            result.confidence += 0.15;
956            result.confidence = result.confidence.min(1.0);
957        }
958        result
959    }
960
961    /// Apply framework bias based on indicators
962    fn apply_framework_bias(&self, mut result: DetectionResult, framework_indicators: &[String]) -> DetectionResult {
963        // This would contain logic to bias detection based on framework files
964        // For example, presence of package.json suggests JavaScript/TypeScript
965        for indicator in framework_indicators {
966            match indicator.as_str() {
967                "package.json" | "node_modules" => {
968                    if matches!(result.language, Language::JavaScript | Language::TypeScript) {
969                        result.confidence += 0.1;
970                    }
971                }
972                "Cargo.toml" | "Cargo.lock" => {
973                    if result.language == Language::Rust {
974                        result.confidence += 0.1;
975                    }
976                }
977                "requirements.txt" | "__pycache__" | ".pyc" => {
978                    if result.language == Language::Python {
979                        result.confidence += 0.1;
980                    }
981                }
982                _ => {}
983            }
984        }
985        
986        result.confidence = result.confidence.min(1.0);
987        result
988    }
989}
990
991impl Default for LanguageDetector {
992    fn default() -> Self {
993        Self::new()
994    }
995}
996
#[cfg(test)]
mod tests {
    use super::*;

    /// Common source-file extensions map to their languages.
    #[test]
    fn test_extension_detection() {
        let mut detector = LanguageDetector::new();

        assert_eq!(detector.detect_language(Path::new("test.rs")), Language::Rust);
        assert_eq!(detector.detect_language(Path::new("test.py")), Language::Python);
        assert_eq!(detector.detect_language(Path::new("test.js")), Language::JavaScript);
        assert_eq!(detector.detect_language(Path::new("test.ts")), Language::TypeScript);
        assert_eq!(detector.detect_language(Path::new("test.java")), Language::Java);
        assert_eq!(detector.detect_language(Path::new("test.go")), Language::Go);
        assert_eq!(detector.detect_language(Path::new("test.cpp")), Language::Cpp);
        assert_eq!(detector.detect_language(Path::new("test.c")), Language::C);
    }

    /// Well-known file names: extension-less build files are reported as
    /// `Unknown`, while manifest files resolve through their extension.
    #[test]
    fn test_filename_patterns() {
        let mut detector = LanguageDetector::new();

        assert_eq!(detector.detect_language(Path::new("Makefile")), Language::Unknown);
        assert_eq!(detector.detect_language(Path::new("Dockerfile")), Language::Unknown);
        assert_eq!(detector.detect_language(Path::new("Cargo.toml")), Language::TOML);
        assert_eq!(detector.detect_language(Path::new("package.json")), Language::JSON);
    }

    /// A shebang line identifies the language of an extension-less script.
    #[test]
    fn test_shebang_detection() {
        let mut detector = LanguageDetector::new();

        let python_script = "#!/usr/bin/env python3\nprint('Hello, world!')";
        let result = detector.detect_language_with_content(Path::new("script"), python_script);
        assert_eq!(result.language, Language::Python);
        assert!(result.confidence > 0.9);
        assert_eq!(result.detection_method, DetectionMethod::Shebang);

        let bash_script = "#!/bin/bash\necho 'Hello, world!'";
        let result = detector.detect_language_with_content(Path::new("script"), bash_script);
        assert_eq!(result.language, Language::Bash);
        assert!(result.confidence > 0.9);
    }

    /// Content alone (no extension, no shebang) is enough to recognise
    /// typical Python and Rust code.
    #[test]
    fn test_content_signature_detection() {
        let mut detector = LanguageDetector::new();

        let python_code = r#"
def hello_world():
    print("Hello, world!")
    
class MyClass:
    def __init__(self):
        pass
        
import sys
from collections import defaultdict
        "#;

        let result = detector.detect_language_with_content(Path::new("unknown"), python_code);
        assert_eq!(result.language, Language::Python);
        assert!(result.confidence > 0.5);

        let rust_code = r#"
fn main() {
    println!("Hello, world!");
}

struct MyStruct {
    field: i32,
}

impl MyStruct {
    fn new() -> Self {
        MyStruct { field: 0 }
    }
}

use std::collections::HashMap;
        "#;

        let result = detector.detect_language_with_content(Path::new("unknown"), rust_code);
        assert_eq!(result.language, Language::Rust);
        assert!(result.confidence > 0.5);
    }

    /// Import statements alone identify JavaScript and Python.
    #[test]
    fn test_import_pattern_detection() {
        let mut detector = LanguageDetector::new();

        let js_code = r#"
import React from 'react';
import { useState } from 'react';
const fs = require('fs');
        "#;

        let result = detector.detect_language_with_content(Path::new("unknown"), js_code);
        assert_eq!(result.language, Language::JavaScript);

        let python_code = r#"
import os
import sys
from collections import defaultdict, Counter
        "#;

        let result = detector.detect_language_with_content(Path::new("unknown"), python_code);
        assert_eq!(result.language, Language::Python);
    }

    /// Extension and content evidence are combined; when they disagree either
    /// answer is accepted.
    #[test]
    fn test_hybrid_detection() {
        let mut detector = LanguageDetector::new();

        // File with .py extension and Python content should have high confidence
        let python_code = "def hello():\n    import sys\n    print('Hello')";
        let result = detector.detect_language_with_content(Path::new("test.py"), python_code);
        assert_eq!(result.language, Language::Python);
        assert!(result.confidence > 0.6); // More realistic threshold
        assert!(result.evidence.len() > 1);

        // File with conflicting extension and content
        let python_code = "def hello(): print('Hello')";
        let result = detector.detect_language_with_content(Path::new("test.js"), python_code);
        // Content analysis should work but may be overridden by strong extension match
        // This test may need adjustment based on detection strategy
        assert!(result.language == Language::Python || result.language == Language::JavaScript);
    }

    /// Project hints are accepted and leave a confident JavaScript result.
    #[test]
    fn test_detection_with_hints() {
        let mut detector = LanguageDetector::new();

        let hints = LanguageHints {
            project_type: Some(ProjectType::WebFrontend),
            dominant_languages: vec![Language::TypeScript],
            framework_indicators: vec!["package.json".to_string()],
            ..Default::default()
        };

        let ts_code = "const hello = () => console.log('Hello');";
        let result = detector.detect_with_hints(Path::new("unknown"), ts_code, &hints);

        // Should have higher confidence due to hints
        assert_eq!(result.language, Language::JavaScript); // or TypeScript depending on detection
        assert!(result.confidence > 0.5);
    }

    /// A custom extension override decides the language with full confidence.
    #[test]
    fn test_custom_detection_rules() {
        let mut custom_rules = CustomDetectionRules {
            extension_overrides: HashMap::new(),
            filename_patterns: HashMap::new(),
            content_signatures: vec![],
            priority_languages: vec![],
        };

        // Add custom extension rule
        custom_rules.extension_overrides.insert("myext".to_string(), Language::Rust);

        let mut detector = LanguageDetector::with_strategy(DetectionStrategy::Custom(custom_rules));

        let result = detector.detect_language_with_content(Path::new("test.myext"), "some content");
        assert_eq!(result.language, Language::Rust);
        assert_eq!(result.confidence, 1.0);
    }

    /// Both the shebang and the extension are recorded as evidence.
    #[test]
    fn test_detection_evidence() {
        let mut detector = LanguageDetector::new();

        let python_code = "#!/usr/bin/env python\ndef hello(): print('Hello')";
        let result = detector.detect_language_with_content(Path::new("test.py"), python_code);

        // Should have multiple pieces of evidence
        assert!(result.evidence.len() >= 2);
        assert!(result.evidence.iter().any(|e| e.evidence_type == EvidenceType::Shebang));
        assert!(result.evidence.iter().any(|e| e.evidence_type == EvidenceType::Extension));
    }

    /// Strong, agreeing indicators score above 0.8; an extension with
    /// uninformative content scores below it.
    #[test]
    fn test_confidence_scoring() {
        let mut detector = LanguageDetector::new();

        // Strong Python indicators should have high confidence
        let strong_python = "#!/usr/bin/env python3\nimport os\ndef main(): pass\nclass Test: pass";
        let result = detector.detect_language_with_content(Path::new("test.py"), strong_python);
        assert!(result.confidence > 0.8);

        // Weak indicators should have lower confidence
        let weak_indicators = "hello world";
        let result = detector.detect_language_with_content(Path::new("test.py"), weak_indicators);
        assert!(result.confidence < 0.8);
    }

    /// Alternatives are ordered by non-increasing confidence.
    #[test]
    fn test_alternatives_ranking() {
        let mut detector = LanguageDetector::new();

        let ambiguous_code = "print hello"; // Could be Python or other languages
        let result = detector.detect_language_with_content(Path::new("unknown"), ambiguous_code);

        // Check every adjacent pair, not just the first two; lists with fewer
        // than two entries pass trivially.
        assert!(result.alternatives.windows(2).all(|pair| pair[0].1 >= pair[1].1));
    }
}