subx_cli/core/
language.rs

1//! Language detection module.
2//!
3//! Provides utilities to detect language codes from file paths and names,
4//! using directory names, filename patterns, and file extensions.
5//!
6//! # Examples
7//!
8//! ```rust
9//! use subx_cli::core::language::LanguageDetector;
10//! use std::path::Path;
11//!
12//! let detector = LanguageDetector::new();
13//! let code = detector.get_primary_language(Path::new("subtitle.sc.srt")).unwrap();
14//! assert_eq!(code, "sc");
15//! ```
16use regex::Regex;
17use std::collections::HashMap;
18use std::path::Path;
19
/// Source of detected language information.
///
/// Identifies which part of a path a [`LanguageInfo`] was derived from.
/// The type is a plain fieldless enum, so it derives `Eq` and `Hash` in
/// addition to `PartialEq`; this allows it to be used as a key in hashed
/// collections and in generic code that requires full equality.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LanguageSource {
    /// Derived from a parent directory name.
    Directory,
    /// Derived from the file name pattern.
    Filename,
    /// Derived from the file extension or naming convention.
    Extension,
}
30impl Default for LanguageDetector {
31    fn default() -> Self {
32        Self::new()
33    }
34}
35
36/// Detected language information, including code, source, and confidence.
37#[derive(Debug, Clone)]
38pub struct LanguageInfo {
39    /// Standardized language code (e.g., "tc", "sc", "en").
40    pub code: String,
41    /// Origin of the language detection result.
42    pub source: LanguageSource,
43    /// Confidence score of the detection (0.0 to 1.0).
44    pub confidence: f32,
45}
46
47/// Detector for identifying language codes from filesystem paths.
48pub struct LanguageDetector {
49    language_codes: HashMap<String, String>,
50    directory_patterns: Vec<String>,
51    filename_patterns: Vec<Regex>,
52}
53
54impl LanguageDetector {
55    /// Create a new `LanguageDetector` with default language mappings and patterns.
56    ///
57    /// Initializes internal dictionaries and regex patterns for detection.
58    pub fn new() -> Self {
59        let mut language_codes = HashMap::new();
60        // Traditional Chinese
61        language_codes.insert("tc".to_string(), "tc".to_string());
62        language_codes.insert("繁中".to_string(), "tc".to_string());
63        language_codes.insert("繁體".to_string(), "tc".to_string());
64        language_codes.insert("cht".to_string(), "tc".to_string());
65        // Simplified Chinese
66        language_codes.insert("sc".to_string(), "sc".to_string());
67        language_codes.insert("簡中".to_string(), "sc".to_string());
68        language_codes.insert("簡體".to_string(), "sc".to_string());
69        language_codes.insert("chs".to_string(), "sc".to_string());
70        // English
71        language_codes.insert("en".to_string(), "en".to_string());
72        language_codes.insert("英文".to_string(), "en".to_string());
73        language_codes.insert("english".to_string(), "en".to_string());
74        // Additional languages (e.g., Japanese, Korean) can be added as needed.
75
76        let filename_patterns = vec![
77            Regex::new(r"\.([a-z]{2,3})\.").unwrap(), // .tc., .sc., .en.
78            Regex::new(r"_([a-z]{2,3})\.").unwrap(),  // _tc., _sc., _en.
79            Regex::new(r"-([a-z]{2,3})\.").unwrap(),  // -tc., -sc., -en.
80        ];
81
82        Self {
83            language_codes,
84            directory_patterns: vec!["tc".to_string(), "sc".to_string(), "en".to_string()],
85            filename_patterns,
86        }
87    }
88    /// Detect a single language information from the given path.
89    ///
90    /// # Behavior
91    ///
92    /// Attempts detection by directory name first, then by filename pattern.
93    pub fn detect_from_path(&self, path: &Path) -> Option<LanguageInfo> {
94        if let Some(lang) = self.detect_from_directory(path) {
95            return Some(lang);
96        }
97        if let Some(lang) = self.detect_from_filename(path) {
98            return Some(lang);
99        }
100        None
101    }
102
103    /// Return the primary detected language code for the provided path.
104    ///
105    /// # Returns
106    ///
107    /// `Some(code)` if detected, otherwise `None`.
108    pub fn get_primary_language(&self, path: &Path) -> Option<String> {
109        self.detect_all_languages(path)
110            .into_iter()
111            .next()
112            .map(|lang| lang.code)
113    }
114
115    /// Collect all potential language detections from the path.
116    ///
117    /// Sorts results by confidence and removes duplicates by code.
118    pub fn detect_all_languages(&self, path: &Path) -> Vec<LanguageInfo> {
119        let mut langs = Vec::new();
120        if let Some(dir_lang) = self.detect_from_directory(path) {
121            langs.push(dir_lang);
122        }
123        if let Some(file_lang) = self.detect_from_filename(path) {
124            langs.push(file_lang);
125        }
126        langs.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
127        langs.dedup_by(|a, b| a.code == b.code);
128        langs
129    }
130
131    fn detect_from_directory(&self, path: &Path) -> Option<LanguageInfo> {
132        for comp in path.components() {
133            if let Some(s) = comp.as_os_str().to_str() {
134                let key = s.to_lowercase();
135                if let Some(code) = self.language_codes.get(&key) {
136                    return Some(LanguageInfo {
137                        code: code.clone(),
138                        source: LanguageSource::Directory,
139                        confidence: 0.9,
140                    });
141                }
142            }
143        }
144        None
145    }
146
147    fn detect_from_filename(&self, path: &Path) -> Option<LanguageInfo> {
148        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
149            for re in &self.filename_patterns {
150                if let Some(cap) = re.captures(name) {
151                    if let Some(m) = cap.get(1) {
152                        if let Some(code) = self.language_codes.get(m.as_str()) {
153                            return Some(LanguageInfo {
154                                code: code.clone(),
155                                source: LanguageSource::Filename,
156                                confidence: 0.8,
157                            });
158                        }
159                    }
160                }
161            }
162        }
163        None
164    }
165}
166
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Build a fresh detector and return the primary language code for `raw`.
    fn primary(raw: &str) -> Option<String> {
        LanguageDetector::new().get_primary_language(Path::new(raw))
    }

    /// A directory component naming a language determines the result.
    #[test]
    fn test_directory_language_detection() {
        assert_eq!(primary("tc/subtitle.srt").as_deref(), Some("tc"));
    }

    /// A dot-delimited language token in the file name is recognized.
    #[test]
    fn test_filename_language_detection() {
        assert_eq!(primary("subtitle.sc.ass").as_deref(), Some("sc"));
    }

    /// A path without any language hint yields no detection.
    #[test]
    fn test_no_language_detection() {
        assert_eq!(primary("subtitle.ass"), None);
    }
}