subx_cli/core/
language.rs

1//! Language detection module.
2//!
3//! Provides utilities to detect language codes from file paths and names,
4//! using directory names, filename patterns, and file extensions.
5//!
6//! # Examples
7//!
8//! ```rust
9//! use subx_cli::core::language::LanguageDetector;
10//! use std::path::Path;
11//!
12//! let detector = LanguageDetector::new();
13//! let code = detector.get_primary_language(Path::new("subtitle.sc.srt")).unwrap();
14//! assert_eq!(code, "sc");
15//! ```
16use regex::Regex;
17use std::collections::HashMap;
18use std::path::Path;
19
/// Source of detected language information.
///
/// The enum is fieldless, so besides `PartialEq` it also implements `Eq` and
/// `Hash`, which allows it to be used as a `HashSet` member or `HashMap` key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LanguageSource {
    /// Derived from a parent directory name.
    Directory,
    /// Derived from the file name pattern.
    Filename,
    /// Derived from the file extension or naming convention.
    Extension,
}
30impl Default for LanguageDetector {
31    fn default() -> Self {
32        Self::new()
33    }
34}
35
36/// Detected language information, including code, source, and confidence.
37#[derive(Debug, Clone)]
38pub struct LanguageInfo {
39    /// Standardized language code (e.g., "tc", "sc", "en").
40    pub code: String,
41    /// Origin of the language detection result.
42    pub source: LanguageSource,
43    /// Confidence score of the detection (0.0 to 1.0).
44    pub confidence: f32,
45}
46
47/// Detector for identifying language codes from filesystem paths.
48pub struct LanguageDetector {
49    language_codes: HashMap<String, String>,
50    directory_patterns: Vec<String>,
51    filename_patterns: Vec<Regex>,
52}
53
54impl LanguageDetector {
55    /// Create a new `LanguageDetector` with default language mappings and patterns.
56    ///
57    /// Initializes internal dictionaries and regex patterns for detection.
58    pub fn new() -> Self {
59        //! Do not translate these language codes to English rustdoc!!!
60        let mut language_codes = HashMap::new();
61        // Traditional Chinese
62        language_codes.insert("tc".to_string(), "tc".to_string());
63        language_codes.insert("繁中".to_string(), "tc".to_string()); // Traditional Chinese (zh-Hant)
64        language_codes.insert("繁體".to_string(), "tc".to_string()); // Traditional Chinese (zh-Hant)
65        language_codes.insert("cht".to_string(), "tc".to_string());
66        // Simplified Chinese
67        language_codes.insert("sc".to_string(), "sc".to_string());
68        language_codes.insert("简中".to_string(), "sc".to_string()); // Simplified Chinese (zh-Hans)
69        language_codes.insert("简体".to_string(), "sc".to_string()); // Simplified Chinese (zh-Hans)
70        language_codes.insert("chs".to_string(), "sc".to_string());
71        // English
72        language_codes.insert("en".to_string(), "en".to_string());
73        language_codes.insert("英文".to_string(), "en".to_string()); // English
74        language_codes.insert("english".to_string(), "en".to_string());
75        // Additional languages (e.g., Japanese, Korean) can be added as needed.
76
77        let filename_patterns = vec![
78            Regex::new(r"\.([a-z]{2,3})\.").unwrap(), // .tc., .sc., .en.
79            Regex::new(r"_([a-z]{2,3})\.").unwrap(),  // _tc., _sc., _en.
80            Regex::new(r"-([a-z]{2,3})\.").unwrap(),  // -tc., -sc., -en.
81        ];
82
83        Self {
84            language_codes,
85            directory_patterns: vec!["tc".to_string(), "sc".to_string(), "en".to_string()],
86            filename_patterns,
87        }
88    }
89    /// Detect a single language information from the given path.
90    ///
91    /// # Behavior
92    ///
93    /// Attempts detection by directory name first, then by filename pattern.
94    pub fn detect_from_path(&self, path: &Path) -> Option<LanguageInfo> {
95        if let Some(lang) = self.detect_from_directory(path) {
96            return Some(lang);
97        }
98        if let Some(lang) = self.detect_from_filename(path) {
99            return Some(lang);
100        }
101        None
102    }
103
104    /// Return the primary detected language code for the provided path.
105    ///
106    /// # Returns
107    ///
108    /// `Some(code)` if detected, otherwise `None`.
109    pub fn get_primary_language(&self, path: &Path) -> Option<String> {
110        self.detect_all_languages(path)
111            .into_iter()
112            .next()
113            .map(|lang| lang.code)
114    }
115
116    /// Collect all potential language detections from the path.
117    ///
118    /// Sorts results by confidence and removes duplicates by code.
119    pub fn detect_all_languages(&self, path: &Path) -> Vec<LanguageInfo> {
120        let mut langs = Vec::new();
121        if let Some(dir_lang) = self.detect_from_directory(path) {
122            langs.push(dir_lang);
123        }
124        if let Some(file_lang) = self.detect_from_filename(path) {
125            langs.push(file_lang);
126        }
127        langs.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
128        langs.dedup_by(|a, b| a.code == b.code);
129        langs
130    }
131
132    fn detect_from_directory(&self, path: &Path) -> Option<LanguageInfo> {
133        for comp in path.components() {
134            if let Some(s) = comp.as_os_str().to_str() {
135                let key = s.to_lowercase();
136                if let Some(code) = self.language_codes.get(&key) {
137                    return Some(LanguageInfo {
138                        code: code.clone(),
139                        source: LanguageSource::Directory,
140                        confidence: 0.9,
141                    });
142                }
143            }
144        }
145        None
146    }
147
148    fn detect_from_filename(&self, path: &Path) -> Option<LanguageInfo> {
149        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
150            for re in &self.filename_patterns {
151                if let Some(cap) = re.captures(name) {
152                    if let Some(m) = cap.get(1) {
153                        if let Some(code) = self.language_codes.get(m.as_str()) {
154                            return Some(LanguageInfo {
155                                code: code.clone(),
156                                source: LanguageSource::Filename,
157                                confidence: 0.8,
158                            });
159                        }
160                    }
161                }
162            }
163        }
164        None
165    }
166}
167
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Run primary-language detection on `path` with a freshly built detector.
    fn primary(path: &str) -> Option<String> {
        LanguageDetector::new().get_primary_language(Path::new(path))
    }

    /// A language-named directory component determines the language.
    #[test]
    fn test_directory_language_detection() {
        assert_eq!(primary("tc/subtitle.srt").as_deref(), Some("tc"));
    }

    /// A `.sc.` token inside the file name determines the language.
    #[test]
    fn test_filename_language_detection() {
        assert_eq!(primary("subtitle.sc.ass").as_deref(), Some("sc"));
    }

    /// A path without any language hint yields no detection.
    #[test]
    fn test_no_language_detection() {
        assert_eq!(primary("subtitle.ass"), None);
    }
}