subx_cli/core/
language.rs

//! Language code detection module.
2use regex::Regex;
3use std::collections::HashMap;
4use std::path::Path;
5
/// Where a piece of language information was found.
///
/// The enum is fieldless, so it derives `Eq` in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LanguageSource {
    /// Taken from a directory name in the path.
    Directory,
    /// Taken from the file name.
    Filename,
    /// Taken from a pattern in front of the file extension.
    Extension,
}
16impl Default for LanguageDetector {
17    fn default() -> Self {
18        Self::new()
19    }
20}
21
22/// 語言識別結果
23#[derive(Debug, Clone)]
24pub struct LanguageInfo {
25    /// 標準化語言編碼,如 tc、sc、en
26    pub code: String,
27    /// 資訊來源
28    pub source: LanguageSource,
29    /// 識別信心度
30    pub confidence: f32,
31}
32
33/// 語言編碼偵測器
34pub struct LanguageDetector {
35    language_codes: HashMap<String, String>,
36    directory_patterns: Vec<String>,
37    filename_patterns: Vec<Regex>,
38}
39
40impl LanguageDetector {
41    /// 建立新的偵測器,初始化語言字典和模式
42    pub fn new() -> Self {
43        let mut language_codes = HashMap::new();
44        // 繁體
45        language_codes.insert("tc".to_string(), "tc".to_string());
46        language_codes.insert("繁中".to_string(), "tc".to_string());
47        language_codes.insert("繁體".to_string(), "tc".to_string());
48        language_codes.insert("cht".to_string(), "tc".to_string());
49        // 簡體
50        language_codes.insert("sc".to_string(), "sc".to_string());
51        language_codes.insert("簡中".to_string(), "sc".to_string());
52        language_codes.insert("簡體".to_string(), "sc".to_string());
53        language_codes.insert("chs".to_string(), "sc".to_string());
54        // 英文
55        language_codes.insert("en".to_string(), "en".to_string());
56        language_codes.insert("英文".to_string(), "en".to_string());
57        language_codes.insert("english".to_string(), "en".to_string());
58        // 日文、韓文等可按需擴充
59
60        let filename_patterns = vec![
61            Regex::new(r"\.([a-z]{2,3})\.").unwrap(), // .tc., .sc., .en.
62            Regex::new(r"_([a-z]{2,3})\.").unwrap(),  // _tc., _sc., _en.
63            Regex::new(r"-([a-z]{2,3})\.").unwrap(),  // -tc., -sc., -en.
64        ];
65
66        Self {
67            language_codes,
68            directory_patterns: vec!["tc".to_string(), "sc".to_string(), "en".to_string()],
69            filename_patterns,
70        }
71    }
72    /// 偵測路徑中的單一語言資訊,目錄優先,再檔名
73    pub fn detect_from_path(&self, path: &Path) -> Option<LanguageInfo> {
74        if let Some(lang) = self.detect_from_directory(path) {
75            return Some(lang);
76        }
77        if let Some(lang) = self.detect_from_filename(path) {
78            return Some(lang);
79        }
80        None
81    }
82
83    /// 偵測路徑中主要的語言編碼
84    pub fn get_primary_language(&self, path: &Path) -> Option<String> {
85        self.detect_all_languages(path)
86            .into_iter()
87            .next()
88            .map(|lang| lang.code)
89    }
90
91    /// 收集所有可能的語言資訊,並依信心度排序去重
92    pub fn detect_all_languages(&self, path: &Path) -> Vec<LanguageInfo> {
93        let mut langs = Vec::new();
94        if let Some(dir_lang) = self.detect_from_directory(path) {
95            langs.push(dir_lang);
96        }
97        if let Some(file_lang) = self.detect_from_filename(path) {
98            langs.push(file_lang);
99        }
100        langs.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
101        langs.dedup_by(|a, b| a.code == b.code);
102        langs
103    }
104
105    fn detect_from_directory(&self, path: &Path) -> Option<LanguageInfo> {
106        for comp in path.components() {
107            if let Some(s) = comp.as_os_str().to_str() {
108                let key = s.to_lowercase();
109                if let Some(code) = self.language_codes.get(&key) {
110                    return Some(LanguageInfo {
111                        code: code.clone(),
112                        source: LanguageSource::Directory,
113                        confidence: 0.9,
114                    });
115                }
116            }
117        }
118        None
119    }
120
121    fn detect_from_filename(&self, path: &Path) -> Option<LanguageInfo> {
122        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
123            for re in &self.filename_patterns {
124                if let Some(cap) = re.captures(name) {
125                    if let Some(m) = cap.get(1) {
126                        if let Some(code) = self.language_codes.get(m.as_str()) {
127                            return Some(LanguageInfo {
128                                code: code.clone(),
129                                source: LanguageSource::Filename,
130                                confidence: 0.8,
131                            });
132                        }
133                    }
134                }
135            }
136        }
137        None
138    }
139}
140
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Runs primary-language detection on `raw` with a freshly built detector.
    fn primary(raw: &str) -> Option<String> {
        let detector = LanguageDetector::new();
        detector.get_primary_language(Path::new(raw))
    }

    /// A directory named after a language code determines the language.
    #[test]
    fn test_directory_language_detection() {
        assert_eq!(primary("tc/subtitle.srt").as_deref(), Some("tc"));
    }

    /// A `.sc.` tag in front of the extension determines the language.
    #[test]
    fn test_filename_language_detection() {
        assert_eq!(primary("subtitle.sc.ass").as_deref(), Some("sc"));
    }

    /// A path without any language marker yields no detection.
    #[test]
    fn test_no_language_detection() {
        assert!(primary("subtitle.ass").is_none());
    }
}