subx_cli/core/
language.rs1use regex::Regex;
3use std::collections::HashMap;
4use std::path::Path;
5
/// Where a detected language code was found.
///
/// The enum is fieldless, so it derives `Eq` alongside `PartialEq`; that lets
/// it be compared exhaustively and used in types that require full equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LanguageSource {
    /// The code matched a component of the path.
    Directory,
    /// The code matched a delimited token inside the file name.
    Filename,
    /// The code came from the file extension.
    // NOTE(review): nothing visible in this file constructs this variant —
    // confirm whether another module produces it.
    Extension,
}
16impl Default for LanguageDetector {
17 fn default() -> Self {
18 Self::new()
19 }
20}
21
22#[derive(Debug, Clone)]
24pub struct LanguageInfo {
25 pub code: String,
27 pub source: LanguageSource,
29 pub confidence: f32,
31}
32
33pub struct LanguageDetector {
35 language_codes: HashMap<String, String>,
36 directory_patterns: Vec<String>,
37 filename_patterns: Vec<Regex>,
38}
39
40impl LanguageDetector {
41 pub fn new() -> Self {
43 let mut language_codes = HashMap::new();
44 language_codes.insert("tc".to_string(), "tc".to_string());
46 language_codes.insert("繁中".to_string(), "tc".to_string());
47 language_codes.insert("繁體".to_string(), "tc".to_string());
48 language_codes.insert("cht".to_string(), "tc".to_string());
49 language_codes.insert("sc".to_string(), "sc".to_string());
51 language_codes.insert("簡中".to_string(), "sc".to_string());
52 language_codes.insert("簡體".to_string(), "sc".to_string());
53 language_codes.insert("chs".to_string(), "sc".to_string());
54 language_codes.insert("en".to_string(), "en".to_string());
56 language_codes.insert("英文".to_string(), "en".to_string());
57 language_codes.insert("english".to_string(), "en".to_string());
58 let filename_patterns = vec![
61 Regex::new(r"\.([a-z]{2,3})\.").unwrap(), Regex::new(r"_([a-z]{2,3})\.").unwrap(), Regex::new(r"-([a-z]{2,3})\.").unwrap(), ];
65
66 Self {
67 language_codes,
68 directory_patterns: vec!["tc".to_string(), "sc".to_string(), "en".to_string()],
69 filename_patterns,
70 }
71 }
72 pub fn detect_from_path(&self, path: &Path) -> Option<LanguageInfo> {
74 if let Some(lang) = self.detect_from_directory(path) {
75 return Some(lang);
76 }
77 if let Some(lang) = self.detect_from_filename(path) {
78 return Some(lang);
79 }
80 None
81 }
82
83 pub fn get_primary_language(&self, path: &Path) -> Option<String> {
85 self.detect_all_languages(path)
86 .into_iter()
87 .next()
88 .map(|lang| lang.code)
89 }
90
91 pub fn detect_all_languages(&self, path: &Path) -> Vec<LanguageInfo> {
93 let mut langs = Vec::new();
94 if let Some(dir_lang) = self.detect_from_directory(path) {
95 langs.push(dir_lang);
96 }
97 if let Some(file_lang) = self.detect_from_filename(path) {
98 langs.push(file_lang);
99 }
100 langs.sort_by(|a, b| b.confidence.partial_cmp(&a.confidence).unwrap());
101 langs.dedup_by(|a, b| a.code == b.code);
102 langs
103 }
104
105 fn detect_from_directory(&self, path: &Path) -> Option<LanguageInfo> {
106 for comp in path.components() {
107 if let Some(s) = comp.as_os_str().to_str() {
108 let key = s.to_lowercase();
109 if let Some(code) = self.language_codes.get(&key) {
110 return Some(LanguageInfo {
111 code: code.clone(),
112 source: LanguageSource::Directory,
113 confidence: 0.9,
114 });
115 }
116 }
117 }
118 None
119 }
120
121 fn detect_from_filename(&self, path: &Path) -> Option<LanguageInfo> {
122 if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
123 for re in &self.filename_patterns {
124 if let Some(cap) = re.captures(name) {
125 if let Some(m) = cap.get(1) {
126 if let Some(code) = self.language_codes.get(m.as_str()) {
127 return Some(LanguageInfo {
128 code: code.clone(),
129 source: LanguageSource::Filename,
130 confidence: 0.8,
131 });
132 }
133 }
134 }
135 }
136 }
137 None
138 }
139}
140
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Runs a fresh detector over `raw` and returns its primary language code.
    fn primary(raw: &str) -> Option<String> {
        LanguageDetector::new().get_primary_language(Path::new(raw))
    }

    /// A known alias used as a directory name is reported as the language.
    #[test]
    fn test_directory_language_detection() {
        assert_eq!(primary("tc/subtitle.srt"), Some("tc".to_string()));
    }

    /// A known alias delimited by dots inside the file name is reported.
    #[test]
    fn test_filename_language_detection() {
        assert_eq!(primary("subtitle.sc.ass"), Some("sc".to_string()));
    }

    /// A path without any known alias yields no language.
    #[test]
    fn test_no_language_detection() {
        assert_eq!(primary("subtitle.ass"), None);
    }
}