html_translation_lib/
utils.rs

//! Utility module.
//!
//! Provides commonly used utility functions and helpers.
4
5/// HTML工具函数
6pub mod html {
7    use crate::error::{TranslationError, TranslationResult};
8    
9    /// 验证HTML格式
10    pub fn validate_html(html: &str) -> TranslationResult<()> {
11        // 基本的HTML验证
12        if html.trim().is_empty() {
13            return Err(TranslationError::ParseError("HTML内容为空".to_string()));
14        }
15        
16        // 检查基本的HTML结构
17        if !html.contains('<') || !html.contains('>') {
18            return Err(TranslationError::ParseError("无效的HTML格式".to_string()));
19        }
20        
21        Ok(())
22    }
23    
24    /// 清理HTML中的空白字符
25    pub fn clean_whitespace(html: &str) -> String {
26        // 移除多余的空白字符,但保持基本格式
27        html.lines()
28            .map(|line| line.trim())
29            .filter(|line| !line.is_empty())
30            .collect::<Vec<_>>()
31            .join("\n")
32    }
33    
34    /// 提取HTML中的纯文本内容
35    pub fn extract_text_content(html: &str) -> TranslationResult<String> {
36        use regex::Regex;
37        
38        // 移除HTML标签
39        let tag_regex = Regex::new(r"<[^>]*>").unwrap();
40        let text = tag_regex.replace_all(html, " ");
41        
42        // 清理空白字符
43        let whitespace_regex = Regex::new(r"\s+").unwrap();
44        let cleaned = whitespace_regex.replace_all(&text, " ");
45        
46        Ok(cleaned.trim().to_string())
47    }
48}
49
/// Text helper functions.
pub mod text {
    /// Returns a similarity score between two strings in the range `0.0..=1.0`.
    ///
    /// The score is `1 - distance / max_len`, where `distance` is the
    /// Levenshtein edit distance computed over `char`s and `max_len` is the
    /// longer of the two character counts. Identical strings score `1.0`; if
    /// exactly one of the strings is empty the score is `0.0`.
    pub fn similarity(text1: &str, text2: &str) -> f32 {
        if text1 == text2 {
            return 1.0;
        }
        if text1.is_empty() || text2.is_empty() {
            return 0.0;
        }

        let chars1: Vec<char> = text1.chars().collect();
        let chars2: Vec<char> = text2.chars().collect();

        // Only the previous row of the edit-distance table is ever read, so
        // two rolling rows replace the full (len1 + 1) x (len2 + 1) matrix.
        let mut previous: Vec<usize> = (0..=chars2.len()).collect();
        let mut current = vec![0usize; chars2.len() + 1];

        for (i, &c1) in chars1.iter().enumerate() {
            // Turning the first i + 1 chars of text1 into "" costs i + 1 deletions.
            current[0] = i + 1;
            for (j, &c2) in chars2.iter().enumerate() {
                let substitution = previous[j] + usize::from(c1 != c2);
                let deletion = previous[j + 1] + 1;
                let insertion = current[j] + 1;
                current[j + 1] = substitution.min(deletion).min(insertion);
            }
            std::mem::swap(&mut previous, &mut current);
        }

        // After the final swap the last computed row lives in `previous`.
        let distance = previous[chars2.len()];
        let max_len = chars1.len().max(chars2.len());

        1.0 - (distance as f32 / max_len as f32)
    }

    /// Returns `true` when strictly more than 80% of the characters are ASCII.
    ///
    /// An empty string counts as mostly ASCII.
    pub fn is_mostly_ascii(text: &str) -> bool {
        if text.is_empty() {
            return true;
        }

        // Count ASCII and total characters in a single pass.
        let (ascii_count, total_count) = text
            .chars()
            .fold((0usize, 0usize), |(ascii, total), c| {
                (ascii + usize::from(c.is_ascii()), total + 1)
            });

        ascii_count as f32 / total_count as f32 > 0.8
    }

    /// Returns `true` if the text contains at least one CJK character.
    ///
    /// The ranges checked are U+4E00..=U+9FFF (CJK Unified Ideographs),
    /// U+3040..=U+309F (Hiragana), U+30A0..=U+30FF (Katakana) and
    /// U+AC00..=U+D7AF (Hangul Syllables). Characters outside these ranges,
    /// such as ideograph extension blocks, are not detected.
    pub fn contains_cjk(text: &str) -> bool {
        text.chars().any(|c| {
            matches!(
                c,
                '\u{4E00}'..='\u{9FFF}'
                    | '\u{3040}'..='\u{309F}'
                    | '\u{30A0}'..='\u{30FF}'
                    | '\u{AC00}'..='\u{D7AF}'
            )
        })
    }

    /// Counts the "words" in a text.
    ///
    /// If the text contains any CJK character (see [`contains_cjk`]), every
    /// non-whitespace character counts as one word; otherwise words are the
    /// whitespace-separated tokens.
    pub fn word_count(text: &str) -> usize {
        if contains_cjk(text) {
            text.chars().filter(|c| !c.is_whitespace()).count()
        } else {
            text.split_whitespace().count()
        }
    }

    /// Truncates `text` to at most `max_length` characters.
    ///
    /// Lengths are measured in characters (`char`s), not bytes, in every
    /// branch.
    ///
    /// * Text that already fits is returned unchanged.
    /// * Text containing CJK characters is cut after `max_length` characters.
    /// * Other text is rebuilt from whole whitespace-separated words joined by
    ///   single spaces, stopping before the first word that would exceed the
    ///   limit. If even the first word is too long the result is empty,
    ///   because words are never split.
    pub fn truncate_words(text: &str, max_length: usize) -> String {
        if text.chars().count() <= max_length {
            return text.to_string();
        }

        if contains_cjk(text) {
            return text.chars().take(max_length).collect();
        }

        let mut result = String::new();
        let mut used = 0usize; // characters already placed in `result`
        for word in text.split_whitespace() {
            // A separating space is only needed (and only counted against the
            // budget) once there is a preceding word.
            let separator = usize::from(!result.is_empty());
            let word_len = word.chars().count();
            if used + separator + word_len > max_length {
                break;
            }
            if separator == 1 {
                result.push(' ');
            }
            result.push_str(word);
            used += separator + word_len;
        }
        result
    }
}
163
/// Language detection helpers.
pub mod language {
    /// Guesses the language of `text` from the scripts its characters belong to.
    ///
    /// Characters are counted per script, and a script "wins" when its count is
    /// strictly greater than one tenth of the total character count (integer
    /// division, so for texts shorter than ten characters a single character
    /// is enough). Scripts are tested in this order:
    ///
    /// 1. Hiragana/Katakana -> [`Language::Japanese`]
    /// 2. Hangul syllables -> [`Language::Korean`]
    /// 3. Han ideographs (U+4E00..=U+9FFF) -> [`Language::Chinese`]
    /// 4. Cyrillic (U+0400..=U+04FF) -> [`Language::Russian`]
    /// 5. ASCII letters -> [`Language::English`]
    ///
    /// Kana and Hangul are tested before Han because Han ideographs are shared
    /// between Chinese, Japanese and Korean writing, whereas kana and Hangul
    /// each belong to a single language. Testing Han first would classify
    /// ordinary Japanese text containing kanji as Chinese.
    ///
    /// Returns [`Language::Unknown`] for empty input or when no script passes
    /// the threshold.
    pub fn detect_language(text: &str) -> Language {
        let mut total_chars = 0usize;
        let mut han_count = 0usize;
        let mut kana_count = 0usize;
        let mut hangul_count = 0usize;
        let mut cyrillic_count = 0usize;
        let mut latin_count = 0usize;

        for ch in text.chars() {
            total_chars += 1;
            match ch {
                '\u{4E00}'..='\u{9FFF}' => han_count += 1,
                '\u{3040}'..='\u{309F}' | '\u{30A0}'..='\u{30FF}' => kana_count += 1,
                '\u{AC00}'..='\u{D7AF}' => hangul_count += 1,
                '\u{0400}'..='\u{04FF}' => cyrillic_count += 1,
                c if c.is_ascii_alphabetic() => latin_count += 1,
                // Whitespace, digits, punctuation and other scripts only
                // contribute to the total.
                _ => {}
            }
        }

        if total_chars == 0 {
            return Language::Unknown;
        }

        // A script must cover more than 10% of all characters to be chosen.
        let threshold = total_chars / 10;

        if kana_count > threshold {
            Language::Japanese
        } else if hangul_count > threshold {
            Language::Korean
        } else if han_count > threshold {
            Language::Chinese
        } else if cyrillic_count > threshold {
            Language::Russian
        } else if latin_count > threshold {
            Language::English
        } else {
            Language::Unknown
        }
    }

    /// Languages that [`detect_language`] can report.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    pub enum Language {
        /// The language could not be determined.
        Unknown,
        /// English.
        English,
        /// Chinese.
        Chinese,
        /// Japanese.
        Japanese,
        /// Korean.
        Korean,
        /// Russian.
        Russian,
    }

    impl Language {
        /// Returns the short language code; [`Language::Unknown`] maps to `"auto"`.
        pub fn code(&self) -> &'static str {
            match self {
                Language::Unknown => "auto",
                Language::English => "en",
                Language::Chinese => "zh",
                Language::Japanese => "ja",
                Language::Korean => "ko",
                Language::Russian => "ru",
            }
        }

        /// Returns the English display name of the language.
        pub fn name(&self) -> &'static str {
            match self {
                Language::Unknown => "Unknown",
                Language::English => "English",
                Language::Chinese => "Chinese",
                Language::Japanese => "Japanese",
                Language::Korean => "Korean",
                Language::Russian => "Russian",
            }
        }
    }
}
255
256/// 性能监控工具
257pub mod performance {
258    use std::time::{Duration, Instant};
259    
260    /// 性能计时器
261    pub struct Timer {
262        start: Instant,
263        label: String,
264    }
265    
266    impl Timer {
267        /// 开始计时
268        pub fn start(label: &str) -> Self {
269            Self {
270                start: Instant::now(),
271                label: label.to_string(),
272            }
273        }
274        
275        /// 结束计时并返回持续时间
276        pub fn stop(self) -> Duration {
277            let duration = self.start.elapsed();
278            tracing::debug!("Timer '{}': {:?}", self.label, duration);
279            duration
280        }
281        
282        /// 获取中间时间(不停止计时器)
283        pub fn lap(&self) -> Duration {
284            self.start.elapsed()
285        }
286    }
287    
288    /// 简单的计时宏
289    #[macro_export]
290    macro_rules! time_it {
291        ($label:expr, $code:block) => {{
292            let timer = $crate::utils::performance::Timer::start($label);
293            let result = $code;
294            timer.stop();
295            result
296        }};
297    }
298}
299
#[cfg(test)]
mod tests {
    use super::*;

    /// Identical strings score exactly 1.0; unrelated and near-identical words
    /// fall on the expected side of the chosen bounds.
    #[test]
    fn test_text_similarity() {
        let identical = text::similarity("hello", "hello");
        assert_eq!(identical, 1.0);

        // "hello" vs "world" share one aligned letter, so the score is small but non-zero.
        let unrelated = text::similarity("hello", "world");
        assert!(unrelated < 0.3);

        let near_match = text::similarity("hello", "hallo");
        assert!(near_match > 0.5);
    }

    /// Chinese, Japanese and Korean samples are recognised; plain English is not.
    #[test]
    fn test_cjk_detection() {
        for sample in ["你好世界", "こんにちは", "안녕하세요"] {
            assert!(text::contains_cjk(sample));
        }
        assert!(!text::contains_cjk("Hello World"));
    }

    /// Each sample maps to the expected detected language.
    #[test]
    fn test_language_detection() {
        use super::language::{detect_language, Language};

        let cases = [
            ("Hello World", Language::English),
            ("你好世界", Language::Chinese),
            ("こんにちは", Language::Japanese),
        ];
        for (input, expected) in cases {
            assert_eq!(detect_language(input), expected);
        }
    }
}
323        assert_eq!(language::detect_language("こんにちは"), language::Language::Japanese);
324    }
325}