// html_translation_lib/utils.rs
pub mod html {
7 use crate::error::{TranslationError, TranslationResult};
8
9 pub fn validate_html(html: &str) -> TranslationResult<()> {
11 if html.trim().is_empty() {
13 return Err(TranslationError::ParseError("HTML内容为空".to_string()));
14 }
15
16 if !html.contains('<') || !html.contains('>') {
18 return Err(TranslationError::ParseError("无效的HTML格式".to_string()));
19 }
20
21 Ok(())
22 }
23
/// Trims every line of `html` and drops the lines that end up empty.
///
/// The surviving lines are joined with a single `\n`; the result has no
/// leading or trailing newline.
pub fn clean_whitespace(html: &str) -> String {
    let mut cleaned = String::new();

    for raw_line in html.lines() {
        let trimmed = raw_line.trim();
        if trimmed.is_empty() {
            continue;
        }
        // Only non-empty lines are ever appended, so a non-empty buffer
        // means at least one line precedes this one and needs a separator.
        if !cleaned.is_empty() {
            cleaned.push('\n');
        }
        cleaned.push_str(trimmed);
    }

    cleaned
}
33
34 pub fn extract_text_content(html: &str) -> TranslationResult<String> {
36 use regex::Regex;
37
38 let tag_regex = Regex::new(r"<[^>]*>").unwrap();
40 let text = tag_regex.replace_all(html, " ");
41
42 let whitespace_regex = Regex::new(r"\s+").unwrap();
44 let cleaned = whitespace_regex.replace_all(&text, " ");
45
46 Ok(cleaned.trim().to_string())
47 }
48}
49
/// Text helpers: similarity scoring, script detection, counting and truncation.
pub mod text {
    /// Returns a similarity score in `[0.0, 1.0]` based on Levenshtein distance.
    ///
    /// The score is `1 - distance / max(len1, len2)`, where lengths and edits
    /// are measured in Unicode scalar values (`char`s), not bytes.
    /// Identical strings (including two empty strings) score `1.0`; if exactly
    /// one string is empty the score is `0.0`.
    pub fn similarity(text1: &str, text2: &str) -> f32 {
        if text1 == text2 {
            return 1.0;
        }
        if text1.is_empty() || text2.is_empty() {
            return 0.0;
        }

        let chars1: Vec<char> = text1.chars().collect();
        let chars2: Vec<char> = text2.chars().collect();

        // Classic dynamic programme, but only the previous and the current
        // row are kept: memory is O(len2) instead of O(len1 * len2).
        let mut previous_row: Vec<usize> = (0..=chars2.len()).collect();
        let mut current_row = vec![0usize; chars2.len() + 1];

        for (i, &c1) in chars1.iter().enumerate() {
            current_row[0] = i + 1;
            for (j, &c2) in chars2.iter().enumerate() {
                let substitution_cost = usize::from(c1 != c2);
                let deletion = previous_row[j + 1] + 1;
                let insertion = current_row[j] + 1;
                let substitution = previous_row[j] + substitution_cost;
                current_row[j + 1] = deletion.min(insertion).min(substitution);
            }
            std::mem::swap(&mut previous_row, &mut current_row);
        }

        // After the final swap the last computed row lives in `previous_row`.
        let distance = previous_row[chars2.len()];
        let max_len = chars1.len().max(chars2.len());

        1.0 - (distance as f32 / max_len as f32)
    }

    /// Returns `true` when strictly more than 80% of the characters are ASCII.
    ///
    /// An empty string is considered ASCII.
    pub fn is_mostly_ascii(text: &str) -> bool {
        if text.is_empty() {
            return true;
        }

        let (ascii_count, total_count) = text
            .chars()
            .fold((0usize, 0usize), |(ascii, total), c| {
                (ascii + usize::from(c.is_ascii()), total + 1)
            });

        ascii_count as f32 / total_count as f32 > 0.8
    }

    /// Returns `true` for characters in the ranges this module treats as CJK:
    /// U+4E00..=U+9FFF (CJK Unified Ideographs), U+3040..=U+309F (Hiragana),
    /// U+30A0..=U+30FF (Katakana) and U+AC00..=U+D7AF (Hangul Syllables).
    fn is_cjk_char(c: char) -> bool {
        matches!(
            c,
            '\u{4E00}'..='\u{9FFF}'
                | '\u{3040}'..='\u{309F}'
                | '\u{30A0}'..='\u{30FF}'
                | '\u{AC00}'..='\u{D7AF}'
        )
    }

    /// Returns `true` if `text` contains at least one CJK character
    /// (Han ideograph, Hiragana, Katakana or Hangul syllable).
    pub fn contains_cjk(text: &str) -> bool {
        text.chars().any(is_cjk_char)
    }

    /// Counts the "words" in `text`.
    ///
    /// If the text contains any CJK character, every non-whitespace character
    /// counts as one word; otherwise whitespace-separated tokens are counted.
    pub fn word_count(text: &str) -> usize {
        if contains_cjk(text) {
            text.chars().filter(|c| !c.is_whitespace()).count()
        } else {
            text.split_whitespace().count()
        }
    }

    /// Shortens `text` to at most `max_length` characters.
    ///
    /// * Text that already fits is returned unchanged.
    /// * Text containing CJK characters is cut after `max_length` characters.
    /// * Other text is cut at a word boundary: whole whitespace-separated
    ///   words are kept, joined by single spaces, as long as the result stays
    ///   within `max_length`. If even the first word is too long the result
    ///   is empty.
    ///
    /// Lengths are measured in `char`s in every branch, so multi-byte
    /// non-CJK text (e.g. Cyrillic) is not penalised for its byte width.
    pub fn truncate_words(text: &str, max_length: usize) -> String {
        if text.chars().count() <= max_length {
            return text.to_string();
        }

        if contains_cjk(text) {
            return text.chars().take(max_length).collect();
        }

        let mut result = String::new();
        let mut result_chars = 0usize;

        for word in text.split_whitespace() {
            // A separating space is only needed once something has been
            // emitted; charging for it before the first word would reject a
            // first word that fits exactly.
            let separator_chars = usize::from(!result.is_empty());
            let word_chars = word.chars().count();

            if result_chars + separator_chars + word_chars > max_length {
                break;
            }
            if separator_chars == 1 {
                result.push(' ');
            }
            result.push_str(word);
            result_chars += separator_chars + word_chars;
        }

        result
    }
}
163
/// Lightweight, script-based language guessing.
pub mod language {
    /// Guesses the language of `text` from the Unicode scripts it contains.
    ///
    /// Characters are bucketed into Han ideographs (U+4E00..=U+9FFF), kana
    /// (U+3040..=U+30FF), Hangul syllables (U+AC00..=U+D7AF), Cyrillic
    /// (U+0400..=U+04FF) and ASCII letters. A bucket "wins" when it holds
    /// strictly more than `total_chars / 10` characters (integer division,
    /// where `total_chars` counts every character including whitespace).
    ///
    /// Buckets are tested in this order: kana, Hangul, Han, Cyrillic, Latin.
    /// Kana and Hangul come before Han because Han ideographs are shared by
    /// Chinese, Japanese (kanji) and Korean (hanja), whereas kana and Hangul
    /// each identify a single language. Testing Han first would label
    /// ordinary Japanese text such as "日本語のテキスト" as Chinese.
    ///
    /// Returns [`Language::Unknown`] for empty input or when no bucket
    /// exceeds the threshold.
    pub fn detect_language(text: &str) -> Language {
        let mut total_chars = 0usize;
        let mut han_count = 0usize;
        let mut kana_count = 0usize;
        let mut hangul_count = 0usize;
        let mut cyrillic_count = 0usize;
        let mut latin_count = 0usize;

        for ch in text.chars() {
            total_chars += 1;
            match ch {
                '\u{4E00}'..='\u{9FFF}' => han_count += 1,
                '\u{3040}'..='\u{309F}' | '\u{30A0}'..='\u{30FF}' => kana_count += 1,
                '\u{AC00}'..='\u{D7AF}' => hangul_count += 1,
                '\u{0400}'..='\u{04FF}' => cyrillic_count += 1,
                c if c.is_ascii_alphabetic() => latin_count += 1,
                _ => {}
            }
        }

        if total_chars == 0 {
            return Language::Unknown;
        }

        let threshold = total_chars / 10;

        if kana_count > threshold {
            Language::Japanese
        } else if hangul_count > threshold {
            Language::Korean
        } else if han_count > threshold {
            Language::Chinese
        } else if cyrillic_count > threshold {
            Language::Russian
        } else if latin_count > threshold {
            Language::English
        } else {
            Language::Unknown
        }
    }

    /// Languages that [`detect_language`] can report.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    pub enum Language {
        /// No script reached the detection threshold.
        Unknown,
        /// Detected from ASCII letters.
        English,
        /// Detected from Han ideographs without significant kana or Hangul.
        Chinese,
        /// Detected from Hiragana or Katakana.
        Japanese,
        /// Detected from Hangul syllables.
        Korean,
        /// Detected from Cyrillic letters.
        Russian,
    }

    impl Language {
        /// Returns the short language code (`"en"`, `"zh"`, ...).
        ///
        /// [`Language::Unknown`] maps to `"auto"`.
        pub fn code(&self) -> &'static str {
            match self {
                Language::Unknown => "auto",
                Language::English => "en",
                Language::Chinese => "zh",
                Language::Japanese => "ja",
                Language::Korean => "ko",
                Language::Russian => "ru",
            }
        }

        /// Returns the English display name of the language.
        pub fn name(&self) -> &'static str {
            match self {
                Language::Unknown => "Unknown",
                Language::English => "English",
                Language::Chinese => "Chinese",
                Language::Japanese => "Japanese",
                Language::Korean => "Korean",
                Language::Russian => "Russian",
            }
        }
    }
}
255
256pub mod performance {
258 use std::time::{Duration, Instant};
259
260 pub struct Timer {
262 start: Instant,
263 label: String,
264 }
265
266 impl Timer {
267 pub fn start(label: &str) -> Self {
269 Self {
270 start: Instant::now(),
271 label: label.to_string(),
272 }
273 }
274
275 pub fn stop(self) -> Duration {
277 let duration = self.start.elapsed();
278 tracing::debug!("Timer '{}': {:?}", self.label, duration);
279 duration
280 }
281
282 pub fn lap(&self) -> Duration {
284 self.start.elapsed()
285 }
286 }
287
/// Runs a block, logs how long it took, and evaluates to the block's value.
///
/// `time_it!(label, { ... })` starts a `Timer` with `label`, evaluates the
/// block, stops the timer (which logs the duration) and yields the block's
/// result. The expansion refers to `$crate::utils::performance::Timer`, so
/// this module must be reachable at that path.
///
/// If the block leaves early (for example through `return` or `?`), the
/// timer is never stopped and nothing is logged.
#[macro_export]
macro_rules! time_it {
    ($label:expr, $code:block) => {{
        let stopwatch = $crate::utils::performance::Timer::start($label);
        let outcome = $code;
        stopwatch.stop();
        outcome
    }};
}
298}
299
#[cfg(test)]
mod tests {
    use super::*;

    /// Identical strings score exactly 1.0, unrelated words score low and
    /// near-identical words score high.
    #[test]
    fn test_text_similarity() {
        let identical = text::similarity("hello", "hello");
        assert_eq!(identical, 1.0);

        let unrelated = text::similarity("hello", "world");
        assert!(unrelated < 0.3);

        let one_letter_off = text::similarity("hello", "hallo");
        assert!(one_letter_off > 0.5);
    }

    /// Chinese, Japanese and Korean samples are recognised as CJK; plain
    /// English is not.
    #[test]
    fn test_cjk_detection() {
        for sample in ["你好世界", "こんにちは", "안녕하세요"] {
            assert!(text::contains_cjk(sample));
        }
        assert!(!text::contains_cjk("Hello World"));
    }

    /// Each sample maps to the language of its dominant script.
    #[test]
    fn test_language_detection() {
        let cases = [
            ("Hello World", language::Language::English),
            ("你好世界", language::Language::Chinese),
            ("こんにちは", language::Language::Japanese),
        ];

        for (input, expected) in cases {
            assert_eq!(language::detect_language(input), expected);
        }
    }
}