html_translation_lib/pipeline/
filter.rs1#![allow(dead_code)] use crate::config::TranslationConfig;
8use regex::Regex;
9use std::collections::HashSet;
10
11pub struct TextFilter {
13 min_length: usize,
15
16 skip_links: bool,
18
19 skip_code_blocks: bool,
21
22 custom_filters: Vec<Regex>,
24
25 number_pattern: Regex,
27
28 url_pattern: Regex,
30
31 email_pattern: Regex,
33
34 code_pattern: Regex,
36
37 skip_words: HashSet<String>,
39}
40
41impl TextFilter {
42 pub fn new(config: &TranslationConfig) -> Self {
44 let custom_filters: Vec<Regex> = config
45 .custom_filters
46 .iter()
47 .filter_map(|pattern| Regex::new(pattern).ok())
48 .collect();
49
50 let mut skip_words = HashSet::new();
51 skip_words.insert("HTML".to_string());
53 skip_words.insert("CSS".to_string());
54 skip_words.insert("JavaScript".to_string());
55 skip_words.insert("JSON".to_string());
56 skip_words.insert("XML".to_string());
57 skip_words.insert("API".to_string());
58 skip_words.insert("HTTP".to_string());
59 skip_words.insert("HTTPS".to_string());
60 skip_words.insert("URL".to_string());
61 skip_words.insert("URI".to_string());
62 skip_words.insert("UUID".to_string());
63 skip_words.insert("ID".to_string());
64
65 Self {
66 min_length: config.min_text_length,
67 skip_links: config.skip_links,
68 skip_code_blocks: config.skip_code_blocks,
69 custom_filters,
70 number_pattern: Regex::new(r"^\d+(\.\d+)?$").unwrap(),
71 url_pattern: Regex::new(r"^https?://[^\s]+$").unwrap(),
72 email_pattern: Regex::new(r"^[^\s@]+@[^\s@]+\.[^\s@]+$").unwrap(),
73 code_pattern: Regex::new(r"^[a-zA-Z_][a-zA-Z0-9_]*\(\)$|^[a-zA-Z_][a-zA-Z0-9_.]*$").unwrap(),
74 skip_words,
75 }
76 }
77
78 pub fn should_translate(&self, text: &str) -> bool {
80 let trimmed = text.trim();
81
82 if trimmed.len() < self.min_length {
84 return false;
85 }
86
87 if self.number_pattern.is_match(trimmed) {
89 return false;
90 }
91
92 if self.url_pattern.is_match(trimmed) {
94 return false;
95 }
96
97 if self.email_pattern.is_match(trimmed) {
99 return false;
100 }
101
102 if self.skip_code_blocks && self.code_pattern.is_match(trimmed) {
104 return false;
105 }
106
107 if self.skip_words.contains(&trimmed.to_uppercase()) {
109 return false;
110 }
111
112 let alpha_count = trimmed.chars().filter(|c| c.is_alphabetic()).count();
114 let total_count = trimmed.chars().count();
115 if total_count > 0 {
116 let alpha_ratio = alpha_count as f32 / total_count as f32;
117 if alpha_ratio < 0.3 {
118 return false;
119 }
120 }
121
122 for filter in &self.custom_filters {
124 if filter.is_match(trimmed) {
125 return false;
126 }
127 }
128
129 if self.is_version_like(trimmed) {
131 return false;
132 }
133
134 if self.is_html_entity(trimmed) {
136 return false;
137 }
138
139 true
140 }
141
142 pub fn analyze_text(&self, text: &str) -> TextAnalysis {
144 let should_translate = self.should_translate(text);
145 let confidence = self.calculate_confidence(text);
146 let language_hint = self.detect_language_hint(text);
147
148 TextAnalysis {
149 should_translate,
150 confidence,
151 language_hint,
152 reasons: self.get_skip_reasons(text),
153 }
154 }
155
156 fn is_version_like(&self, text: &str) -> bool {
158 let version_pattern = Regex::new(r"^v?\d+(\.\d+)*(-[a-zA-Z0-9]+)?$").unwrap();
160 version_pattern.is_match(text)
161 }
162
163 fn is_html_entity(&self, text: &str) -> bool {
165 let entity_pattern = Regex::new(r"^&[a-zA-Z0-9#]+;$").unwrap();
167 entity_pattern.is_match(text)
168 }
169
170 fn calculate_confidence(&self, text: &str) -> f32 {
172 let trimmed = text.trim();
173 let mut score = 0.5; match trimmed.len() {
177 0..=2 => score -= 0.3,
178 3..=5 => score -= 0.1,
179 6..=20 => score += 0.1,
180 21..=100 => score += 0.2,
181 _ => score += 0.1,
182 }
183
184 let alpha_count = trimmed.chars().filter(|c| c.is_alphabetic()).count();
186 let total_count = trimmed.chars().count();
187 let alpha_ratio = alpha_count as f32 / total_count as f32;
188 score += (alpha_ratio - 0.5) * 0.4;
189
190 if self.number_pattern.is_match(trimmed) {
192 score -= 0.4;
193 }
194
195 if self.url_pattern.is_match(trimmed) {
196 score -= 0.5;
197 }
198
199 score.clamp(0.0, 1.0)
201 }
202
203 fn detect_language_hint(&self, text: &str) -> LanguageHint {
205 let trimmed = text.trim();
206
207 let has_chinese = trimmed.chars().any(|c| {
209 ('\u{4E00}'..='\u{9FFF}').contains(&c) });
211
212 let has_japanese = trimmed.chars().any(|c| {
213 ('\u{3040}'..='\u{309F}').contains(&c) || ('\u{30A0}'..='\u{30FF}').contains(&c) });
216
217 let has_korean = trimmed.chars().any(|c| {
218 ('\u{AC00}'..='\u{D7AF}').contains(&c) });
220
221 let has_cyrillic = trimmed.chars().any(|c| {
222 ('\u{0400}'..='\u{04FF}').contains(&c) });
224
225 if has_chinese {
226 LanguageHint::Chinese
227 } else if has_japanese {
228 LanguageHint::Japanese
229 } else if has_korean {
230 LanguageHint::Korean
231 } else if has_cyrillic {
232 LanguageHint::Russian
233 } else if trimmed.is_ascii() {
234 LanguageHint::English
235 } else {
236 LanguageHint::Unknown
237 }
238 }
239
240 fn get_skip_reasons(&self, text: &str) -> Vec<String> {
242 let mut reasons = Vec::new();
243 let trimmed = text.trim();
244
245 if trimmed.len() < self.min_length {
246 reasons.push(format!("文本长度过短 ({})", trimmed.len()));
247 }
248
249 if self.number_pattern.is_match(trimmed) {
250 reasons.push("纯数字".to_string());
251 }
252
253 if self.url_pattern.is_match(trimmed) {
254 reasons.push("URL地址".to_string());
255 }
256
257 if self.email_pattern.is_match(trimmed) {
258 reasons.push("邮箱地址".to_string());
259 }
260
261 if self.skip_words.contains(&trimmed.to_uppercase()) {
262 reasons.push("技术术语".to_string());
263 }
264
265 if self.is_version_like(trimmed) {
266 reasons.push("版本号格式".to_string());
267 }
268
269 if self.is_html_entity(trimmed) {
270 reasons.push("HTML实体".to_string());
271 }
272
273 reasons
274 }
275}
276
277pub enum FilterRule {
279 MinLength(usize),
281
282 Regex(Regex),
284
285 SkipType(String),
287
288 Custom(Box<dyn Fn(&str) -> bool>),
290}
291
292#[derive(Debug, Clone)]
294pub struct TextAnalysis {
295 pub should_translate: bool,
297
298 pub confidence: f32,
300
301 pub language_hint: LanguageHint,
303
304 pub reasons: Vec<String>,
306}
307
/// Coarse language guess derived from the Unicode ranges found in a text.
#[derive(Debug, Clone, PartialEq)]
pub enum LanguageHint {
    /// No recognised script and the text is not pure ASCII.
    Unknown,

    /// The text is pure ASCII.
    English,

    /// The text contains CJK unified ideographs.
    Chinese,

    /// The text contains hiragana or katakana.
    Japanese,

    /// The text contains Hangul syllables.
    Korean,

    /// The text contains Cyrillic characters.
    Russian,
}