1use std::collections::HashSet;
17use std::sync::LazyLock;
18use std::time::Instant;
19
20#[cfg(feature = "translation")]
21use rayon::prelude::*;
22use regex::Regex;
23
/// A piece of text together with the classification attached to it.
///
/// Instances are normally built by `create_text_item`, which runs the text
/// through `TextFilter::analyze_text` and copies the results into the fields
/// below.
#[derive(Debug, Clone, PartialEq)]
pub struct TextItem {
    /// The text itself, stored exactly as supplied by the caller.
    pub text: String,
    /// Category assigned to the text.
    pub text_type: TextType,
    /// Priority of the text; `BatchManager` groups and orders items by it.
    pub priority: TextPriority,
    /// Numeric score for the item. `create_text_item` stores the analysis'
    /// `translatability_score` here.
    pub complexity: f32,
    /// Caller-supplied location label (the tests in this file use `"p1"`, `"p2"`, ...).
    pub location: String,
}
42
/// Coarse category of a text fragment.
///
/// `TextFilter::infer_text_type` only ever produces `Label`, `Button`,
/// `Content` and `Other`; the remaining variants are available for callers
/// that classify text themselves.
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub enum TextType {
    /// Heading-like text; mapped to `TextPriority::Critical`.
    Title,
    /// Running text; mapped to `High` or `Normal` priority depending on length.
    Content,
    /// Link text; mapped to `TextPriority::High`.
    Link,
    /// Button-like text; mapped to `TextPriority::High`.
    Button,
    /// Short label; mapped to `TextPriority::Normal`.
    Label,
    /// Alternative text; mapped to `TextPriority::Normal`.
    Alt,
    /// Placeholder text; mapped to `TextPriority::Low`.
    Placeholder,
    /// Anything that fits none of the other categories; mapped to `TextPriority::Low`.
    Other,
}
55
/// Translation priority of a text or batch.
///
/// The derived ordering follows declaration order, so
/// `Low < Normal < High < Critical`; the explicit discriminants run from 1 to 4.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum TextPriority {
    /// Lowest priority.
    Low = 1,
    /// Default priority for ordinary text.
    Normal = 2,
    /// Batches containing a `High` (or higher) item are kept regardless of size.
    High = 3,
    /// Highest priority.
    Critical = 4,
}
64
/// A group of [`TextItem`]s produced by `BatchManager`.
#[derive(Debug, Clone)]
pub struct Batch {
    /// Identifier handed out by the `BatchManager` that created the batch.
    pub id: usize,
    /// The items contained in the batch.
    pub items: Vec<TextItem>,
    /// Highest priority found among `items` (`Low` for an empty batch).
    pub priority: TextPriority,
    /// Sum of `text.len()` over `items`, i.e. the total UTF-8 byte length.
    pub estimated_chars: usize,
    /// Moment the batch was created; used as a tie-breaker when sorting batches.
    pub created_at: Instant,
}
79
/// Result of `TextFilter::analyze_text` for a single text.
#[derive(Debug, Clone)]
pub struct TextAnalysis {
    /// `true` when `translatability_score` is above `0.5`.
    pub should_translate: bool,
    /// Score that starts at `1.0` and is multiplied down by every penalty
    /// feature that was detected; `0.0` for texts rejected as too short.
    pub translatability_score: f32,
    /// Inferred category of the text.
    pub text_type: TextType,
    /// Priority derived from the category and the text length.
    pub priority: TextPriority,
    /// Names of the detected features, e.g. `"numeric"`, `"url"`, `"too_short"`.
    pub features: Vec<String>,
}
94
95static URL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
101 Regex::new(r"https?://[^\s]+|www\.[^\s]+|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
102 .expect("URL regex should be valid")
103});
104
105static EMAIL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
107 Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
108 .expect("Email regex should be valid")
109});
110
111static CODE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
113 Regex::new(r"^\s*[{}\[\]();,]|[=+\-*/%<>!&|^~]|\b(function|var|let|const|if|else|for|while|return|class|def|import|export)\b")
114 .expect("Code regex should be valid")
115});
116
117static CHINESE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
119 Regex::new(r"[\u4e00-\u9fff]")
120 .expect("Chinese regex should be valid")
121});
122
/// Lower-case single words that `analyzers::is_functional_word` treats as
/// functional UI vocabulary (button captions, states, menu entries, ...).
///
/// The set is built once on first access. `"ok"` appears twice in the list;
/// the set keeps a single copy.
static FUNCTIONAL_WORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    const WORDS: &[&str] = &[
        "ok", "yes", "no", "on", "off", "true", "false", "null", "none",
        "home", "back", "next", "prev", "close", "open", "save", "load",
        "new", "edit", "delete", "add", "remove", "clear", "reset",
        "login", "logout", "signup", "signin", "submit", "cancel",
        "confirm", "apply", "ok", "done", "finish", "start", "stop",
        "play", "pause", "resume", "skip", "retry", "refresh", "reload",
        "search", "filter", "sort", "view", "show", "hide", "toggle",
        "expand", "collapse", "minimize", "maximize", "restore",
        "copy", "paste", "cut", "undo", "redo", "select", "all",
        "help", "info", "about", "contact", "privacy", "terms",
        "settings", "config", "options", "preferences", "profile",
        "account", "user", "admin", "guest", "public", "private",
        "draft", "published", "archived", "deleted", "active", "inactive",
        "enabled", "disabled", "online", "offline", "available", "busy",
        "free", "premium", "pro", "basic", "standard", "advanced",
        "low", "medium", "high", "max", "min", "auto", "manual",
    ];

    WORDS.iter().copied().collect()
});
144
145pub mod analyzers {
151 use super::*;
152
153 pub fn check_length(text: &str) -> bool {
155 let trimmed = text.trim();
156 !trimmed.is_empty() && trimmed.len() >= 2
157 }
158
159 pub fn is_whitespace_only(text: &str) -> bool {
161 text.trim().is_empty()
162 }
163
164 pub fn is_numeric_only(text: &str) -> bool {
166 text.trim().chars().all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == ' ')
167 }
168
169 pub fn is_url(text: &str) -> bool {
171 URL_REGEX.is_match(text.trim())
172 }
173
174 pub fn is_email(text: &str) -> bool {
176 EMAIL_REGEX.is_match(text.trim())
177 }
178
179 pub fn is_code(text: &str) -> bool {
181 let trimmed = text.trim();
182 CODE_REGEX.is_match(trimmed) ||
183 trimmed.starts_with("function") ||
184 trimmed.contains("=>") ||
185 (trimmed.contains('{') && trimmed.contains('}'))
186 }
187
188 pub fn chinese_char_ratio(text: &str) -> f32 {
190 let total_chars = text.chars().count();
191 if total_chars == 0 {
192 return 0.0;
193 }
194
195 let chinese_chars = CHINESE_REGEX.find_iter(text).count();
196 chinese_chars as f32 / total_chars as f32
197 }
198
199 pub fn is_functional_word(text: &str) -> bool {
201 let trimmed = text.trim().to_ascii_lowercase();
202 FUNCTIONAL_WORDS.contains(trimmed.as_str())
203 }
204
205 pub fn alphabetic_ratio(text: &str) -> f32 {
207 let total_chars = text.chars().count();
208 if total_chars == 0 {
209 return 0.0;
210 }
211
212 let alphabetic_chars = text.chars().filter(|c| c.is_alphabetic()).count();
213 alphabetic_chars as f32 / total_chars as f32
214 }
215
216 pub fn special_char_density(text: &str) -> f32 {
218 let total_chars = text.chars().count();
219 if total_chars == 0 {
220 return 0.0;
221 }
222
223 let special_chars = text.chars()
224 .filter(|c| !c.is_alphanumeric() && !c.is_whitespace())
225 .count();
226 special_chars as f32 / total_chars as f32
227 }
228}
229
/// Stateless classifier that scores a text and decides whether it should be
/// translated. It carries no data; all logic lives in its methods.
pub struct TextFilter;
236
237impl TextFilter {
238 pub fn new() -> Self {
239 Self
240 }
241
242 pub fn should_translate(&self, text: &str) -> bool {
244 self.analyze_text(text).should_translate
245 }
246
247 pub fn analyze_text(&self, text: &str) -> TextAnalysis {
249 use analyzers::*;
250
251 if !check_length(text) || is_whitespace_only(text) {
253 return TextAnalysis {
254 should_translate: false,
255 translatability_score: 0.0,
256 text_type: TextType::Other,
257 priority: TextPriority::Low,
258 features: vec!["too_short".to_string()],
259 };
260 }
261
262 let mut features = Vec::new();
264 let mut score = 1.0f32;
265
266 if is_numeric_only(text) {
267 features.push("numeric".to_string());
268 score *= 0.1;
269 }
270
271 if is_url(text) {
272 features.push("url".to_string());
273 score *= 0.0;
274 }
275
276 if is_email(text) {
277 features.push("email".to_string());
278 score *= 0.0;
279 }
280
281 if is_code(text) {
282 features.push("code".to_string());
283 score *= 0.2;
284 }
285
286 if is_functional_word(text) {
287 features.push("functional".to_string());
288 score *= 0.3;
289 }
290
291 let chinese_ratio = chinese_char_ratio(text);
293 if chinese_ratio > 0.3 {
294 features.push("chinese".to_string());
295 score *= 0.1; }
297
298 let alphabetic_ratio = alphabetic_ratio(text);
299 if alphabetic_ratio < 0.3 {
300 features.push("low_alphabetic".to_string());
301 score *= 0.5;
302 }
303
304 let special_density = special_char_density(text);
305 if special_density > 0.5 {
306 features.push("high_special_chars".to_string());
307 score *= 0.4;
308 }
309
310 let text_type = self.infer_text_type(text);
312 let priority = self.infer_priority(&text_type, text.len());
313
314 TextAnalysis {
315 should_translate: score > 0.5,
316 translatability_score: score,
317 text_type,
318 priority,
319 features,
320 }
321 }
322
323 fn infer_text_type(&self, text: &str) -> TextType {
325 let len = text.len();
326 let has_punctuation = text.chars().any(|c| ".!?。!?".contains(c));
327
328 if len < 10 && !has_punctuation {
329 TextType::Label
330 } else if len < 50 && !has_punctuation {
331 TextType::Button
332 } else if has_punctuation && len > 20 {
333 TextType::Content
334 } else {
335 TextType::Other
336 }
337 }
338
339 fn infer_priority(&self, text_type: &TextType, length: usize) -> TextPriority {
341 match text_type {
342 TextType::Title => TextPriority::Critical,
343 TextType::Content if length > 100 => TextPriority::High,
344 TextType::Content => TextPriority::Normal,
345 TextType::Button | TextType::Link => TextPriority::High,
346 TextType::Label | TextType::Alt => TextPriority::Normal,
347 TextType::Placeholder => TextPriority::Low,
348 TextType::Other => TextPriority::Low,
349 }
350 }
351
352 #[cfg(feature = "translation")]
354 pub fn filter_texts_parallel(&self, texts: Vec<String>) -> Vec<String> {
355 texts
356 .into_par_iter()
357 .filter(|text| self.should_translate(text))
358 .collect()
359 }
360
361 #[cfg(not(feature = "translation"))]
363 pub fn filter_texts_parallel(&self, texts: Vec<String>) -> Vec<String> {
364 texts
365 .into_iter()
366 .filter(|text| self.should_translate(text))
367 .collect()
368 }
369}
370
371impl Default for TextFilter {
372 fn default() -> Self {
373 Self::new()
374 }
375}
376
/// Turns a list of [`TextItem`]s into prioritised [`Batch`]es and hands out
/// the batch ids.
pub struct BatchManager {
    /// Id for the next batch; read and incremented atomically in `create_batch`.
    next_id: std::sync::atomic::AtomicUsize,
}
385
386impl BatchManager {
387 pub fn new() -> Self {
388 Self {
389 next_id: std::sync::atomic::AtomicUsize::new(1),
390 }
391 }
392
393 pub fn create_batches(&self, items: Vec<TextItem>) -> Vec<Batch> {
395 items
397 .into_iter()
398 .filter(|item| !item.text.trim().is_empty())
399 .collect::<Vec<_>>()
400 .pipe(|items| self.group_by_priority(items))
401 .pipe(|groups| self.optimize_batch_sizes(groups))
402 .pipe(|batches| self.sort_by_priority(batches))
403 }
404
405 fn group_by_priority(&self, items: Vec<TextItem>) -> Vec<Vec<TextItem>> {
407 use std::collections::HashMap;
408
409 let mut groups: HashMap<TextPriority, Vec<TextItem>> = HashMap::new();
410
411 for item in items {
412 groups.entry(item.priority.clone()).or_default().push(item);
413 }
414
415 let mut result: Vec<_> = groups.into_values().collect();
417 result.sort_by(|a, b| {
418 b.first().map(|item| &item.priority)
419 .cmp(&a.first().map(|item| &item.priority))
420 });
421
422 result
423 }
424
425 fn optimize_batch_sizes(&self, groups: Vec<Vec<TextItem>>) -> Vec<Batch> {
427 const MAX_BATCH_SIZE: usize = 50;
428 const MIN_BATCH_SIZE: usize = 5;
429
430 groups
431 .into_iter()
432 .flat_map(|group| {
433 if group.len() <= MAX_BATCH_SIZE {
434 vec![group]
435 } else {
436 group
438 .chunks(MAX_BATCH_SIZE)
439 .map(|chunk| chunk.to_vec())
440 .collect()
441 }
442 })
443 .filter(|group| group.len() >= MIN_BATCH_SIZE ||
444 group.iter().any(|item| item.priority >= TextPriority::High))
445 .map(|items| self.create_batch(items))
446 .collect()
447 }
448
449 fn create_batch(&self, items: Vec<TextItem>) -> Batch {
451 let id = self.next_id.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
452
453 let priority = items
454 .iter()
455 .map(|item| &item.priority)
456 .max()
457 .cloned()
458 .unwrap_or(TextPriority::Low);
459
460 let estimated_chars = items
461 .iter()
462 .map(|item| item.text.len())
463 .sum();
464
465 Batch {
466 id,
467 items,
468 priority,
469 estimated_chars,
470 created_at: Instant::now(),
471 }
472 }
473
474 fn sort_by_priority(&self, mut batches: Vec<Batch>) -> Vec<Batch> {
476 batches.sort_by(|a, b| {
477 b.priority.cmp(&a.priority)
478 .then_with(|| a.created_at.cmp(&b.created_at))
479 });
480 batches
481 }
482}
483
484impl Default for BatchManager {
485 fn default() -> Self {
486 Self::new()
487 }
488}
489
/// Lets any value be passed through a closure in method position, so a
/// sequence of transformations can be written as one chain:
/// `value.pipe(f)` is the same as `f(value)`.
trait Pipe: Sized {
    /// Consumes `self`, hands it to `f` and returns what `f` returns.
    fn pipe<F: FnOnce(Self) -> R, R>(self, f: F) -> R;
}
500
501impl<T> Pipe for T {
502 fn pipe<F, R>(self, f: F) -> R
503 where
504 F: FnOnce(Self) -> R,
505 {
506 f(self)
507 }
508}
509
510pub fn create_text_item(text: String, location: String) -> TextItem {
516 let filter = TextFilter::new();
517 let analysis = filter.analyze_text(&text);
518
519 TextItem {
520 complexity: analysis.translatability_score,
521 text,
522 text_type: analysis.text_type,
523 priority: analysis.priority,
524 location,
525 }
526}
527
528pub fn batch_analyze_texts(texts: &[String]) -> Vec<TextAnalysis> {
530 let filter = TextFilter::new();
531
532 #[cfg(feature = "translation")]
533 {
534 texts
535 .par_iter()
536 .map(|text| filter.analyze_text(text))
537 .collect()
538 }
539
540 #[cfg(not(feature = "translation"))]
541 {
542 texts
543 .iter()
544 .map(|text| filter.analyze_text(text))
545 .collect()
546 }
547}
548
549pub fn create_optimized_batches(items: Vec<TextItem>) -> Vec<Batch> {
551 let manager = BatchManager::new();
552 manager.create_batches(items)
553}
554
#[cfg(test)]
mod tests {
    use super::*;

    /// Ordinary text is translatable; blank, numeric, URL and e-mail texts are not.
    #[test]
    fn test_text_filter_basic() {
        let filter = TextFilter::new();

        assert!(filter.should_translate("Hello World"));

        let rejected = ["", " ", "123", "https://example.com", "test@example.com"];
        for text in rejected.iter() {
            assert!(
                !filter.should_translate(text),
                "{:?} should not be translated",
                text
            );
        }
    }

    /// Spot checks for the individual analyzer predicates.
    #[test]
    fn test_analyzers() {
        let cases: [(fn(&str) -> bool, &str, bool); 11] = [
            (analyzers::check_length, "Hello", true),
            (analyzers::check_length, "H", false),
            (analyzers::check_length, "", false),
            (analyzers::is_whitespace_only, " ", true),
            (analyzers::is_whitespace_only, "Hello", false),
            (analyzers::is_numeric_only, "123", true),
            (analyzers::is_numeric_only, "123abc", false),
            (analyzers::is_url, "https://example.com", true),
            (analyzers::is_url, "hello world", false),
            (analyzers::is_email, "test@example.com", true),
            (analyzers::is_email, "not an email", false),
        ];

        for (index, (predicate, input, expected)) in cases.iter().enumerate() {
            assert_eq!(
                predicate(input),
                *expected,
                "case #{} with input {:?}",
                index,
                input
            );
        }
    }

    /// Five analysed items must yield at least one batch.
    #[test]
    fn test_batch_manager() {
        let sources = [
            ("Hello World, this is a longer text", "p1"),
            ("Another longer text for testing", "p2"),
            ("Third longer text item", "p3"),
            ("Fourth longer text item", "p4"),
            ("Fifth longer text item", "p5"),
        ];
        let items: Vec<TextItem> = sources
            .iter()
            .map(|&(text, location)| create_text_item(text.to_string(), location.to_string()))
            .collect();

        let manager = BatchManager::new();
        let batches = manager.create_batches(items);

        assert!(!batches.is_empty());
    }

    /// A short plain phrase is translatable and classified as a button or label.
    #[test]
    fn test_text_analysis() {
        let TextAnalysis {
            should_translate,
            translatability_score,
            text_type,
            ..
        } = TextFilter::new().analyze_text("Hello World");

        assert!(should_translate);
        assert!(translatability_score > 0.5);
        assert!(matches!(text_type, TextType::Button | TextType::Label));
    }
}