markdown_translator/
functional.rs

1//! 函数式翻译处理模块
2//!
3//! 这个模块提供了函数式编程风格的翻译文本处理功能,包括:
4//! - 纯函数式的文本过滤和分析
5//! - 函数式批次管理和组合
6//! - 不可变数据结构和函数组合
7//! - 错误处理的函数式管道
8//!
9//! ## 设计原则
10//!
11//! - **纯函数**: 所有核心函数都是纯函数,无副作用
12//! - **函数组合**: 使用组合子模式构建复杂逻辑
13//! - **不可变性**: 优先使用不可变数据结构
14//! - **函数式错误处理**: 使用 `Result` 类型和 `?` 操作符
15
16use std::collections::HashSet;
17use std::sync::LazyLock;
18use std::time::Instant;
19
20#[cfg(feature = "translation")]
21use rayon::prelude::*;
22use regex::Regex;
23
24// ============================================================================
25// 核心数据类型
26// ============================================================================
27
28/// 文本项 - 表示单个可翻译的文本单元
29#[derive(Debug, Clone, PartialEq)]
30pub struct TextItem {
31    /// 原始文本内容
32    pub text: String,
33    /// 文本类型
34    pub text_type: TextType,
35    /// 翻译优先级
36    pub priority: TextPriority,
37    /// 预估复杂度 (0.0-1.0)
38    pub complexity: f32,
39    /// 在DOM中的位置标识
40    pub location: String,
41}
42
/// Kinds of text a translatable item can represent.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TextType {
    /// Heading / title text.
    Title,
    /// Body content.
    Content,
    /// Link text.
    Link,
    /// Button caption.
    Button,
    /// Label text.
    Label,
    /// Image description (alt text).
    Alt,
    /// Placeholder text.
    Placeholder,
    /// Anything that fits none of the other kinds.
    Other,
}
55
/// Translation priority of a text item or batch.
///
/// The derived ordering follows the declaration order, so
/// `Low < Normal < High < Critical`.
#[derive(Debug, Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub enum TextPriority {
    /// Lowest priority (discriminant 1).
    Low = 1,
    /// Default priority (discriminant 2).
    Normal = 2,
    /// Elevated priority (discriminant 3).
    High = 3,
    /// Highest priority (discriminant 4).
    Critical = 4,
}
64
65/// 翻译批次
66#[derive(Debug, Clone)]
67pub struct Batch {
68    /// 批次ID
69    pub id: usize,
70    /// 包含的文本项
71    pub items: Vec<TextItem>,
72    /// 批次优先级
73    pub priority: TextPriority,
74    /// 预估字符总数
75    pub estimated_chars: usize,
76    /// 创建时间
77    pub created_at: Instant,
78}
79
80/// 文本分析结果
81#[derive(Debug, Clone)]
82pub struct TextAnalysis {
83    /// 是否应该翻译
84    pub should_translate: bool,
85    /// 可翻译性评分 (0.0-1.0)
86    pub translatability_score: f32,
87    /// 文本类型
88    pub text_type: TextType,
89    /// 优先级
90    pub priority: TextPriority,
91    /// 检测到的特征
92    pub features: Vec<String>,
93}
94
95// ============================================================================
96// 正则表达式缓存(函数式懒加载)
97// ============================================================================
98
99/// URL匹配正则表达式
100static URL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
101    Regex::new(r"https?://[^\s]+|www\.[^\s]+|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
102        .expect("URL regex should be valid")
103});
104
105/// 邮箱匹配正则表达式
106static EMAIL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
107    Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
108        .expect("Email regex should be valid")
109});
110
111/// 代码片段匹配正则表达式
112static CODE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
113    Regex::new(r"^\s*[{}\[\]();,]|[=+\-*/%<>!&|^~]|\b(function|var|let|const|if|else|for|while|return|class|def|import|export)\b")
114        .expect("Code regex should be valid")
115});
116
117/// 中文字符匹配正则表达式
118static CHINESE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
119    Regex::new(r"[\u4e00-\u9fff]")
120        .expect("Chinese regex should be valid")
121});
122
/// Set of short, lowercase UI/functional words (button captions, states,
/// menu entries and the like).
///
/// Every entry is lowercase ASCII; callers are expected to lowercase their
/// input before looking it up.
static FUNCTIONAL_WORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    // Each word is listed exactly once (the original list repeated "ok").
    const WORDS: &[&str] = &[
        "ok", "yes", "no", "on", "off", "true", "false", "null", "none",
        "home", "back", "next", "prev", "close", "open", "save", "load",
        "new", "edit", "delete", "add", "remove", "clear", "reset",
        "login", "logout", "signup", "signin", "submit", "cancel",
        "confirm", "apply", "done", "finish", "start", "stop",
        "play", "pause", "resume", "skip", "retry", "refresh", "reload",
        "search", "filter", "sort", "view", "show", "hide", "toggle",
        "expand", "collapse", "minimize", "maximize", "restore",
        "copy", "paste", "cut", "undo", "redo", "select", "all",
        "help", "info", "about", "contact", "privacy", "terms",
        "settings", "config", "options", "preferences", "profile",
        "account", "user", "admin", "guest", "public", "private",
        "draft", "published", "archived", "deleted", "active", "inactive",
        "enabled", "disabled", "online", "offline", "available", "busy",
        "free", "premium", "pro", "basic", "standard", "advanced",
        "low", "medium", "high", "max", "min", "auto", "manual",
    ];

    WORDS.iter().copied().collect()
});
144
145// ============================================================================
146// 纯函数式文本分析器
147// ============================================================================
148
149/// 文本分析的函数式组合子
150pub mod analyzers {
151    use super::*;
152
153    /// 基础长度检查
154    pub fn check_length(text: &str) -> bool {
155        let trimmed = text.trim();
156        !trimmed.is_empty() && trimmed.len() >= 2
157    }
158
159    /// 检查是否为纯空白
160    pub fn is_whitespace_only(text: &str) -> bool {
161        text.trim().is_empty()
162    }
163
164    /// 检查是否为纯数字
165    pub fn is_numeric_only(text: &str) -> bool {
166        text.trim().chars().all(|c| c.is_ascii_digit() || c == '.' || c == ',' || c == ' ')
167    }
168
169    /// 检查是否为URL
170    pub fn is_url(text: &str) -> bool {
171        URL_REGEX.is_match(text.trim())
172    }
173
174    /// 检查是否为邮箱
175    pub fn is_email(text: &str) -> bool {
176        EMAIL_REGEX.is_match(text.trim())
177    }
178
179    /// 检查是否为代码片段
180    pub fn is_code(text: &str) -> bool {
181        let trimmed = text.trim();
182        CODE_REGEX.is_match(trimmed) || 
183        trimmed.starts_with("function") ||
184        trimmed.contains("=>") ||
185        (trimmed.contains('{') && trimmed.contains('}'))
186    }
187
188    /// 计算中文字符比例
189    pub fn chinese_char_ratio(text: &str) -> f32 {
190        let total_chars = text.chars().count();
191        if total_chars == 0 {
192            return 0.0;
193        }
194        
195        let chinese_chars = CHINESE_REGEX.find_iter(text).count();
196        chinese_chars as f32 / total_chars as f32
197    }
198
199    /// 检查是否为功能性词汇
200    pub fn is_functional_word(text: &str) -> bool {
201        let trimmed = text.trim().to_ascii_lowercase();
202        FUNCTIONAL_WORDS.contains(trimmed.as_str())
203    }
204
205    /// 计算字母字符比例
206    pub fn alphabetic_ratio(text: &str) -> f32 {
207        let total_chars = text.chars().count();
208        if total_chars == 0 {
209            return 0.0;
210        }
211        
212        let alphabetic_chars = text.chars().filter(|c| c.is_alphabetic()).count();
213        alphabetic_chars as f32 / total_chars as f32
214    }
215
216    /// 计算特殊字符密度
217    pub fn special_char_density(text: &str) -> f32 {
218        let total_chars = text.chars().count();
219        if total_chars == 0 {
220            return 0.0;
221        }
222        
223        let special_chars = text.chars()
224            .filter(|c| !c.is_alphanumeric() && !c.is_whitespace())
225            .count();
226        special_chars as f32 / total_chars as f32
227    }
228}
229
230// ============================================================================
231// 函数式文本过滤器
232// ============================================================================
233
/// Text filter that decides which strings are worth translating.
///
/// The type is a unit struct: it carries no data, so it is trivially
/// copyable and can be printed for diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct TextFilter;
236
237impl TextFilter {
238    pub fn new() -> Self {
239        Self
240    }
241
242    /// 判断文本是否应该翻译(主要入口函数)
243    pub fn should_translate(&self, text: &str) -> bool {
244        self.analyze_text(text).should_translate
245    }
246
247    /// 全面分析文本
248    pub fn analyze_text(&self, text: &str) -> TextAnalysis {
249        use analyzers::*;
250
251        // 基础检查链
252        if !check_length(text) || is_whitespace_only(text) {
253            return TextAnalysis {
254                should_translate: false,
255                translatability_score: 0.0,
256                text_type: TextType::Other,
257                priority: TextPriority::Low,
258                features: vec!["too_short".to_string()],
259            };
260        }
261
262        // 内容类型检查链
263        let mut features = Vec::new();
264        let mut score = 1.0f32;
265
266        if is_numeric_only(text) {
267            features.push("numeric".to_string());
268            score *= 0.1;
269        }
270
271        if is_url(text) {
272            features.push("url".to_string());
273            score *= 0.0;
274        }
275
276        if is_email(text) {
277            features.push("email".to_string());
278            score *= 0.0;
279        }
280
281        if is_code(text) {
282            features.push("code".to_string());
283            score *= 0.2;
284        }
285
286        if is_functional_word(text) {
287            features.push("functional".to_string());
288            score *= 0.3;
289        }
290
291        // 语言特征分析
292        let chinese_ratio = chinese_char_ratio(text);
293        if chinese_ratio > 0.3 {
294            features.push("chinese".to_string());
295            score *= 0.1; // 已经是中文,不需要翻译
296        }
297
298        let alphabetic_ratio = alphabetic_ratio(text);
299        if alphabetic_ratio < 0.3 {
300            features.push("low_alphabetic".to_string());
301            score *= 0.5;
302        }
303
304        let special_density = special_char_density(text);
305        if special_density > 0.5 {
306            features.push("high_special_chars".to_string());
307            score *= 0.4;
308        }
309
310        // 确定文本类型和优先级
311        let text_type = self.infer_text_type(text);
312        let priority = self.infer_priority(&text_type, text.len());
313
314        TextAnalysis {
315            should_translate: score > 0.5,
316            translatability_score: score,
317            text_type,
318            priority,
319            features,
320        }
321    }
322
323    /// 推断文本类型
324    fn infer_text_type(&self, text: &str) -> TextType {
325        let len = text.len();
326        let has_punctuation = text.chars().any(|c| ".!?。!?".contains(c));
327        
328        if len < 10 && !has_punctuation {
329            TextType::Label
330        } else if len < 50 && !has_punctuation {
331            TextType::Button
332        } else if has_punctuation && len > 20 {
333            TextType::Content
334        } else {
335            TextType::Other
336        }
337    }
338
339    /// 推断优先级
340    fn infer_priority(&self, text_type: &TextType, length: usize) -> TextPriority {
341        match text_type {
342            TextType::Title => TextPriority::Critical,
343            TextType::Content if length > 100 => TextPriority::High,
344            TextType::Content => TextPriority::Normal,
345            TextType::Button | TextType::Link => TextPriority::High,
346            TextType::Label | TextType::Alt => TextPriority::Normal,
347            TextType::Placeholder => TextPriority::Low,
348            TextType::Other => TextPriority::Low,
349        }
350    }
351
352    /// 批量过滤文本(函数式并行处理)
353    #[cfg(feature = "translation")]
354    pub fn filter_texts_parallel(&self, texts: Vec<String>) -> Vec<String> {
355        texts
356            .into_par_iter()
357            .filter(|text| self.should_translate(text))
358            .collect()
359    }
360
361    /// 批量过滤文本(串行版本)
362    #[cfg(not(feature = "translation"))]
363    pub fn filter_texts_parallel(&self, texts: Vec<String>) -> Vec<String> {
364        texts
365            .into_iter()
366            .filter(|text| self.should_translate(text))
367            .collect()
368    }
369}
370
371impl Default for TextFilter {
372    fn default() -> Self {
373        Self::new()
374    }
375}
376
377// ============================================================================
378// 函数式批次管理器
379// ============================================================================
380
/// Batch manager that groups text items into translation batches.
#[derive(Debug)]
pub struct BatchManager {
    /// Counter holding the next batch id to hand out.
    next_id: std::sync::atomic::AtomicUsize,
}
385
386impl BatchManager {
387    pub fn new() -> Self {
388        Self {
389            next_id: std::sync::atomic::AtomicUsize::new(1),
390        }
391    }
392
393    /// 创建优化的批次(主要入口函数)
394    pub fn create_batches(&self, items: Vec<TextItem>) -> Vec<Batch> {
395        // 函数式管道处理
396        items
397            .into_iter()
398            .filter(|item| !item.text.trim().is_empty())
399            .collect::<Vec<_>>()
400            .pipe(|items| self.group_by_priority(items))
401            .pipe(|groups| self.optimize_batch_sizes(groups))
402            .pipe(|batches| self.sort_by_priority(batches))
403    }
404
405    /// 按优先级分组
406    fn group_by_priority(&self, items: Vec<TextItem>) -> Vec<Vec<TextItem>> {
407        use std::collections::HashMap;
408        
409        let mut groups: HashMap<TextPriority, Vec<TextItem>> = HashMap::new();
410        
411        for item in items {
412            groups.entry(item.priority.clone()).or_default().push(item);
413        }
414        
415        // 按优先级排序返回
416        let mut result: Vec<_> = groups.into_values().collect();
417        result.sort_by(|a, b| {
418            b.first().map(|item| &item.priority)
419                .cmp(&a.first().map(|item| &item.priority))
420        });
421        
422        result
423    }
424
425    /// 优化批次大小
426    fn optimize_batch_sizes(&self, groups: Vec<Vec<TextItem>>) -> Vec<Batch> {
427        const MAX_BATCH_SIZE: usize = 50;
428        const MIN_BATCH_SIZE: usize = 5;
429        
430        groups
431            .into_iter()
432            .flat_map(|group| {
433                if group.len() <= MAX_BATCH_SIZE {
434                    vec![group]
435                } else {
436                    // 拆分大组
437                    group
438                        .chunks(MAX_BATCH_SIZE)
439                        .map(|chunk| chunk.to_vec())
440                        .collect()
441                }
442            })
443            .filter(|group| group.len() >= MIN_BATCH_SIZE || 
444                    group.iter().any(|item| item.priority >= TextPriority::High))
445            .map(|items| self.create_batch(items))
446            .collect()
447    }
448
449    /// 创建单个批次
450    fn create_batch(&self, items: Vec<TextItem>) -> Batch {
451        let id = self.next_id.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
452        
453        let priority = items
454            .iter()
455            .map(|item| &item.priority)
456            .max()
457            .cloned()
458            .unwrap_or(TextPriority::Low);
459        
460        let estimated_chars = items
461            .iter()
462            .map(|item| item.text.len())
463            .sum();
464        
465        Batch {
466            id,
467            items,
468            priority,
469            estimated_chars,
470            created_at: Instant::now(),
471        }
472    }
473
474    /// 按优先级排序批次
475    fn sort_by_priority(&self, mut batches: Vec<Batch>) -> Vec<Batch> {
476        batches.sort_by(|a, b| {
477            b.priority.cmp(&a.priority)
478                .then_with(|| a.created_at.cmp(&b.created_at))
479        });
480        batches
481    }
482}
483
484impl Default for BatchManager {
485    fn default() -> Self {
486        Self::new()
487    }
488}
489
490// ============================================================================
491// 函数式管道操作符
492// ============================================================================
493
/// Pipeline helper: `value.pipe(f)` is `f(value)`, which lets a sequence of
/// transformations be written as a left-to-right method chain.
trait Pipe: Sized {
    /// Consumes `self`, passes it to `transform` and returns the result.
    fn pipe<F, R>(self, transform: F) -> R
    where
        F: FnOnce(Self) -> R,
    {
        transform(self)
    }
}

/// Every sized type gets `pipe` through the provided method.
impl<T> Pipe for T {}
509
510// ============================================================================
511// 便利函数
512// ============================================================================
513
514/// 快速创建文本项
515pub fn create_text_item(text: String, location: String) -> TextItem {
516    let filter = TextFilter::new();
517    let analysis = filter.analyze_text(&text);
518    
519    TextItem {
520        complexity: analysis.translatability_score,
521        text,
522        text_type: analysis.text_type,
523        priority: analysis.priority,
524        location,
525    }
526}
527
528/// 快速批量分析文本
529pub fn batch_analyze_texts(texts: &[String]) -> Vec<TextAnalysis> {
530    let filter = TextFilter::new();
531    
532    #[cfg(feature = "translation")]
533    {
534        texts
535            .par_iter()
536            .map(|text| filter.analyze_text(text))
537            .collect()
538    }
539    
540    #[cfg(not(feature = "translation"))]
541    {
542        texts
543            .iter()
544            .map(|text| filter.analyze_text(text))
545            .collect()
546    }
547}
548
549/// 快速创建批次管理器并处理文本项
550pub fn create_optimized_batches(items: Vec<TextItem>) -> Vec<Batch> {
551    let manager = BatchManager::new();
552    manager.create_batches(items)
553}
554
555// ============================================================================
556// 测试模块
557// ============================================================================
558
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain prose is accepted; empty, blank, numeric, URL and e-mail input
    /// is rejected.
    #[test]
    fn test_text_filter_basic() {
        let filter = TextFilter::new();

        assert!(filter.should_translate("Hello World"));

        for rejected in ["", "   ", "123", "https://example.com", "test@example.com"] {
            assert!(
                !filter.should_translate(rejected),
                "expected {rejected:?} to be rejected"
            );
        }
    }

    /// Spot checks for the individual analyzer predicates.
    #[test]
    fn test_analyzers() {
        use analyzers::*;

        // (predicate name, predicate, input, expected result)
        let cases: [(&str, fn(&str) -> bool, &str, bool); 11] = [
            ("check_length", check_length, "Hello", true),
            ("check_length", check_length, "H", false),
            ("check_length", check_length, "", false),
            ("is_whitespace_only", is_whitespace_only, "   ", true),
            ("is_whitespace_only", is_whitespace_only, "Hello", false),
            ("is_numeric_only", is_numeric_only, "123", true),
            ("is_numeric_only", is_numeric_only, "123abc", false),
            ("is_url", is_url, "https://example.com", true),
            ("is_url", is_url, "hello world", false),
            ("is_email", is_email, "test@example.com", true),
            ("is_email", is_email, "not an email", false),
        ];

        for (name, predicate, input, expected) in cases {
            assert_eq!(predicate(input), expected, "{name}({input:?})");
        }
    }

    /// Five longer texts must produce at least one batch.
    #[test]
    fn test_batch_manager() {
        let items: Vec<TextItem> = [
            ("Hello World, this is a longer text", "p1"),
            ("Another longer text for testing", "p2"),
            ("Third longer text item", "p3"),
            ("Fourth longer text item", "p4"),
            ("Fifth longer text item", "p5"),
        ]
        .into_iter()
        .map(|(text, location)| create_text_item(text.to_string(), location.to_string()))
        .collect();

        let batches = BatchManager::new().create_batches(items);
        assert!(!batches.is_empty());
    }

    /// A short English phrase is translatable and classified as UI text.
    #[test]
    fn test_text_analysis() {
        let analysis = TextFilter::new().analyze_text("Hello World");

        assert!(analysis.should_translate);
        assert!(analysis.translatability_score > 0.5);
        // Either Button or Label is an acceptable inferred type here.
        assert!(matches!(
            analysis.text_type,
            TextType::Button | TextType::Label
        ));
    }
}
621}