// markdown-translator 0.1.1

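//! Functional-style translation processing module.
//!
//! This module provides translation text processing written in a functional
//! programming style, including:
//! - purely functional text filtering and analysis
//! - functional batch management and composition
//! - immutable data structures and function composition
//! - a functional pipeline for error handling
//!
//! ## Design principles
//!
//! - **Pure functions**: all core functions are pure functions without side effects
//! - **Function composition**: complex logic is built with the combinator pattern
//! - **Immutability**: immutable data structures are preferred
//! - **Functional error handling**: uses the `Result` type and the `?` operator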

use std::collections::HashSet;
use std::sync::LazyLock;
use std::time::Instant;

#[cfg(feature = "translation")]
use rayon::prelude::*;
use regex::Regex;

// ============================================================================
// 核心数据类型
// ============================================================================

/// Text item: a single translatable unit of text together with its classification.
#[derive(Debug, Clone, PartialEq)]
pub struct TextItem {
    /// Original text content.
    pub text: String,
    /// Kind of text.
    pub text_type: TextType,
    /// Translation priority.
    pub priority: TextPriority,
    /// Estimated complexity (0.0-1.0).
    pub complexity: f32,
    /// Identifier of the item's position in the DOM.
    pub location: String,
}

/// Kinds of text a [`TextItem`] can be classified as.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TextType {
    Title,      // heading / title
    Content,    // body content
    Link,       // link text
    Button,     // button text
    Label,      // label text
    Alt,        // image description
    Placeholder,// placeholder
    Other,      // anything else
}

/// Translation priority of a text.
///
/// The derived `Ord` follows declaration order, so
/// `Low < Normal < High < Critical`.
#[derive(Debug, Clone, PartialEq, PartialOrd, Ord, Eq, Hash)]
pub enum TextPriority {
    Low = 1,
    Normal = 2,
    High = 3,
    Critical = 4,
}

/// A translation batch: a group of text items handled together.
#[derive(Debug, Clone)]
pub struct Batch {
    /// Batch id.
    pub id: usize,
    /// Text items contained in the batch.
    pub items: Vec<TextItem>,
    /// Priority of the batch.
    pub priority: TextPriority,
    /// Estimated total text length of the contained items.
    pub estimated_chars: usize,
    /// Moment the batch was created.
    pub created_at: Instant,
}

/// Result of analysing a piece of text.
#[derive(Debug, Clone)]
pub struct TextAnalysis {
    /// Whether the text should be translated.
    pub should_translate: bool,
    /// Translatability score (0.0-1.0).
    pub translatability_score: f32,
    /// Kind of text.
    pub text_type: TextType,
    /// Priority.
    pub priority: TextPriority,
    /// Labels of the features detected in the text.
    pub features: Vec<String>,
}

// ============================================================================
// 正则表达式缓存（函数式懒加载）
// ============================================================================

/// Regular expression matching URLs, compiled lazily on first use.
///
/// The pattern has three alternatives: `http://` / `https://` followed by
/// non-whitespace, `www.` followed by non-whitespace, or a bare domain-like
/// token (letters, digits, dots and hyphens, then a dot and at least two
/// letters). It is not anchored, so it matches anywhere inside the haystack.
static URL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"https?://[^\s]+|www\.[^\s]+|[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
        .expect("URL regex should be valid")
});

/// Regular expression matching e-mail addresses, compiled lazily on first use.
///
/// The pattern is not anchored, so it matches an address anywhere inside the
/// haystack.
static EMAIL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
        .expect("Email regex should be valid")
});

/// Regular expression matching code snippets, compiled lazily on first use.
///
/// A haystack matches when any of these holds:
/// - it starts (after optional whitespace) with one of `{ } [ ] ( ) ; ,`;
/// - it contains one of the operator characters `= + - * / % < > ! & | ^ ~`;
/// - it contains one of the listed keywords as a whole word.
///
/// NOTE(review): hyphens, exclamation marks, slashes and the words `if` / `for`
/// are also common in ordinary prose, so natural-language text can match.
static CODE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"^\s*[{}\[\]();,]|[=+\-*/%<>!&|^~]|\b(function|var|let|const|if|else|for|while|return|class|def|import|export)\b")
        .expect("Code regex should be valid")
});

/// Regular expression matching a single Chinese character, compiled lazily on
/// first use.
///
/// Matches one code point in the range U+4E00..=U+9FFF.
static CHINESE_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"[\u4e00-\u9fff]")
        .expect("Chinese regex should be valid")
});

/// Set of functional (UI / status) words, built lazily on first use.
///
/// All entries are lowercase ASCII. `"ok"` is listed twice below; the
/// duplicate collapses when the entries are collected into the set.
static FUNCTIONAL_WORDS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    [
        "ok", "yes", "no", "on", "off", "true", "false", "null", "none",
        "home", "back", "next", "prev", "close", "open", "save", "load",
        "new", "edit", "delete", "add", "remove", "clear", "reset",
        "login", "logout", "signup", "signin", "submit", "cancel",
        "confirm", "apply", "ok", "done", "finish", "start", "stop",
        "play", "pause", "resume", "skip", "retry", "refresh", "reload",
        "search", "filter", "sort", "view", "show", "hide", "toggle",
        "expand", "collapse", "minimize", "maximize", "restore",
        "copy", "paste", "cut", "undo", "redo", "select", "all",
        "help", "info", "about", "contact", "privacy", "terms",
        "settings", "config", "options", "preferences", "profile",
        "account", "user", "admin", "guest", "public", "private",
        "draft", "published", "archived", "deleted", "active", "inactive",
        "enabled", "disabled", "online", "offline", "available", "busy",
        "free", "premium", "pro", "basic", "standard", "advanced",
        "low", "medium", "high", "max", "min", "auto", "manual",
    ].into_iter().collect()
});

// ============================================================================
// 纯函数式文本分析器
// ============================================================================

/// Small, composable predicates and metrics used to analyse text.
pub mod analyzers {
    use super::*;

    /// Returns `true` when `trimmed` contains no whitespace at all.
    fn is_single_token(trimmed: &str) -> bool {
        !trimmed.contains(char::is_whitespace)
    }

    /// Fraction of the characters of `text` that satisfy `predicate`.
    ///
    /// Returns `0.0` for empty input so callers never divide by zero.
    fn char_ratio(text: &str, predicate: impl Fn(char) -> bool) -> f32 {
        let total_chars = text.chars().count();
        if total_chars == 0 {
            return 0.0;
        }

        let matching = text.chars().filter(|&c| predicate(c)).count();
        matching as f32 / total_chars as f32
    }

    /// Basic length check: the trimmed text must be at least two bytes long.
    ///
    /// The length is a byte count, so a single multi-byte character (for
    /// example one CJK ideograph) passes the check.
    pub fn check_length(text: &str) -> bool {
        text.trim().len() >= 2
    }

    /// Returns `true` when `text` is empty or consists only of whitespace.
    pub fn is_whitespace_only(text: &str) -> bool {
        text.trim().is_empty()
    }

    /// Returns `true` when the trimmed text is a number-like string: at least
    /// one ASCII digit, and nothing but ASCII digits, `.`, `,` and spaces.
    ///
    /// Requiring a digit keeps empty input and punctuation-only strings such
    /// as `"..."` from being reported as numeric.
    pub fn is_numeric_only(text: &str) -> bool {
        let trimmed = text.trim();
        trimmed.chars().any(|c| c.is_ascii_digit())
            && trimmed
                .chars()
                .all(|c| c.is_ascii_digit() || matches!(c, '.' | ',' | ' '))
    }

    /// Returns `true` when the trimmed text is a single whitespace-free token
    /// that contains a URL-like pattern.
    ///
    /// The underlying pattern is unanchored; without the single-token
    /// requirement every sentence that merely mentions a URL or a dotted name
    /// would be classified as a URL.
    pub fn is_url(text: &str) -> bool {
        let trimmed = text.trim();
        is_single_token(trimmed) && URL_REGEX.is_match(trimmed)
    }

    /// Returns `true` when the trimmed text is a single whitespace-free token
    /// that contains an e-mail address.
    ///
    /// As with [`is_url`], sentences that merely mention an address are not
    /// treated as an e-mail themselves.
    pub fn is_email(text: &str) -> bool {
        let trimmed = text.trim();
        is_single_token(trimmed) && EMAIL_REGEX.is_match(trimmed)
    }

    /// Returns `true` when the trimmed text looks like a code snippet.
    ///
    /// The text counts as code when it matches the code pattern, starts with
    /// `function`, contains `=>`, or contains both `{` and `}`.
    ///
    /// NOTE(review): the `starts_with("function")` test also accepts ordinary
    /// words such as "functional"; confirm whether that is intended.
    pub fn is_code(text: &str) -> bool {
        let trimmed = text.trim();
        CODE_REGEX.is_match(trimmed)
            || trimmed.starts_with("function")
            || trimmed.contains("=>")
            || (trimmed.contains('{') && trimmed.contains('}'))
    }

    /// Fraction of the characters of `text` that are Chinese characters.
    ///
    /// Returns `0.0` for empty input.
    pub fn chinese_char_ratio(text: &str) -> f32 {
        let total_chars = text.chars().count();
        if total_chars == 0 {
            return 0.0;
        }

        // The pattern matches exactly one character, so the number of matches
        // equals the number of Chinese characters.
        let chinese_chars = CHINESE_REGEX.find_iter(text).count();
        chinese_chars as f32 / total_chars as f32
    }

    /// Returns `true` when the trimmed, ASCII-lowercased text is one of the
    /// known functional words.
    pub fn is_functional_word(text: &str) -> bool {
        let normalized = text.trim().to_ascii_lowercase();
        FUNCTIONAL_WORDS.contains(normalized.as_str())
    }

    /// Fraction of the characters of `text` that are alphabetic.
    ///
    /// Returns `0.0` for empty input.
    pub fn alphabetic_ratio(text: &str) -> f32 {
        char_ratio(text, char::is_alphabetic)
    }

    /// Fraction of the characters of `text` that are neither alphanumeric nor
    /// whitespace.
    ///
    /// Returns `0.0` for empty input.
    pub fn special_char_density(text: &str) -> f32 {
        char_ratio(text, |c| !c.is_alphanumeric() && !c.is_whitespace())
    }
}

// ============================================================================
// 函数式文本过滤器
// ============================================================================

/// Purely functional text filter.
///
/// The type has no fields.
pub struct TextFilter;

impl TextFilter {
    /// Creates a new filter.
    pub fn new() -> Self {
        Self
    }

    /// Decides whether `text` should be translated (main entry point).
    ///
    /// Shorthand for `self.analyze_text(text).should_translate`.
    pub fn should_translate(&self, text: &str) -> bool {
        self.analyze_text(text).should_translate
    }

    /// Analyses `text` and returns its score, classification and features.
    ///
    /// Text that fails the basic length check is rejected immediately with a
    /// score of `0.0` and the single feature `"too_short"`. Otherwise the
    /// score starts at `1.0` and is multiplied by a penalty factor for every
    /// signal that fires; the label of each firing signal is recorded in
    /// `features`. The text should be translated when the final score is
    /// greater than `0.5`.
    pub fn analyze_text(&self, text: &str) -> TextAnalysis {
        use analyzers::*;

        // Basic checks.
        if !check_length(text) || is_whitespace_only(text) {
            return TextAnalysis {
                should_translate: false,
                translatability_score: 0.0,
                text_type: TextType::Other,
                priority: TextPriority::Low,
                features: vec!["too_short".to_string()],
            };
        }

        // (signal fired, feature label, score factor), applied in order.
        let signals: [(bool, &str, f32); 8] = [
            (is_numeric_only(text), "numeric", 0.1),
            (is_url(text), "url", 0.0),
            (is_email(text), "email", 0.0),
            (is_code(text), "code", 0.2),
            (is_functional_word(text), "functional", 0.3),
            // Already Chinese, no translation needed.
            (chinese_char_ratio(text) > 0.3, "chinese", 0.1),
            (alphabetic_ratio(text) < 0.3, "low_alphabetic", 0.5),
            (special_char_density(text) > 0.5, "high_special_chars", 0.4),
        ];

        let mut features = Vec::new();
        let mut score = 1.0f32;
        for (fired, label, factor) in signals {
            if fired {
                features.push(label.to_string());
                score *= factor;
            }
        }

        // Determine text type and priority.
        let text_type = self.infer_text_type(text);
        let priority = self.infer_priority(&text_type, Self::visible_len(text));

        TextAnalysis {
            should_translate: score > 0.5,
            translatability_score: score,
            text_type,
            priority,
            features,
        }
    }

    /// Number of characters in `text` once surrounding whitespace is removed.
    ///
    /// Characters are counted instead of bytes so that multi-byte scripts are
    /// measured on the same scale as ASCII, and trimming keeps indentation or
    /// newlines around the text from inflating the length.
    fn visible_len(text: &str) -> usize {
        text.trim().chars().count()
    }

    /// Infers the text type from the visible length and the presence of
    /// sentence punctuation (`. ! ?` and their full-width forms).
    fn infer_text_type(&self, text: &str) -> TextType {
        let len = Self::visible_len(text);
        let has_punctuation = text.chars().any(|c| ".!?。！？".contains(c));

        match (has_punctuation, len) {
            (false, n) if n < 10 => TextType::Label,
            (false, n) if n < 50 => TextType::Button,
            (true, n) if n > 20 => TextType::Content,
            _ => TextType::Other,
        }
    }

    /// Infers the priority from the text type and the text length.
    ///
    /// `length` only matters for `Content`, which is `High` above 100 and
    /// `Normal` otherwise.
    fn infer_priority(&self, text_type: &TextType, length: usize) -> TextPriority {
        match text_type {
            TextType::Title => TextPriority::Critical,
            TextType::Content if length > 100 => TextPriority::High,
            TextType::Content => TextPriority::Normal,
            TextType::Button | TextType::Link => TextPriority::High,
            TextType::Label | TextType::Alt => TextPriority::Normal,
            TextType::Placeholder => TextPriority::Low,
            TextType::Other => TextPriority::Low,
        }
    }

    /// Keeps only the texts that should be translated (parallel version).
    #[cfg(feature = "translation")]
    pub fn filter_texts_parallel(&self, texts: Vec<String>) -> Vec<String> {
        texts
            .into_par_iter()
            .filter(|text| self.should_translate(text))
            .collect()
    }

    /// Keeps only the texts that should be translated (serial version, used
    /// when the `translation` feature is disabled).
    #[cfg(not(feature = "translation"))]
    pub fn filter_texts_parallel(&self, texts: Vec<String>) -> Vec<String> {
        texts
            .into_iter()
            .filter(|text| self.should_translate(text))
            .collect()
    }
}

impl Default for TextFilter {
    fn default() -> Self {
        Self::new()
    }
}

// ============================================================================
// 函数式批次管理器
// ============================================================================

/// Purely functional batch manager.
pub struct BatchManager {
    // Atomic counter holding the next batch id.
    next_id: std::sync::atomic::AtomicUsize,
}

impl BatchManager {
    /// Creates a manager whose first batch will receive id 1.
    pub fn new() -> Self {
        Self {
            next_id: std::sync::atomic::AtomicUsize::new(1),
        }
    }

    /// Builds batches from `items` (main entry point).
    ///
    /// Items whose text is empty or whitespace-only are discarded. Every other
    /// item ends up in exactly one batch. The returned batches are ordered
    /// from highest to lowest priority.
    pub fn create_batches(&self, items: Vec<TextItem>) -> Vec<Batch> {
        items
            .into_iter()
            .filter(|item| !item.text.trim().is_empty())
            .collect::<Vec<_>>()
            .pipe(|items| self.group_by_priority(items))
            .pipe(|groups| self.optimize_batch_sizes(groups))
            .pipe(|batches| self.sort_by_priority(batches))
    }

    /// Groups items by priority and returns the groups from highest to lowest
    /// priority. Items keep their relative order inside each group.
    fn group_by_priority(&self, items: Vec<TextItem>) -> Vec<Vec<TextItem>> {
        use std::collections::BTreeMap;

        let mut groups: BTreeMap<TextPriority, Vec<TextItem>> = BTreeMap::new();
        for item in items {
            groups.entry(item.priority.clone()).or_default().push(item);
        }

        // The map iterates in ascending key order; reverse it so the highest
        // priority group comes first.
        groups.into_values().rev().collect()
    }

    /// Splits `items` into consecutive chunks of at most `size` items.
    ///
    /// `size` must be greater than zero.
    fn split_into_chunks(items: Vec<TextItem>, size: usize) -> Vec<Vec<TextItem>> {
        let mut chunks = Vec::new();
        let mut rest = items.into_iter().peekable();
        while rest.peek().is_some() {
            chunks.push(rest.by_ref().take(size).collect());
        }
        chunks
    }

    /// Turns priority groups into batches of a reasonable size.
    ///
    /// Each group is split into chunks of at most `MAX_BATCH_SIZE` items. A
    /// chunk becomes a batch of its own when it has at least `MIN_BATCH_SIZE`
    /// items or contains an item of priority `High` or above. Smaller chunks
    /// are not discarded: their items are pooled and emitted as additional
    /// catch-all batches, so no text is lost.
    fn optimize_batch_sizes(&self, groups: Vec<Vec<TextItem>>) -> Vec<Batch> {
        const MAX_BATCH_SIZE: usize = 50;
        const MIN_BATCH_SIZE: usize = 5;

        let mut batches = Vec::new();
        let mut leftovers = Vec::new();

        for group in groups {
            for chunk in Self::split_into_chunks(group, MAX_BATCH_SIZE) {
                let urgent = chunk
                    .iter()
                    .any(|item| item.priority >= TextPriority::High);
                if urgent || chunk.len() >= MIN_BATCH_SIZE {
                    batches.push(self.create_batch(chunk));
                } else {
                    leftovers.extend(chunk);
                }
            }
        }

        // Undersized chunks are merged instead of dropped; a merged batch may
        // mix priorities and takes the highest one among its items.
        batches.extend(
            Self::split_into_chunks(leftovers, MAX_BATCH_SIZE)
                .into_iter()
                .map(|chunk| self.create_batch(chunk)),
        );

        batches
    }

    /// Creates a single batch from `items`.
    ///
    /// The batch takes the next id from the atomic counter, the highest
    /// priority among its items (`Low` when `items` is empty) and the total
    /// number of characters of its texts as `estimated_chars`.
    fn create_batch(&self, items: Vec<TextItem>) -> Batch {
        let id = self.next_id.fetch_add(1, std::sync::atomic::Ordering::SeqCst);

        let priority = items
            .iter()
            .map(|item| &item.priority)
            .max()
            .cloned()
            .unwrap_or(TextPriority::Low);

        // Count characters rather than bytes so non-ASCII text is not
        // over-estimated.
        let estimated_chars = items
            .iter()
            .map(|item| item.text.chars().count())
            .sum();

        Batch {
            id,
            items,
            priority,
            estimated_chars,
            created_at: Instant::now(),
        }
    }

    /// Sorts batches by descending priority, then by creation time.
    fn sort_by_priority(&self, mut batches: Vec<Batch>) -> Vec<Batch> {
        batches.sort_by(|a, b| {
            b.priority
                .cmp(&a.priority)
                .then_with(|| a.created_at.cmp(&b.created_at))
        });
        batches
    }
}

impl Default for BatchManager {
    fn default() -> Self {
        Self::new()
    }
}

// ============================================================================
// 函数式管道操作符
// ============================================================================

/// Functional pipe operator: `value.pipe(f)` evaluates to `f(value)`.
///
/// Lets a chain of transformations be written top to bottom instead of as
/// nested calls.
trait Pipe: Sized {
    /// Consumes `self`, passes it to `f` and returns the result.
    fn pipe<F, R>(self, f: F) -> R
    where
        F: FnOnce(Self) -> R,
    {
        f(self)
    }
}

/// Every sized type gets `pipe` through the provided default method.
impl<T> Pipe for T {}

// ============================================================================
// 便利函数
// ============================================================================

/// 快速创建文本项
pub fn create_text_item(text: String, location: String) -> TextItem {
    let filter = TextFilter::new();
    let analysis = filter.analyze_text(&text);
    
    TextItem {
        complexity: analysis.translatability_score,
        text,
        text_type: analysis.text_type,
        priority: analysis.priority,
        location,
    }
}

/// Convenience helper: analyses every text in `texts`.
///
/// Returns one [`TextAnalysis`] per input, in input order. A parallel iterator
/// is used when the `translation` feature is enabled, a plain iterator
/// otherwise.
pub fn batch_analyze_texts(texts: &[String]) -> Vec<TextAnalysis> {
    let filter = TextFilter::new();
    let analyze = |text: &String| filter.analyze_text(text);

    #[cfg(feature = "translation")]
    let analyses: Vec<TextAnalysis> = texts.par_iter().map(analyze).collect();

    #[cfg(not(feature = "translation"))]
    let analyses: Vec<TextAnalysis> = texts.iter().map(analyze).collect();

    analyses
}

/// Convenience helper: builds batches from `items` with a fresh
/// [`BatchManager`].
pub fn create_optimized_batches(items: Vec<TextItem>) -> Vec<Batch> {
    BatchManager::new().create_batches(items)
}

// ============================================================================
// 测试模块
// ============================================================================

// Unit tests for the filter, the analyzers and the batch manager.
#[cfg(test)]
mod tests {
    use super::*;

    // Plain prose is accepted; empty, blank, numeric, URL and e-mail inputs
    // are rejected.
    #[test]
    fn test_text_filter_basic() {
        let filter = TextFilter::new();

        assert!(filter.should_translate("Hello World"));
        assert!(!filter.should_translate(""));
        assert!(!filter.should_translate("   "));
        assert!(!filter.should_translate("123"));
        assert!(!filter.should_translate("https://example.com"));
        assert!(!filter.should_translate("test@example.com"));
    }

    // Spot checks for the individual analyzer predicates.
    #[test]
    fn test_analyzers() {
        use analyzers::*;

        assert!(check_length("Hello"));
        assert!(!check_length("H"));
        assert!(!check_length(""));

        assert!(is_whitespace_only("   "));
        assert!(!is_whitespace_only("Hello"));

        assert!(is_numeric_only("123"));
        assert!(!is_numeric_only("123abc"));

        assert!(is_url("https://example.com"));
        assert!(!is_url("hello world"));

        assert!(is_email("test@example.com"));
        assert!(!is_email("not an email"));
    }

    // Five analysed items must produce at least one batch.
    #[test]
    fn test_batch_manager() {
        let manager = BatchManager::new();
        let items = vec![
            create_text_item("Hello World, this is a longer text".to_string(), "p1".to_string()),
            create_text_item("Another longer text for testing".to_string(), "p2".to_string()),
            create_text_item("Third longer text item".to_string(), "p3".to_string()),
            create_text_item("Fourth longer text item".to_string(), "p4".to_string()),
            create_text_item("Fifth longer text item".to_string(), "p5".to_string()),
        ];

        let batches = manager.create_batches(items);
        assert!(!batches.is_empty());
    }

    // A short plain phrase is translatable and classified as a short UI text.
    #[test]
    fn test_text_analysis() {
        let filter = TextFilter::new();
        let analysis = filter.analyze_text("Hello World");

        assert!(analysis.should_translate);
        assert!(analysis.translatability_score > 0.5);
        // The inferred text type may be Button or Label; both are accepted.
        assert!(matches!(analysis.text_type, TextType::Button | TextType::Label));
    }
}