// markdown-translator 0.1.1
//
// A translation library with DeepLX API integration, rate limiting, and smart text chunking
// Documentation
//! 简化的DOM文本收集器模块
//!
//! 这个模块提供了函数式风格的DOM文本收集功能,使用迭代器和组合子模式
//! 来高效地从HTML文档中提取可翻译的文本内容。
//!
//! ## 设计原则
//!
//! - **函数式**: 使用纯函数和不可变数据结构
//! - **迭代器驱动**: 利用Rust迭代器的惰性求值和链式操作
//! - **最小化配置**: 简化配置选项,专注核心功能
//! - **高效过滤**: 集成智能文本过滤器

use std::collections::HashSet;

use crate::functional::{TextItem, TextType, TextPriority, TextFilter, create_text_item};

// ============================================================================
// DOM节点抽象
// ============================================================================

/// Simplified representation of a DOM node.
///
/// Modelled as a trait so that the collector is easy to test and can be
/// adapted to different DOM libraries.
pub trait DomNode {
    /// Returns the node's text content, if any.
    fn text_content(&self) -> Option<String>;
    /// Returns the node's tag name, if any.
    fn tag_name(&self) -> Option<String>;
    /// Returns the value of the attribute called `name`, if any.
    fn get_attribute(&self, name: &str) -> Option<String>;
    /// Returns the node's child nodes.
    fn children(&self) -> Vec<Box<dyn DomNode>>;
    /// Reports whether this node is a text node.
    fn is_text_node(&self) -> bool;
}

// ============================================================================
// 文本收集器
// ============================================================================

/// Simplified text collector.
///
/// Written in a functional programming style and focused on the core task of
/// collecting text.
pub struct TextCollector {
    // Filter whose `should_translate` decides which collected texts are kept.
    filter: TextFilter,
    // Names of the attributes whose values are collected as text.
    translatable_attributes: HashSet<&'static str>,
}

impl TextCollector {
    /// Creates a collector that uses `TextFilter::new()` and the default set
    /// of translatable attribute names.
    pub fn new() -> Self {
        Self {
            filter: TextFilter::new(),
            translatable_attributes: Self::default_translatable_attributes(),
        }
    }

    /// Default list of attributes whose values are treated as translatable.
    ///
    /// `aria-describedby` is intentionally not listed: its value is a list of
    /// element IDs rather than human-readable text, so sending it through
    /// translation would break the reference it encodes.
    fn default_translatable_attributes() -> HashSet<&'static str> {
        HashSet::from([
            "title",
            "alt",
            "placeholder",
            "aria-label",
            "data-tooltip",
            "data-title",
            "value",
        ])
    }

    /// Collects translatable text from a DOM tree (main entry point).
    ///
    /// The tree rooted at `root` is walked, every collected item is passed
    /// through the filter's `should_translate`, duplicates are removed, and
    /// the survivors are returned sorted by priority.
    pub fn collect_texts(&self, root: &dyn DomNode) -> Vec<TextItem> {
        self.collect_from_node(root, "root".to_string(), 0)
            .into_iter()
            .filter(|item| self.filter.should_translate(&item.text))
            .collect::<Vec<_>>()
            .pipe(|items| self.deduplicate_texts(items))
            .pipe(|items| self.sort_by_priority(items))
    }

    /// Recursively collects text items from `node` and its descendants.
    ///
    /// `location` is the path label attached to items found on this node;
    /// children get `"{location}/child[{index}]"`. `depth` is the nesting
    /// level relative to the root; it is only threaded through the recursion
    /// and does not limit it.
    fn collect_from_node(&self, node: &dyn DomNode, location: String, depth: usize) -> Vec<TextItem> {
        // Attribute texts of this node come first.
        let mut items = self.collect_attribute_texts(node, &location);

        // A text node contributes its trimmed content, unless that is empty.
        if node.is_text_node() {
            if let Some(text) = node.text_content() {
                let trimmed = text.trim();
                if !trimmed.is_empty() {
                    items.push(create_text_item(trimmed.to_string(), location.clone()));
                }
            }
        }

        // Descend into the children, extending the location path by index.
        for (i, child) in node.children().iter().enumerate() {
            let child_location = format!("{}/child[{}]", location, i);
            items.extend(self.collect_from_node(child.as_ref(), child_location, depth + 1));
        }

        items
    }

    /// Collects the text held in the translatable attributes of `node`.
    ///
    /// Each item is located at `"{location}@{attribute}"`. Values are trimmed
    /// and blank values are skipped, mirroring how text nodes are handled.
    fn collect_attribute_texts(&self, node: &dyn DomNode, location: &str) -> Vec<TextItem> {
        // `HashSet` iteration order is unspecified, and the later stable sort
        // and first-wins de-duplication both depend on collection order.
        // Visiting the names in sorted order keeps the output reproducible.
        let mut attr_names: Vec<&'static str> =
            self.translatable_attributes.iter().copied().collect();
        attr_names.sort_unstable();

        attr_names
            .into_iter()
            .filter_map(|attr_name| {
                let value = node.get_attribute(attr_name)?;
                let trimmed = value.trim();
                if trimmed.is_empty() {
                    return None;
                }

                let mut item = create_text_item(
                    trimmed.to_string(),
                    format!("{}@{}", location, attr_name),
                );

                // Adjust the text type according to the attribute it came from.
                item.text_type = match attr_name {
                    "title" | "data-title" => TextType::Title,
                    "alt" => TextType::Alt,
                    "placeholder" => TextType::Placeholder,
                    _ => TextType::Other,
                };

                Some(item)
            })
            .collect()
    }

    /// Removes items whose text duplicates an earlier item.
    ///
    /// Texts are compared after trimming and lower-casing; the first
    /// occurrence is the one that is kept.
    fn deduplicate_texts(&self, items: Vec<TextItem>) -> Vec<TextItem> {
        let mut seen = HashSet::new();
        items
            .into_iter()
            // `insert` returns false when the key was already present.
            .filter(|item| seen.insert(item.text.trim().to_lowercase()))
            .collect()
    }

    /// Sorts by descending priority, then by ascending text length in bytes.
    ///
    /// The sort is stable, so items that tie on both keys keep the order in
    /// which they were collected.
    fn sort_by_priority(&self, mut items: Vec<TextItem>) -> Vec<TextItem> {
        items.sort_by(|a, b| {
            b.priority
                .cmp(&a.priority)
                .then_with(|| a.text.len().cmp(&b.text.len()))
        });
        items
    }
}

impl Default for TextCollector {
    fn default() -> Self {
        Self::new()
    }
}

// ============================================================================
// 函数式管道操作符
// ============================================================================

/// Pipeline helper: lets a value be handed to a closure in method position,
/// so that `value.pipe(f)` evaluates to `f(value)`.
trait Pipe: Sized {
    /// Consumes `self`, passes it to `f`, and returns whatever `f` returns.
    fn pipe<F, R>(self, f: F) -> R
    where
        F: FnOnce(Self) -> R,
    {
        f(self)
    }
}

// Blanket implementation: every sized type picks up the provided `pipe`.
impl<T> Pipe for T {}

// ============================================================================
// 便利函数
// ============================================================================

/// Convenience function for collecting texts in one call.
///
/// Builds a collector with `TextCollector::new()` and returns the result of
/// its `collect_texts` for the tree rooted at `root`.
pub fn collect_translatable_texts(root: &dyn DomNode) -> Vec<TextItem> {
    TextCollector::new().collect_texts(root)
}

/// Groups items by their text type.
///
/// Every item ends up in the bucket keyed by a clone of its `text_type`;
/// within a bucket the items keep their input order.
pub fn group_texts_by_type(items: Vec<TextItem>) -> std::collections::HashMap<TextType, Vec<TextItem>> {
    let empty: std::collections::HashMap<TextType, Vec<TextItem>> =
        std::collections::HashMap::new();

    items.into_iter().fold(empty, |mut buckets, item| {
        buckets.entry(item.text_type.clone()).or_default().push(item);
        buckets
    })
}

/// Groups items by their priority.
///
/// Every item ends up in the bucket keyed by a clone of its `priority`;
/// within a bucket the items keep their input order.
pub fn group_texts_by_priority(items: Vec<TextItem>) -> std::collections::HashMap<TextPriority, Vec<TextItem>> {
    let empty: std::collections::HashMap<TextPriority, Vec<TextItem>> =
        std::collections::HashMap::new();

    items.into_iter().fold(empty, |mut buckets, item| {
        buckets.entry(item.priority.clone()).or_default().push(item);
        buckets
    })
}

// ============================================================================
// 测试用的简单DOM节点实现
// ============================================================================

#[cfg(test)]
/// Simple in-memory DOM node used by the unit tests in this file.
pub struct TestDomNode {
    /// Tag name; `new_element` sets it, `new_text` leaves it as `None`.
    pub tag_name: Option<String>,
    /// Text content; `new_text` sets it, `new_element` leaves it as `None`.
    pub text_content: Option<String>,
    /// Attribute name/value pairs added through `with_attribute`.
    pub attributes: std::collections::HashMap<String, String>,
    /// Child nodes added through `with_child`, in insertion order.
    pub children: Vec<Box<dyn DomNode>>,
    /// `true` for nodes built with `new_text`, `false` for `new_element`.
    pub is_text: bool,
}

#[cfg(test)]
impl TestDomNode {
    /// Starting point shared by the constructors: no tag, no text, no
    /// attributes, no children, and not a text node.
    fn blank() -> Self {
        Self {
            tag_name: None,
            text_content: None,
            attributes: std::collections::HashMap::new(),
            children: Vec::new(),
            is_text: false,
        }
    }

    /// Creates an element node with the given tag name.
    pub fn new_element(tag: &str) -> Self {
        Self {
            tag_name: Some(tag.to_owned()),
            ..Self::blank()
        }
    }

    /// Creates a text node holding `content`.
    pub fn new_text(content: &str) -> Self {
        Self {
            text_content: Some(content.to_owned()),
            is_text: true,
            ..Self::blank()
        }
    }

    /// Builder step: sets attribute `name` to `value`, replacing any earlier
    /// value stored under the same name.
    pub fn with_attribute(self, name: &str, value: &str) -> Self {
        let mut node = self;
        node.attributes.insert(name.to_owned(), value.to_owned());
        node
    }

    /// Builder step: appends `child` as the last child.
    pub fn with_child(self, child: TestDomNode) -> Self {
        let mut node = self;
        node.children.push(Box::new(child));
        node
    }
}

#[cfg(test)]
impl DomNode for TestDomNode {
    /// Returns a copy of the stored text content.
    fn text_content(&self) -> Option<String> {
        self.text_content.clone()
    }

    /// Returns a copy of the stored tag name.
    fn tag_name(&self) -> Option<String> {
        self.tag_name.clone()
    }

    /// Looks `name` up in the attribute map.
    fn get_attribute(&self, name: &str) -> Option<String> {
        self.attributes.get(name).cloned()
    }

    /// Returns freshly built copies of the child nodes.
    ///
    /// The trait hands out owned boxes, so each child is rebuilt from what
    /// the `DomNode` interface exposes. Descendants are carried over
    /// recursively through `child.children()`, so trees of any depth survive.
    /// Attributes of children are NOT carried over: the trait offers lookup
    /// by name only and no way to enumerate them.
    fn children(&self) -> Vec<Box<dyn DomNode>> {
        self.children
            .iter()
            .map(|child| {
                Box::new(TestDomNode {
                    tag_name: child.tag_name(),
                    text_content: child.text_content(),
                    // Cannot be copied through the trait; see the note above.
                    attributes: std::collections::HashMap::new(),
                    // Keep the whole subtree instead of cutting it off here.
                    children: child.children(),
                    is_text: child.is_text_node(),
                }) as Box<dyn DomNode>
            })
            .collect()
    }

    /// Reports the stored text-node flag.
    fn is_text_node(&self) -> bool {
        self.is_text
    }
}

// ============================================================================
// 测试
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    // A single text child of the root is collected verbatim.
    #[test]
    fn test_text_collector_basic() {
        let collector = TextCollector::new();
        
        let root = TestDomNode::new_element("div")
            .with_child(TestDomNode::new_text("Hello World"));
        
        let texts = collector.collect_texts(&root);
        
        assert_eq!(texts.len(), 1);
        assert_eq!(texts[0].text, "Hello World");
    }

    // Attribute values on the root are collected and typed by attribute name.
    #[test]
    fn test_attribute_collection() {
        let collector = TextCollector::new();
        
        let root = TestDomNode::new_element("img")
            .with_attribute("alt", "Beautiful sunset")
            .with_attribute("title", "Sunset Photo");
        
        let texts = collector.collect_texts(&root);
        
        assert_eq!(texts.len(), 2);
        // Verify that the alt and title attributes were both picked up.
        let alt_text = texts.iter().find(|t| t.text == "Beautiful sunset").unwrap();
        assert_eq!(alt_text.text_type, TextType::Alt);
    }

    // Identical texts appearing more than once are reported only once.
    #[test]
    fn test_deduplication() {
        let collector = TextCollector::new();
        
        let root = TestDomNode::new_element("div")
            .with_child(TestDomNode::new_text("Same text"))
            .with_child(TestDomNode::new_text("Same text"))
            .with_child(TestDomNode::new_text("Different text"));
        
        let texts = collector.collect_texts(&root);
        
        // De-duplication should leave only the 2 distinct texts.
        assert_eq!(texts.len(), 2);
    }

    // The free-function shortcut yields a non-empty result for a simple tree.
    #[test]
    fn test_collect_translatable_texts_function() {
        let root = TestDomNode::new_element("p")
            .with_child(TestDomNode::new_text("This is a paragraph"));
        
        let texts = collect_translatable_texts(&root);
        assert!(!texts.is_empty());
    }

    // Both grouping helpers produce at least one bucket for non-empty input.
    #[test]
    fn test_grouping_functions() {
        let items = vec![
            create_text_item("Title text".to_string(), "h1".to_string()),
            create_text_item("Button text".to_string(), "button".to_string()),
        ];
        
        let grouped_by_type = group_texts_by_type(items.clone());
        assert!(!grouped_by_type.is_empty());
        
        let grouped_by_priority = group_texts_by_priority(items);
        assert!(!grouped_by_priority.is_empty());
    }
}