markdown_translator/
collector.rs

1//! 简化的DOM文本收集器模块
2//!
3//! 这个模块提供了函数式风格的DOM文本收集功能,使用迭代器和组合子模式
4//! 来高效地从HTML文档中提取可翻译的文本内容。
5//!
6//! ## 设计原则
7//!
8//! - **函数式**: 使用纯函数和不可变数据结构
9//! - **迭代器驱动**: 利用Rust迭代器的惰性求值和链式操作
10//! - **最小化配置**: 简化配置选项,专注核心功能
11//! - **高效过滤**: 集成智能文本过滤器
12
13use std::collections::HashSet;
14
15use crate::functional::{TextItem, TextType, TextPriority, TextFilter, create_text_item};
16
17// ============================================================================
18// DOM节点抽象
19// ============================================================================
20
/// Minimal, backend-agnostic view of a DOM node.
///
/// The collector only reaches the document through this trait, so any DOM
/// library (or a hand-built tree in tests) can be used by implementing it.
pub trait DomNode {
    /// Whether this node is a text node.
    fn is_text_node(&self) -> bool;
    /// Tag name of the node, if it has one.
    fn tag_name(&self) -> Option<String>;
    /// Text content of the node, if it has any.
    fn text_content(&self) -> Option<String>;
    /// Value of the attribute called `name`, or `None` when it is not set.
    fn get_attribute(&self, name: &str) -> Option<String>;
    /// Child nodes of this node, returned as owned trait objects.
    fn children(&self) -> Vec<Box<dyn DomNode>>;
}
35
36// ============================================================================
37// 文本收集器
38// ============================================================================
39
40/// 简化的文本收集器
41/// 使用函数式编程风格,专注于核心的文本收集功能
42pub struct TextCollector {
43    filter: TextFilter,
44    translatable_attributes: HashSet<&'static str>,
45}
46
47impl TextCollector {
48    pub fn new() -> Self {
49        Self {
50            filter: TextFilter::new(),
51            translatable_attributes: Self::default_translatable_attributes(),
52        }
53    }
54
55    /// 默认的可翻译属性列表
56    fn default_translatable_attributes() -> HashSet<&'static str> {
57        [
58            "title", "alt", "placeholder", "aria-label", "aria-describedby",
59            "data-tooltip", "data-title", "value"
60        ].into_iter().collect()
61    }
62
63    /// 从DOM节点收集可翻译文本(主要入口函数)
64    pub fn collect_texts(&self, root: &dyn DomNode) -> Vec<TextItem> {
65        self.collect_from_node(root, "root".to_string(), 0)
66            .into_iter()
67            .filter(|item| self.filter.should_translate(&item.text))
68            .collect::<Vec<_>>()
69            .pipe(|items| self.deduplicate_texts(items))
70            .pipe(|items| self.sort_by_priority(items))
71    }
72
73    /// 从单个节点收集文本(递归函数)
74    fn collect_from_node(&self, node: &dyn DomNode, location: String, depth: usize) -> Vec<TextItem> {
75        let mut items = Vec::new();
76
77        // 收集属性文本
78        items.extend(self.collect_attribute_texts(node, &location));
79
80        // 收集文本内容
81        if node.is_text_node() {
82            if let Some(text) = node.text_content() {
83                let trimmed = text.trim();
84                if !trimmed.is_empty() {
85                    items.push(create_text_item(trimmed.to_string(), location.clone()));
86                }
87            }
88        }
89
90        // 递归收集子节点
91        for (i, child) in node.children().iter().enumerate() {
92            let child_location = format!("{}/child[{}]", location, i);
93            items.extend(self.collect_from_node(child.as_ref(), child_location, depth + 1));
94        }
95
96        items
97    }
98
99    /// 收集元素属性中的文本
100    fn collect_attribute_texts(&self, node: &dyn DomNode, location: &str) -> Vec<TextItem> {
101        self.translatable_attributes
102            .iter()
103            .filter_map(|&attr_name| {
104                node.get_attribute(attr_name).map(|value| {
105                    let location = format!("{}@{}", location, attr_name);
106                    let mut item = create_text_item(value, location);
107                    
108                    // 根据属性类型调整文本类型
109                    item.text_type = match attr_name {
110                        "title" | "data-title" => TextType::Title,
111                        "alt" => TextType::Alt,
112                        "placeholder" => TextType::Placeholder,
113                        _ => TextType::Other,
114                    };
115                    
116                    item
117                })
118            })
119            .collect()
120    }
121
122    /// 去重相同的文本
123    fn deduplicate_texts(&self, items: Vec<TextItem>) -> Vec<TextItem> {
124        let mut seen = HashSet::new();
125        items
126            .into_iter()
127            .filter(|item| {
128                let key = item.text.trim().to_lowercase();
129                seen.insert(key)
130            })
131            .collect()
132    }
133
134    /// 按优先级排序
135    fn sort_by_priority(&self, mut items: Vec<TextItem>) -> Vec<TextItem> {
136        items.sort_by(|a, b| {
137            b.priority.cmp(&a.priority)
138                .then_with(|| a.text.len().cmp(&b.text.len()))
139        });
140        items
141    }
142}
143
144impl Default for TextCollector {
145    fn default() -> Self {
146        Self::new()
147    }
148}
149
150// ============================================================================
151// 函数式管道操作符
152// ============================================================================
153
/// Postfix function application: `value.pipe(f)` evaluates to `f(value)`.
///
/// This lets a by-value transformation be appended to a method chain.
trait Pipe: Sized {
    /// Consumes `self`, hands it to `f` and returns whatever `f` returns.
    fn pipe<F: FnOnce(Self) -> R, R>(self, f: F) -> R;
}

/// Blanket implementation, so every sized type can be piped.
impl<T> Pipe for T {
    fn pipe<F: FnOnce(T) -> R, R>(self, f: F) -> R {
        f(self)
    }
}
168
169// ============================================================================
170// 便利函数
171// ============================================================================
172
173/// 快速收集文本的便利函数
174pub fn collect_translatable_texts(root: &dyn DomNode) -> Vec<TextItem> {
175    let collector = TextCollector::new();
176    collector.collect_texts(root)
177}
178
179/// 按文本类型分组
180pub fn group_texts_by_type(items: Vec<TextItem>) -> std::collections::HashMap<TextType, Vec<TextItem>> {
181    let mut groups = std::collections::HashMap::new();
182    
183    for item in items {
184        groups.entry(item.text_type.clone()).or_insert_with(Vec::new).push(item);
185    }
186    
187    groups
188}
189
190/// 按优先级分组
191pub fn group_texts_by_priority(items: Vec<TextItem>) -> std::collections::HashMap<TextPriority, Vec<TextItem>> {
192    let mut groups = std::collections::HashMap::new();
193    
194    for item in items {
195        let priority = item.priority.clone();
196        groups.entry(priority).or_insert_with(Vec::new).push(item);
197    }
198    
199    groups
200}
201
202// ============================================================================
203// 测试用的简单DOM节点实现
204// ============================================================================
205
/// In-memory DOM node used by the unit tests.
///
/// Children are stored as concrete `TestDomNode`s so that the whole subtree
/// can be cloned; `DomNode::children` has to hand out owned nodes, and a
/// shallow copy would silently drop attributes and grandchildren.
#[cfg(test)]
#[derive(Clone, Debug)]
pub struct TestDomNode {
    /// Tag name; `None` for text nodes built with `new_text`.
    pub tag_name: Option<String>,
    /// Text content; `None` for elements built with `new_element`.
    pub text_content: Option<String>,
    /// Attribute name -> value.
    pub attributes: std::collections::HashMap<String, String>,
    /// Child nodes, in insertion order.
    pub children: Vec<TestDomNode>,
    /// Whether this node reports itself as a text node.
    pub is_text: bool,
}

#[cfg(test)]
impl TestDomNode {
    /// Creates an element node named `tag` with no attributes or children.
    pub fn new_element(tag: &str) -> Self {
        Self {
            tag_name: Some(tag.to_string()),
            text_content: None,
            attributes: std::collections::HashMap::new(),
            children: Vec::new(),
            is_text: false,
        }
    }

    /// Creates a text node holding `content`.
    pub fn new_text(content: &str) -> Self {
        Self {
            tag_name: None,
            text_content: Some(content.to_string()),
            attributes: std::collections::HashMap::new(),
            children: Vec::new(),
            is_text: true,
        }
    }

    /// Builder step: sets attribute `name` to `value`, replacing any previous
    /// value.
    pub fn with_attribute(mut self, name: &str, value: &str) -> Self {
        self.attributes.insert(name.to_string(), value.to_string());
        self
    }

    /// Builder step: appends `child` as the last child.
    pub fn with_child(mut self, child: TestDomNode) -> Self {
        self.children.push(child);
        self
    }
}

#[cfg(test)]
impl DomNode for TestDomNode {
    fn text_content(&self) -> Option<String> {
        self.text_content.clone()
    }

    fn tag_name(&self) -> Option<String> {
        self.tag_name.clone()
    }

    fn get_attribute(&self, name: &str) -> Option<String> {
        self.attributes.get(name).cloned()
    }

    /// Returns deep copies of the children, so attributes and nested
    /// descendants are preserved at every level of the tree.
    fn children(&self) -> Vec<Box<dyn DomNode>> {
        self.children
            .iter()
            .cloned()
            .map(|child| Box::new(child) as Box<dyn DomNode>)
            .collect()
    }

    fn is_text_node(&self) -> bool {
        self.is_text
    }
}
281
282// ============================================================================
283// 测试
284// ============================================================================
285
#[cfg(test)]
mod tests {
    use super::*;

    /// A single text node under the root is collected verbatim.
    #[test]
    fn test_text_collector_basic() {
        let collector = TextCollector::new();

        let root = TestDomNode::new_element("div")
            .with_child(TestDomNode::new_text("Hello World"));

        let texts = collector.collect_texts(&root);

        assert_eq!(texts.len(), 1);
        assert_eq!(texts[0].text, "Hello World");
    }

    /// Both `alt` and `title` are collected and tagged with their own type.
    #[test]
    fn test_attribute_collection() {
        let collector = TextCollector::new();

        let root = TestDomNode::new_element("img")
            .with_attribute("alt", "Beautiful sunset")
            .with_attribute("title", "Sunset Photo");

        let texts = collector.collect_texts(&root);

        assert_eq!(texts.len(), 2);

        let alt_text = texts
            .iter()
            .find(|t| t.text == "Beautiful sunset")
            .expect("the alt attribute should be collected");
        assert_eq!(alt_text.text_type, TextType::Alt);

        let title_text = texts
            .iter()
            .find(|t| t.text == "Sunset Photo")
            .expect("the title attribute should be collected");
        assert_eq!(title_text.text_type, TextType::Title);
    }

    /// Repeated texts are reported once; distinct texts all survive.
    #[test]
    fn test_deduplication() {
        let collector = TextCollector::new();

        let root = TestDomNode::new_element("div")
            .with_child(TestDomNode::new_text("Same text"))
            .with_child(TestDomNode::new_text("Same text"))
            .with_child(TestDomNode::new_text("Different text"));

        let texts = collector.collect_texts(&root);

        // Three nodes, two distinct texts.
        assert_eq!(texts.len(), 2);
        assert!(texts.iter().any(|t| t.text == "Same text"));
        assert!(texts.iter().any(|t| t.text == "Different text"));
    }

    /// The free-function shortcut yields results for a simple paragraph.
    #[test]
    fn test_collect_translatable_texts_function() {
        let root = TestDomNode::new_element("p")
            .with_child(TestDomNode::new_text("This is a paragraph"));

        let texts = collect_translatable_texts(&root);
        assert!(!texts.is_empty());
    }

    /// Grouping never loses or duplicates items.
    #[test]
    fn test_grouping_functions() {
        let items = vec![
            create_text_item("Title text".to_string(), "h1".to_string()),
            create_text_item("Button text".to_string(), "button".to_string()),
        ];

        let grouped_by_type = group_texts_by_type(items.clone());
        assert!(!grouped_by_type.is_empty());
        assert_eq!(grouped_by_type.values().map(Vec::len).sum::<usize>(), 2);

        let grouped_by_priority = group_texts_by_priority(items);
        assert!(!grouped_by_priority.is_empty());
        assert_eq!(grouped_by_priority.values().map(Vec::len).sum::<usize>(), 2);
    }
}
356}