html_translation_lib/pipeline/
collector.rs

1//! 文本收集器模块
2//!
3//! 从HTML DOM中收集可翻译的文本内容
4
5use crate::error::{TranslationError, TranslationResult};
6use markup5ever_rcdom::{Handle, NodeData, RcDom};
7use std::collections::HashSet;
8
/// Collects translatable text from an HTML DOM.
///
/// Holds the two lookup tables that drive the traversal: which elements are
/// skipped entirely and which attributes are harvested from which elements.
/// Derives `Debug` and `Clone` like the other public types in this module.
#[derive(Debug, Clone)]
pub struct TextCollector {
    /// Tag names whose whole subtree is ignored during collection.
    skip_tags: HashSet<String>,

    /// Maps a tag name to the attribute names whose values are collected.
    attribute_tags: std::collections::HashMap<String, Vec<String>>,
}
17
18impl TextCollector {
19    /// 创建新的文本收集器
20    pub fn new() -> Self {
21        let mut skip_tags = HashSet::new();
22        skip_tags.insert("script".to_string());
23        skip_tags.insert("style".to_string());
24        skip_tags.insert("noscript".to_string());
25        skip_tags.insert("code".to_string());
26        skip_tags.insert("pre".to_string());
27        
28        let mut attribute_tags = std::collections::HashMap::new();
29        attribute_tags.insert("img".to_string(), vec!["alt".to_string(), "title".to_string()]);
30        attribute_tags.insert("input".to_string(), vec!["placeholder".to_string(), "value".to_string()]);
31        attribute_tags.insert("textarea".to_string(), vec!["placeholder".to_string()]);
32        attribute_tags.insert("a".to_string(), vec!["title".to_string()]);
33        
34        Self {
35            skip_tags,
36            attribute_tags,
37        }
38    }
39    
40    /// 从DOM中收集文本
41    pub fn collect_from_dom(&self, dom: &RcDom) -> TranslationResult<Vec<TextItem>> {
42        let mut items = Vec::new();
43        self.walk_node(&dom.document, &mut items, 0)?;
44        Ok(items)
45    }
46    
47    /// 递归遍历DOM节点
48    fn walk_node(&self, node: &Handle, items: &mut Vec<TextItem>, depth: usize) -> TranslationResult<()> {
49        if depth > 100 {
50            return Err(TranslationError::InternalError("DOM深度超限".to_string()));
51        }
52        
53        match &node.data {
54            NodeData::Element { name, attrs, .. } => {
55                let tag_name = name.local.to_string();
56                
57                // 检查是否应该跳过此标签
58                if !self.skip_tags.contains(&tag_name) {
59                    // 收集属性文本
60                    if let Some(attr_names) = self.attribute_tags.get(&tag_name) {
61                        let attrs = attrs.borrow();
62                        for attr in attrs.iter() {
63                            let attr_name = attr.name.local.to_string();
64                            if attr_names.contains(&attr_name) {
65                                let text = attr.value.to_string();
66                                if !text.trim().is_empty() {
67                                    items.push(TextItem {
68                                        text,
69                                        text_type: TextType::Attribute(attr_name),
70                                        node: node.clone(),
71                                        tag_name: tag_name.clone(),
72                                    });
73                                }
74                            }
75                        }
76                    }
77                    
78                    // 递归处理子节点
79                    for child in node.children.borrow().iter() {
80                        self.walk_node(child, items, depth + 1)?;
81                    }
82                }
83            }
84            NodeData::Text { contents } => {
85                let text = contents.borrow().to_string();
86                let trimmed = text.trim();
87                if !trimmed.is_empty() {
88                    items.push(TextItem {
89                        text: trimmed.to_string(),
90                        text_type: TextType::Content,
91                        node: node.clone(),
92                        tag_name: "text".to_string(),
93                    });
94                }
95            }
96            _ => {
97                // 处理其他节点类型的子节点
98                for child in node.children.borrow().iter() {
99                    self.walk_node(child, items, depth + 1)?;
100                }
101            }
102        }
103        
104        Ok(())
105    }
106}
107
108impl Default for TextCollector {
109    fn default() -> Self {
110        Self::new()
111    }
112}
113
114/// 文本项结构体
115#[derive(Debug, Clone)]
116pub struct TextItem {
117    /// 文本内容
118    pub text: String,
119    
120    /// 文本类型
121    pub text_type: TextType,
122    
123    /// 对应的DOM节点
124    pub node: Handle,
125    
126    /// 标签名称
127    pub tag_name: String,
128}
129
130impl TextItem {
131    /// 应用翻译到DOM节点
132    pub fn apply_translation(&self, translation: &str) -> TranslationResult<()> {
133        match &self.text_type {
134            TextType::Content => {
135                if let NodeData::Text { contents } = &self.node.data {
136                    let mut contents = contents.borrow_mut();
137                    *contents = translation.into();
138                }
139            }
140            TextType::Attribute(attr_name) => {
141                if let NodeData::Element { attrs, .. } = &self.node.data {
142                    let mut attrs = attrs.borrow_mut();
143                    for attr in attrs.iter_mut() {
144                        if attr.name.local.as_ref() == attr_name {
145                            attr.value = translation.into();
146                            break;
147                        }
148                    }
149                }
150            }
151            _ => {}
152        }
153        Ok(())
154    }
155}
156
/// Classifies where a piece of collected text lives.
///
/// Derives `PartialEq`, `Eq` and `Hash` so values can be compared with `==`,
/// checked with `assert_eq!`, and used as set or map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TextType {
    /// Text content of a text node.
    Content,

    /// Heading/title text.
    Title,

    /// Link text.
    Link,

    /// Button text.
    Button,

    /// Form label text.
    FormLabel,

    /// Image `alt` attribute.
    ImageAlt,

    /// Tooltip text.
    Tooltip,

    /// Value of the attribute with the given name.
    Attribute(String),
}
184
#[cfg(test)]
mod tests {
    use super::*;
    use html5ever::parse_document;
    use html5ever::tendril::TendrilSink;
    use markup5ever_rcdom::RcDom;

    /// Parses an HTML string into an `RcDom` with default parser options.
    fn parse_html(html: &str) -> RcDom {
        parse_document(RcDom::default(), Default::default())
            .from_utf8()
            .read_from(&mut html.as_bytes())
            .unwrap()
    }

    /// Runs a default collector over `dom`, failing the test on error.
    fn collect(dom: &RcDom) -> Vec<TextItem> {
        match TextCollector::new().collect_from_dom(dom) {
            Ok(items) => items,
            Err(_) => panic!("collecting from a small document must succeed"),
        }
    }

    #[test]
    fn test_text_collector_new() {
        let collector = TextCollector::new();

        for tag in ["script", "style", "noscript", "code", "pre"].iter() {
            assert!(collector.skip_tags.contains(*tag), "missing skip tag {}", tag);
        }

        for tag in ["img", "input", "textarea", "a"].iter() {
            assert!(
                collector.attribute_tags.contains_key(*tag),
                "missing attribute tag {}",
                tag
            );
        }
    }

    #[test]
    fn test_collect_text_skips_script_and_style() {
        let dom = parse_html(
            "<p>  Hello world  </p><script>var x = 1;</script><style>p { color: red; }</style>",
        );
        let items = collect(&dom);

        let texts: Vec<&str> = items.iter().map(|item| item.text.as_str()).collect();
        assert_eq!(texts, vec!["Hello world"]);
        assert!(matches!(&items[0].text_type, TextType::Content));
        assert_eq!(items[0].tag_name, "text");
    }

    #[test]
    fn test_collect_attribute_text() {
        let dom = parse_html(r#"<img src="cat.png" alt="A cat">"#);
        let items = collect(&dom);

        assert_eq!(items.len(), 1);
        assert_eq!(items[0].text, "A cat");
        assert_eq!(items[0].tag_name, "img");
        assert!(matches!(&items[0].text_type, TextType::Attribute(name) if name == "alt"));
    }

    #[test]
    fn test_apply_translation_updates_dom() {
        let dom = parse_html(r#"<a href="/" title="Home">Start</a>"#);
        let items = collect(&dom);

        // An element's attributes are collected before its child text.
        assert_eq!(items.len(), 2);
        assert!(items[0].apply_translation("Accueil").is_ok());
        assert!(items[1].apply_translation("Debut").is_ok());

        let texts: Vec<String> = collect(&dom).into_iter().map(|item| item.text).collect();
        assert_eq!(texts, vec!["Accueil".to_string(), "Debut".to_string()]);
    }
}
214}