html_translation_lib/pipeline/
collector.rs1use crate::error::{TranslationError, TranslationResult};
6use markup5ever_rcdom::{Handle, NodeData, RcDom};
7use std::collections::HashSet;
8
/// Walks a parsed HTML DOM and gathers the pieces of text that can be translated.
///
/// The two tables are filled in by [`TextCollector::new`] and are only read afterwards.
#[derive(Debug, Clone)]
pub struct TextCollector {
    /// Local names of elements whose whole subtree is ignored during collection.
    skip_tags: HashSet<String>,

    /// Maps an element's local name to the attribute names whose values are collected
    /// from that element.
    attribute_tags: std::collections::HashMap<String, Vec<String>>,
}
17
18impl TextCollector {
19 pub fn new() -> Self {
21 let mut skip_tags = HashSet::new();
22 skip_tags.insert("script".to_string());
23 skip_tags.insert("style".to_string());
24 skip_tags.insert("noscript".to_string());
25 skip_tags.insert("code".to_string());
26 skip_tags.insert("pre".to_string());
27
28 let mut attribute_tags = std::collections::HashMap::new();
29 attribute_tags.insert("img".to_string(), vec!["alt".to_string(), "title".to_string()]);
30 attribute_tags.insert("input".to_string(), vec!["placeholder".to_string(), "value".to_string()]);
31 attribute_tags.insert("textarea".to_string(), vec!["placeholder".to_string()]);
32 attribute_tags.insert("a".to_string(), vec!["title".to_string()]);
33
34 Self {
35 skip_tags,
36 attribute_tags,
37 }
38 }
39
40 pub fn collect_from_dom(&self, dom: &RcDom) -> TranslationResult<Vec<TextItem>> {
42 let mut items = Vec::new();
43 self.walk_node(&dom.document, &mut items, 0)?;
44 Ok(items)
45 }
46
47 fn walk_node(&self, node: &Handle, items: &mut Vec<TextItem>, depth: usize) -> TranslationResult<()> {
49 if depth > 100 {
50 return Err(TranslationError::InternalError("DOM深度超限".to_string()));
51 }
52
53 match &node.data {
54 NodeData::Element { name, attrs, .. } => {
55 let tag_name = name.local.to_string();
56
57 if !self.skip_tags.contains(&tag_name) {
59 if let Some(attr_names) = self.attribute_tags.get(&tag_name) {
61 let attrs = attrs.borrow();
62 for attr in attrs.iter() {
63 let attr_name = attr.name.local.to_string();
64 if attr_names.contains(&attr_name) {
65 let text = attr.value.to_string();
66 if !text.trim().is_empty() {
67 items.push(TextItem {
68 text,
69 text_type: TextType::Attribute(attr_name),
70 node: node.clone(),
71 tag_name: tag_name.clone(),
72 });
73 }
74 }
75 }
76 }
77
78 for child in node.children.borrow().iter() {
80 self.walk_node(child, items, depth + 1)?;
81 }
82 }
83 }
84 NodeData::Text { contents } => {
85 let text = contents.borrow().to_string();
86 let trimmed = text.trim();
87 if !trimmed.is_empty() {
88 items.push(TextItem {
89 text: trimmed.to_string(),
90 text_type: TextType::Content,
91 node: node.clone(),
92 tag_name: "text".to_string(),
93 });
94 }
95 }
96 _ => {
97 for child in node.children.borrow().iter() {
99 self.walk_node(child, items, depth + 1)?;
100 }
101 }
102 }
103
104 Ok(())
105 }
106}
107
108impl Default for TextCollector {
109 fn default() -> Self {
110 Self::new()
111 }
112}
113
114#[derive(Debug, Clone)]
116pub struct TextItem {
117 pub text: String,
119
120 pub text_type: TextType,
122
123 pub node: Handle,
125
126 pub tag_name: String,
128}
129
130impl TextItem {
131 pub fn apply_translation(&self, translation: &str) -> TranslationResult<()> {
133 match &self.text_type {
134 TextType::Content => {
135 if let NodeData::Text { contents } = &self.node.data {
136 let mut contents = contents.borrow_mut();
137 *contents = translation.into();
138 }
139 }
140 TextType::Attribute(attr_name) => {
141 if let NodeData::Element { attrs, .. } = &self.node.data {
142 let mut attrs = attrs.borrow_mut();
143 for attr in attrs.iter_mut() {
144 if attr.name.local.as_ref() == attr_name {
145 attr.value = translation.into();
146 break;
147 }
148 }
149 }
150 }
151 _ => {}
152 }
153 Ok(())
154 }
155}
156
/// Says which part of a DOM node a `TextItem` refers to.
///
/// The collector in this file only produces `Content` and `Attribute`; the remaining
/// variants are not produced here and `TextItem::apply_translation` writes nothing for them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TextType {
    /// The character data of a text node.
    Content,

    /// Not produced by the collector in this file.
    Title,

    /// Not produced by the collector in this file.
    Link,

    /// Not produced by the collector in this file.
    Button,

    /// Not produced by the collector in this file.
    FormLabel,

    /// Not produced by the collector in this file.
    ImageAlt,

    /// Not produced by the collector in this file.
    Tooltip,

    /// The value of the attribute with this local name on an element.
    Attribute(String),
}
184
#[cfg(test)]
mod tests {
    use super::*;
    use html5ever::parse_document;
    use html5ever::tendril::TendrilSink;
    use markup5ever_rcdom::RcDom;

    /// Parses `html` as a complete document into an `RcDom`.
    fn parse_html(html: &str) -> RcDom {
        parse_document(RcDom::default(), Default::default())
            .from_utf8()
            .read_from(&mut html.as_bytes())
            .unwrap()
    }

    /// `new()` must install the built-in skip list and attribute table.
    #[test]
    fn test_text_collector_new() {
        let collector = TextCollector::new();

        for tag in ["script", "style", "noscript", "code", "pre"].iter() {
            assert!(collector.skip_tags.contains(*tag), "missing skip tag {}", tag);
        }

        for tag in ["img", "input", "textarea", "a"].iter() {
            assert!(
                collector.attribute_tags.contains_key(*tag),
                "missing attribute entry for {}",
                tag
            );
        }
    }

    /// Text nodes and configured attributes are collected in document order, while the
    /// content of a skipped element is left out.
    #[test]
    fn test_collect_from_dom_text_and_attributes() {
        let dom = parse_html(
            r#"<html><body><p>Hello</p><script>var x = 1;</script><img alt="A cat"></body></html>"#,
        );

        let items = match TextCollector::new().collect_from_dom(&dom) {
            Ok(items) => items,
            Err(_) => panic!("collecting from a shallow document must succeed"),
        };

        let texts: Vec<&str> = items.iter().map(|item| item.text.as_str()).collect();
        assert_eq!(texts, ["Hello", "A cat"]);
        assert!(matches!(items[0].text_type, TextType::Content));
        assert!(matches!(&items[1].text_type, TextType::Attribute(name) if name == "alt"));
        assert_eq!(items[0].tag_name, "text");
        assert_eq!(items[1].tag_name, "img");
    }
}