html_translation_lib/pipeline/
optimized_collector.rs1use crate::error::{TranslationError, TranslationResult};
6use markup5ever_rcdom::{Handle, NodeData, RcDom};
7use std::collections::{HashMap, HashSet, VecDeque};
8use std::rc::Rc;
9use std::borrow::Cow;
10
/// Walks a parsed HTML DOM and gathers the pieces of text that should be
/// sent for translation (text nodes and selected element attributes).
///
/// The collector owns its traversal buffer so that it can be reused across
/// several documents without re-allocating.
pub struct OptimizedTextCollector {
    /// Element tag names whose attributes and whole subtree are ignored
    /// during traversal (populated in `new`).
    skip_tags: HashSet<&'static str>,

    /// Maps an element tag name to the attribute names whose values are
    /// collected for that element.
    attribute_tags: HashMap<&'static str, &'static [&'static str]>,

    /// Pending `(node, depth)` pairs still to be visited. Entries are pushed
    /// and popped at the front, so traversal is depth-first in document order.
    work_queue: VecDeque<(Handle, u8)>,
}
22
23impl OptimizedTextCollector {
24 pub fn new() -> Self {
26 let mut skip_tags = HashSet::with_capacity(8);
27 skip_tags.insert("script");
28 skip_tags.insert("style");
29 skip_tags.insert("noscript");
30 skip_tags.insert("code");
31 skip_tags.insert("pre");
32 skip_tags.insert("svg");
33 skip_tags.insert("math");
34
35 let mut attribute_tags = HashMap::with_capacity(8);
36 attribute_tags.insert("img", &["alt", "title"][..]);
37 attribute_tags.insert("input", &["placeholder", "value", "title"][..]);
38 attribute_tags.insert("textarea", &["placeholder", "title"][..]);
39 attribute_tags.insert("a", &["title"][..]);
40 attribute_tags.insert("button", &["title"][..]);
41 attribute_tags.insert("abbr", &["title"][..]);
42 attribute_tags.insert("area", &["alt"][..]);
43 attribute_tags.insert("track", &["label"][..]);
44
45 Self {
46 skip_tags,
47 attribute_tags,
48 work_queue: VecDeque::with_capacity(1024), }
50 }
51
52 pub fn collect_from_dom_optimized(&mut self, dom: &RcDom) -> TranslationResult<Vec<OptimizedTextItem>> {
54 let mut items = Vec::with_capacity(256);
56
57 self.work_queue.clear();
59
60 self.work_queue.push_back((dom.document.clone(), 0));
62
63 while let Some((node, depth)) = self.work_queue.pop_front() {
64 if depth > 100 {
65 return Err(TranslationError::InternalError("DOM深度超限".to_string()));
66 }
67
68 self.process_node(&node, &mut items, depth)?;
69 }
70
71 items.sort_by(|a, b| {
73 a.text_type.discriminant().cmp(&b.text_type.discriminant())
74 });
75
76 Ok(items)
77 }
78
79 #[inline]
81 fn process_node(
82 &mut self,
83 node: &Handle,
84 items: &mut Vec<OptimizedTextItem>,
85 depth: u8,
86 ) -> TranslationResult<()> {
87 match &node.data {
88 NodeData::Element { name, attrs, .. } => {
89 let tag_name = name.local.as_ref();
90
91 if !self.skip_tags.contains(tag_name) {
93 if let Some(&attr_names) = self.attribute_tags.get(tag_name) {
95 let attrs = attrs.borrow();
96 for attr_name in attr_names {
97 for attr in attrs.iter() {
98 let attr_local_name = attr.name.local.as_ref();
99 if *attr_name == attr_local_name {
100 let text = attr.value.as_ref();
101 if !text.trim().is_empty() {
102 items.push(OptimizedTextItem::new(
103 text.to_string(),
104 OptimizedTextType::Attribute(attr_name.to_string()),
105 node.clone(),
106 tag_name,
107 ));
108 }
109 }
110 }
111 }
112 }
113
114 let children = node.children.borrow();
116 for child in children.iter().rev() {
117 self.work_queue.push_front((child.clone(), depth + 1));
118 }
119 }
120 }
121 NodeData::Text { contents } => {
122 let contents_ref = contents.borrow();
123 let text = contents_ref.as_ref();
124 let trimmed = text.trim();
125 if !trimmed.is_empty() && self.is_translatable_text(trimmed) {
126 items.push(OptimizedTextItem::new(
127 trimmed.to_string(),
128 OptimizedTextType::Content,
129 node.clone(),
130 "text",
131 ));
132 }
133 }
134 NodeData::Comment { .. } => {
135 }
137 _ => {
138 let children = node.children.borrow();
140 for child in children.iter().rev() {
141 self.work_queue.push_front((child.clone(), depth));
142 }
143 }
144 }
145
146 Ok(())
147 }
148
149 #[inline]
151 fn is_translatable_text(&self, text: &str) -> bool {
152 if text.len() < 2 {
154 return false;
155 }
156
157 if text.chars().all(|c| c.is_ascii_digit() || c.is_ascii_punctuation()) {
159 return false;
160 }
161
162 if text.starts_with("http") || text.starts_with("www.") || text.contains("://") {
164 return false;
165 }
166
167 true
168 }
169}
170
/// One piece of translatable text together with the DOM node it came from.
#[derive(Debug, Clone)]
pub struct OptimizedTextItem {
    /// The text to translate.
    pub text: String,

    /// Where in the node the text lives (text content, attribute, ...).
    pub text_type: OptimizedTextType,

    /// Handle to the DOM node that holds the text; used to write the
    /// translation back.
    pub node: Handle,

    /// Tag name passed to `new` when the item was created.
    pub tag_name: Cow<'static, str>,

    /// Hash computed in `new` from `text` and the discriminant of
    /// `text_type`.
    pub hash: u64,
}
189
190impl OptimizedTextItem {
191 pub fn new(text: String, text_type: OptimizedTextType, node: Handle, tag_name: &str) -> Self {
193 use std::collections::hash_map::DefaultHasher;
194 use std::hash::{Hash, Hasher};
195
196 let mut hasher = DefaultHasher::new();
197 text.hash(&mut hasher);
198 text_type.discriminant().hash(&mut hasher);
199 let hash = hasher.finish();
200
201 Self {
202 text,
203 text_type,
204 node,
205 tag_name: Cow::Owned(tag_name.to_string()),
206 hash,
207 }
208 }
209
210 pub fn apply_translation_optimized(&self, translation: &str) -> TranslationResult<()> {
212 match &self.text_type {
213 OptimizedTextType::Content => {
214 if let NodeData::Text { contents } = &self.node.data {
215 let mut contents = contents.borrow_mut();
216 *contents = translation.into();
217 }
218 }
219 OptimizedTextType::Attribute(attr_name) => {
220 if let NodeData::Element { attrs, .. } = &self.node.data {
221 let mut attrs = attrs.borrow_mut();
222 for attr in attrs.iter_mut() {
224 if attr.name.local.as_ref() == attr_name {
225 attr.value = translation.into();
226 break;
227 }
228 }
229 }
230 }
231 _ => {}
232 }
233 Ok(())
234 }
235}
236
/// Classifies where a piece of translatable text is located.
#[derive(Debug, Clone)]
pub enum OptimizedTextType {
    /// Text taken from a text node.
    Content,

    /// Title text.
    Title,

    /// Value of an element attribute; the payload is the attribute name.
    Attribute(String),

    /// Meta text with a string payload.
    Meta(String),
}

impl OptimizedTextType {
    /// Returns a small, stable number identifying the variant, ignoring any
    /// payload: `Content` = 0, `Title` = 1, `Attribute` = 2, `Meta` = 3.
    pub fn discriminant(&self) -> u8 {
        match self {
            Self::Content => 0,
            Self::Title => 1,
            Self::Attribute(_) => 2,
            Self::Meta(_) => 3,
        }
    }
}
264
/// Runs one shared `OptimizedTextCollector` over several documents.
pub struct BatchTextProcessor {
    /// Collector reused for every document of a batch.
    collector: OptimizedTextCollector,
    /// Items handed back through `release_items`; capped in size there.
    /// NOTE(review): nothing in this file reads from the pool — confirm it
    /// is consumed elsewhere.
    item_pool: Vec<OptimizedTextItem>,
    /// Pre-allocated in `new`.
    /// NOTE(review): not otherwise used in this file — confirm intent.
    string_pool: Vec<String>,
}
273
274impl BatchTextProcessor {
275 pub fn new() -> Self {
277 Self {
278 collector: OptimizedTextCollector::new(),
279 item_pool: Vec::with_capacity(512),
280 string_pool: Vec::with_capacity(256),
281 }
282 }
283
284 pub fn process_batch(&mut self, doms: &[&RcDom]) -> TranslationResult<Vec<Vec<OptimizedTextItem>>> {
286 let mut results = Vec::with_capacity(doms.len());
287
288 for dom in doms {
289 let items = self.collector.collect_from_dom_optimized(dom)?;
290 results.push(items);
291 }
292
293 Ok(results)
294 }
295
296 pub fn release_items(&mut self, items: Vec<OptimizedTextItem>) {
298 self.item_pool.extend(items);
299
300 if self.item_pool.len() > 1024 {
302 self.item_pool.truncate(512);
303 }
304 }
305}
306
307impl Default for OptimizedTextCollector {
308 fn default() -> Self {
309 Self::new()
310 }
311}
312
313impl Default for BatchTextProcessor {
314 fn default() -> Self {
315 Self::new()
316 }
317}
318
#[cfg(test)]
mod tests {
    use super::*;
    use html5ever::parse_document;
    use html5ever::tendril::TendrilSink;

    /// Parses an HTML string into an `RcDom`, panicking on read failure.
    fn parse_html(html: &str) -> RcDom {
        let mut bytes = html.as_bytes();
        parse_document(RcDom::default(), Default::default())
            .from_utf8()
            .read_from(&mut bytes)
            .unwrap()
    }

    /// Text nodes and the `alt` attribute of a small page are all collected.
    #[test]
    fn test_optimized_collector_performance() {
        let html = r#"
            <html>
                <body>
                    <h1>Title</h1>
                    <p>Paragraph 1</p>
                    <p>Paragraph 2</p>
                    <img src="test.jpg" alt="Test Image">
                </body>
            </html>
        "#;
        let dom = parse_html(html);

        let mut collector = OptimizedTextCollector::new();
        let items = collector.collect_from_dom_optimized(&dom).unwrap();
        assert!(!items.is_empty());

        let texts: Vec<&str> = items.iter().map(|item| item.text.as_str()).collect();
        for expected in ["Title", "Paragraph 1", "Paragraph 2", "Test Image"].iter() {
            assert!(texts.contains(expected));
        }
    }

    /// A batch of two documents yields two result lists.
    #[test]
    fn test_batch_processor() {
        let dom1 = parse_html("<p>Text 1</p>");
        let dom2 = parse_html("<p>Text 2</p>");
        let doms = [&dom1, &dom2];

        let mut processor = BatchTextProcessor::new();
        let results = processor.process_batch(&doms).unwrap();

        assert_eq!(results.len(), 2);
    }
}