html_translation_lib/pipeline/
optimized_collector.rs

1//! 优化的DOM遍历实现
2//!
3//! 使用迭代器模式和栈结构替代递归,提高性能并避免栈溢出
4
5use crate::error::{TranslationError, TranslationResult};
6use markup5ever_rcdom::{Handle, NodeData, RcDom};
7use std::collections::{HashMap, HashSet, VecDeque};
8use std::rc::Rc;
9use std::borrow::Cow;
10
11/// 优化的文本收集器
12pub struct OptimizedTextCollector {
13    /// 跳过的标签集合(使用更快的查找)
14    skip_tags: HashSet<&'static str>,
15    
16    /// 收集属性文本的标签映射(使用静态字符串)
17    attribute_tags: HashMap<&'static str, &'static [&'static str]>,
18    
19    /// 预分配的工作缓冲区
20    work_queue: VecDeque<(Handle, u8)>, // (节点, 深度)
21}
22
23impl OptimizedTextCollector {
24    /// 创建新的优化文本收集器
25    pub fn new() -> Self {
26        let mut skip_tags = HashSet::with_capacity(8);
27        skip_tags.insert("script");
28        skip_tags.insert("style");
29        skip_tags.insert("noscript");
30        skip_tags.insert("code");
31        skip_tags.insert("pre");
32        skip_tags.insert("svg");
33        skip_tags.insert("math");
34        
35        let mut attribute_tags = HashMap::with_capacity(8);
36        attribute_tags.insert("img", &["alt", "title"][..]);
37        attribute_tags.insert("input", &["placeholder", "value", "title"][..]);
38        attribute_tags.insert("textarea", &["placeholder", "title"][..]);
39        attribute_tags.insert("a", &["title"][..]);
40        attribute_tags.insert("button", &["title"][..]);
41        attribute_tags.insert("abbr", &["title"][..]);
42        attribute_tags.insert("area", &["alt"][..]);
43        attribute_tags.insert("track", &["label"][..]);
44        
45        Self {
46            skip_tags,
47            attribute_tags,
48            work_queue: VecDeque::with_capacity(1024), // 预分配容量
49        }
50    }
51    
52    /// 从DOM中收集文本(优化版本)
53    pub fn collect_from_dom_optimized(&mut self, dom: &RcDom) -> TranslationResult<Vec<OptimizedTextItem>> {
54        // 预估容量以减少重分配
55        let mut items = Vec::with_capacity(256);
56        
57        // 清空工作队列
58        self.work_queue.clear();
59        
60        // 使用迭代代替递归
61        self.work_queue.push_back((dom.document.clone(), 0));
62        
63        while let Some((node, depth)) = self.work_queue.pop_front() {
64            if depth > 100 {
65                return Err(TranslationError::InternalError("DOM深度超限".to_string()));
66            }
67            
68            self.process_node(&node, &mut items, depth)?;
69        }
70        
71        // 优化:按类型分组,提高后续处理效率
72        items.sort_by(|a, b| {
73            a.text_type.discriminant().cmp(&b.text_type.discriminant())
74        });
75        
76        Ok(items)
77    }
78    
79    /// 处理单个节点(内联以提高性能)
80    #[inline]
81    fn process_node(
82        &mut self,
83        node: &Handle,
84        items: &mut Vec<OptimizedTextItem>,
85        depth: u8,
86    ) -> TranslationResult<()> {
87        match &node.data {
88            NodeData::Element { name, attrs, .. } => {
89                let tag_name = name.local.as_ref();
90                
91                // 快速跳过检查
92                if !self.skip_tags.contains(tag_name) {
93                    // 收集属性文本
94                    if let Some(&attr_names) = self.attribute_tags.get(tag_name) {
95                        let attrs = attrs.borrow();
96                        for attr_name in attr_names {
97                            for attr in attrs.iter() {
98                                let attr_local_name = attr.name.local.as_ref();
99                                if *attr_name == attr_local_name {
100                                    let text = attr.value.as_ref();
101                                    if !text.trim().is_empty() {
102                                        items.push(OptimizedTextItem::new(
103                                            text.to_string(),
104                                            OptimizedTextType::Attribute(attr_name.to_string()),
105                                            node.clone(),
106                                            tag_name,
107                                        ));
108                                    }
109                                }
110                            }
111                        }
112                    }
113                    
114                    // 添加子节点到队列(逆序添加以保持DOM顺序)
115                    let children = node.children.borrow();
116                    for child in children.iter().rev() {
117                        self.work_queue.push_front((child.clone(), depth + 1));
118                    }
119                }
120            }
121            NodeData::Text { contents } => {
122                let contents_ref = contents.borrow();
123                let text = contents_ref.as_ref();
124                let trimmed = text.trim();
125                if !trimmed.is_empty() && self.is_translatable_text(trimmed) {
126                    items.push(OptimizedTextItem::new(
127                        trimmed.to_string(),
128                        OptimizedTextType::Content,
129                        node.clone(),
130                        "text",
131                    ));
132                }
133            }
134            NodeData::Comment { .. } => {
135                // 跳过注释节点
136            }
137            _ => {
138                // 处理其他类型的节点(如Document节点)
139                let children = node.children.borrow();
140                for child in children.iter().rev() {
141                    self.work_queue.push_front((child.clone(), depth));
142                }
143            }
144        }
145        
146        Ok(())
147    }
148    
149    /// 快速检查文本是否可翻译
150    #[inline]
151    fn is_translatable_text(&self, text: &str) -> bool {
152        // 快速长度检查
153        if text.len() < 2 {
154            return false;
155        }
156        
157        // 快速ASCII检查(纯数字、特殊符号等)
158        if text.chars().all(|c| c.is_ascii_digit() || c.is_ascii_punctuation()) {
159            return false;
160        }
161        
162        // 快速URL检查
163        if text.starts_with("http") || text.starts_with("www.") || text.contains("://") {
164            return false;
165        }
166        
167        true
168    }
169}
170
171/// 优化的文本项
172#[derive(Debug, Clone)]
173pub struct OptimizedTextItem {
174    /// 文本内容
175    pub text: String,
176    
177    /// 文本类型
178    pub text_type: OptimizedTextType,
179    
180    /// 对应的DOM节点
181    pub node: Handle,
182    
183    /// 标签名称(使用Cow避免不必要的分配)
184    pub tag_name: Cow<'static, str>,
185    
186    /// 缓存的哈希值(用于去重和缓存查找)
187    pub hash: u64,
188}
189
190impl OptimizedTextItem {
191    /// 创建新的优化文本项
192    pub fn new(text: String, text_type: OptimizedTextType, node: Handle, tag_name: &str) -> Self {
193        use std::collections::hash_map::DefaultHasher;
194        use std::hash::{Hash, Hasher};
195        
196        let mut hasher = DefaultHasher::new();
197        text.hash(&mut hasher);
198        text_type.discriminant().hash(&mut hasher);
199        let hash = hasher.finish();
200        
201        Self {
202            text,
203            text_type,
204            node,
205            tag_name: Cow::Owned(tag_name.to_string()),
206            hash,
207        }
208    }
209    
210    /// 应用翻译到DOM节点(优化版本)
211    pub fn apply_translation_optimized(&self, translation: &str) -> TranslationResult<()> {
212        match &self.text_type {
213            OptimizedTextType::Content => {
214                if let NodeData::Text { contents } = &self.node.data {
215                    let mut contents = contents.borrow_mut();
216                    *contents = translation.into();
217                }
218            }
219            OptimizedTextType::Attribute(attr_name) => {
220                if let NodeData::Element { attrs, .. } = &self.node.data {
221                    let mut attrs = attrs.borrow_mut();
222                    // 优化:使用二分查找或更高效的方法
223                    for attr in attrs.iter_mut() {
224                        if attr.name.local.as_ref() == attr_name {
225                            attr.value = translation.into();
226                            break;
227                        }
228                    }
229                }
230            }
231            _ => {}
232        }
233        Ok(())
234    }
235}
236
/// Kind of a collected text fragment.
#[derive(Debug, Clone)]
pub enum OptimizedTextType {
    /// Text taken from a text node.
    Content,

    /// Title text.
    Title,

    /// Attribute value; the payload is the attribute's name.
    Attribute(String),

    /// Metadata text; the meaning of the payload is not shown in this file.
    Meta(String),
}

impl OptimizedTextType {
    /// Returns a small fixed number per variant, used for sorting and grouping:
    /// `Content` = 0, `Title` = 1, `Attribute` = 2, `Meta` = 3.
    pub fn discriminant(&self) -> u8 {
        match *self {
            Self::Content => 0,
            Self::Title => 1,
            Self::Attribute(..) => 2,
            Self::Meta(..) => 3,
        }
    }
}
264
265/// 批量文本处理器
266pub struct BatchTextProcessor {
267    collector: OptimizedTextCollector,
268    // 对象池,重用TextItem
269    item_pool: Vec<OptimizedTextItem>,
270    // 字符串缓冲区池
271    string_pool: Vec<String>,
272}
273
274impl BatchTextProcessor {
275    /// 创建新的批量处理器
276    pub fn new() -> Self {
277        Self {
278            collector: OptimizedTextCollector::new(),
279            item_pool: Vec::with_capacity(512),
280            string_pool: Vec::with_capacity(256),
281        }
282    }
283    
284    /// 批量处理多个DOM
285    pub fn process_batch(&mut self, doms: &[&RcDom]) -> TranslationResult<Vec<Vec<OptimizedTextItem>>> {
286        let mut results = Vec::with_capacity(doms.len());
287        
288        for dom in doms {
289            let items = self.collector.collect_from_dom_optimized(dom)?;
290            results.push(items);
291        }
292        
293        Ok(results)
294    }
295    
296    /// 释放对象回池中以供重用
297    pub fn release_items(&mut self, items: Vec<OptimizedTextItem>) {
298        self.item_pool.extend(items);
299        
300        // 限制池大小以避免内存泄漏
301        if self.item_pool.len() > 1024 {
302            self.item_pool.truncate(512);
303        }
304    }
305}
306
307impl Default for OptimizedTextCollector {
308    fn default() -> Self {
309        Self::new()
310    }
311}
312
313impl Default for BatchTextProcessor {
314    fn default() -> Self {
315        Self::new()
316    }
317}
318
#[cfg(test)]
mod tests {
    use super::*;
    use html5ever::parse_document;
    use html5ever::tendril::TendrilSink;

    /// Parses an HTML string into an `RcDom`; panics if reading fails.
    fn parse_html(html: &str) -> RcDom {
        let mut bytes = html.as_bytes();
        parse_document(RcDom::default(), Default::default())
            .from_utf8()
            .read_from(&mut bytes)
            .unwrap()
    }

    /// Despite its name this test checks correctness only: element text and
    /// the `alt` attribute of an image must all be collected.
    #[test]
    fn test_optimized_collector_performance() {
        let html = r#"
        <html>
        <body>
            <h1>Title</h1>
            <p>Paragraph 1</p>
            <p>Paragraph 2</p>
            <img src="test.jpg" alt="Test Image">
        </body>
        </html>
        "#;

        let dom = parse_html(html);
        let mut collector = OptimizedTextCollector::new();
        let items = collector.collect_from_dom_optimized(&dom).unwrap();

        assert!(!items.is_empty());

        // Every expected fragment must appear among the collected items.
        for expected in &["Title", "Paragraph 1", "Paragraph 2", "Test Image"] {
            assert!(items.iter().any(|item| item.text == *expected));
        }
    }

    /// A batch of two DOMs yields two result lists.
    #[test]
    fn test_batch_processor() {
        let dom1 = parse_html("<p>Text 1</p>");
        let dom2 = parse_html("<p>Text 2</p>");
        let doms = [&dom1, &dom2];

        let mut processor = BatchTextProcessor::new();
        let results = processor.process_batch(&doms).unwrap();

        assert_eq!(results.len(), 2);
    }
}