html_translation_lib/
core.rs

1//! 核心翻译器实现
2//!
3//! 提供HTML翻译的核心功能和主要API
4
5#![allow(dead_code)] // 暂时允许未使用代码,这些是库的公共API
6
7use crate::config::TranslationConfig;
8use crate::error::{TranslationError, TranslationResult};
9use crate::pipeline::{TextCollector, TextFilter, BatchManager, TextItem};
10use crate::storage::CacheManager;
11use markup5ever_rcdom::RcDom;
12use std::collections::HashMap;
13
14/// HTML翻译器
15///
16/// 这是库的主要接口,提供完整的HTML翻译功能
17#[allow(dead_code)] // 允许未使用的字段,这些是公共API的一部分
18pub struct HtmlTranslator {
19    /// 翻译配置
20    config: TranslationConfig,
21    
22    /// 文本收集器
23    text_collector: TextCollector,
24    
25    /// 文本过滤器
26    text_filter: TextFilter,
27    
28    /// 批次管理器
29    batch_manager: BatchManager,
30    
31    /// 缓存管理器
32    cache_manager: Option<std::sync::Arc<tokio::sync::Mutex<CacheManager>>>,
33    
34    /// HTTP客户端
35    #[cfg(feature = "async")]
36    http_client: reqwest::Client,
37    
38    /// 翻译统计
39    stats: TranslationStats,
40}
41
42impl HtmlTranslator {
43    /// 创建新的翻译器实例
44    #[cfg(feature = "async")]
45    #[allow(dead_code)] // 公共API方法
46    pub async fn new(config: TranslationConfig) -> TranslationResult<Self> {
47        config.validate()?;
48        
49        let text_collector = TextCollector::new();
50        let text_filter = TextFilter::new(&config);
51        let batch_manager = BatchManager::new(&config);
52        
53        let cache_manager = if config.enable_cache {
54            Some(std::sync::Arc::new(tokio::sync::Mutex::new(CacheManager::new(&config).await?)))
55        } else {
56            None
57        };
58        
59        let http_client = reqwest::ClientBuilder::new()
60            .timeout(config.http_timeout())
61            .user_agent(&config.user_agent)
62            .build()
63            .map_err(|e| TranslationError::ConfigError(format!("HTTP客户端初始化失败: {e}")))?;
64        
65        Ok(Self {
66            config,
67            text_collector,
68            text_filter,
69            batch_manager,
70            cache_manager,
71            http_client,
72            stats: TranslationStats::default(),
73        })
74    }
75    
76    /// 翻译HTML字符串
77    #[cfg(feature = "async")]
78    #[allow(dead_code)] // 公共API方法
79    pub async fn translate_html(&mut self, html: &str) -> TranslationResult<String> {
80        tracing::info!("开始翻译HTML内容,长度: {} 字符", html.len());
81        
82        // 解析HTML
83        let dom = self.parse_html(html)?;
84        
85        // 翻译DOM
86        let translated_dom = self.translate_dom(dom).await?;
87        
88        // 序列化回HTML字符串
89        let result = self.serialize_dom(translated_dom)?;
90        
91        tracing::info!("HTML翻译完成");
92        Ok(result)
93    }
94    
95    /// 翻译HTML文件
96    #[cfg(feature = "async")]
97    pub async fn translate_file<P: AsRef<std::path::Path>>(
98        &mut self,
99        input_path: P,
100        output_path: P,
101    ) -> TranslationResult<()> {
102        let html_content = std::fs::read_to_string(&input_path)?;
103        let translated_content = self.translate_html(&html_content).await?;
104        std::fs::write(output_path, translated_content)?;
105        Ok(())
106    }
107    
108    /// 翻译DOM树
109    #[cfg(feature = "async")]
110    pub async fn translate_dom(&mut self, dom: RcDom) -> TranslationResult<RcDom> {
111        let start_time = std::time::Instant::now();
112        
113        // 1. 收集可翻译文本
114        let texts = self.text_collector.collect_from_dom(&dom)?;
115        tracing::debug!("收集到 {} 个文本项", texts.len());
116        self.stats.texts_collected = texts.len();
117        
118        if texts.is_empty() {
119            return Ok(dom);
120        }
121        
122        // 2. 过滤文本
123        let filtered_texts: Vec<_> = texts
124            .into_iter()
125            .filter(|item| self.text_filter.should_translate(&item.text))
126            .collect();
127        tracing::debug!("过滤后剩余 {} 个文本项", filtered_texts.len());
128        self.stats.texts_filtered = filtered_texts.len();
129        
130        if filtered_texts.is_empty() {
131            return Ok(dom);
132        }
133        
134        // 3. 检查缓存
135        let (cached, uncached) = self.check_cache(&filtered_texts).await?;
136        tracing::debug!("缓存命中 {} 项,需要翻译 {} 项", cached.len(), uncached.len());
137        self.stats.cache_hits = cached.len();
138        self.stats.cache_misses = uncached.len();
139        
140        // 4. 创建翻译批次
141        let batches = self.batch_manager.create_batches(uncached);
142        tracing::debug!("创建了 {} 个翻译批次", batches.len());
143        self.stats.batches_created = batches.len();
144        
145        // 5. 执行翻译
146        let mut all_translations = cached;
147        for batch in batches {
148            let translations = self.translate_batch(&batch.items).await?;
149            all_translations.extend(translations);
150        }
151        
152        // 6. 更新缓存
153        self.update_cache(&all_translations).await?;
154        
155        // 7. 应用翻译到DOM
156        let updated_dom = self.apply_translations_to_dom(dom, all_translations)?;
157        
158        self.stats.processing_time = start_time.elapsed();
159        tracing::info!("DOM翻译完成,耗时: {:?}", self.stats.processing_time);
160        
161        Ok(updated_dom)
162    }
163    
164    /// 解析HTML字符串为DOM
165    fn parse_html(&self, html: &str) -> TranslationResult<RcDom> {
166        use html5ever::parse_document;
167        use html5ever::tendril::TendrilSink;
168        use markup5ever_rcdom::RcDom;
169        
170        let dom = parse_document(RcDom::default(), Default::default())
171            .from_utf8()
172            .read_from(&mut html.as_bytes())
173            .map_err(|e| TranslationError::ParseError(format!("HTML解析失败: {e:?}")))?;
174        
175        Ok(dom)
176    }
177    
178    /// 序列化DOM为HTML字符串
179    fn serialize_dom(&self, dom: RcDom) -> TranslationResult<String> {
180        use html5ever::serialize::{serialize, SerializeOpts};
181        use markup5ever_rcdom::SerializableHandle;
182        use std::io::Cursor;
183        
184        let mut buffer = Vec::new();
185        let cursor = Cursor::new(&mut buffer);
186        
187        serialize(cursor, &SerializableHandle::from(dom.document.clone()), SerializeOpts::default())
188            .map_err(|e| TranslationError::ParseError(format!("HTML序列化失败: {e:?}")))?;
189        
190        String::from_utf8(buffer)
191            .map_err(|e| TranslationError::ParseError(format!("UTF-8转换失败: {e}")))
192    }
193    
194    /// 检查缓存
195    #[cfg(feature = "async")]
196    async fn check_cache(
197        &self,
198        texts: &[TextItem],
199    ) -> TranslationResult<(Vec<(TextItem, String)>, Vec<TextItem>)> {
200        if let Some(cache_arc) = &self.cache_manager {
201            let cache = cache_arc.lock().await;
202            let mut cached = Vec::new();
203            let mut uncached = Vec::new();
204            
205            for item in texts {
206                if let Some(translation) = cache.get(&item.text).await? {
207                    cached.push((item.clone(), translation));
208                } else {
209                    uncached.push(item.clone());
210                }
211            }
212            
213            Ok((cached, uncached))
214        } else {
215            Ok((Vec::new(), texts.to_vec()))
216        }
217    }
218    
219    /// 更新缓存
220    #[cfg(feature = "async")]
221    async fn update_cache(&self, translations: &[(TextItem, String)]) -> TranslationResult<()> {
222        if let Some(cache_arc) = &self.cache_manager {
223            let mut cache = cache_arc.lock().await;
224            for (item, translation) in translations {
225                cache.set(&item.text, translation.clone()).await?;
226            }
227        }
228        Ok(())
229    }
230    
231    /// 翻译单个批次
232    #[cfg(feature = "async")]
233    async fn translate_batch(&self, batch: &[TextItem]) -> TranslationResult<Vec<(TextItem, String)>> {
234        if batch.is_empty() {
235            return Ok(Vec::new());
236        }
237        
238        // 准备翻译请求
239        let texts_to_translate: Vec<String> = batch.iter()
240            .enumerate()
241            .map(|(i, item)| {
242                if self.config.use_indexing {
243                    format!("[{}]{}", i, item.text)
244                } else {
245                    item.text.clone()
246                }
247            })
248            .collect();
249        
250        let combined_text = texts_to_translate.join("\n\n");
251        
252        // 执行翻译请求
253        let translated_text = self.call_translation_api(&combined_text).await?;
254        
255        // 解析翻译结果
256        let translations = self.parse_translation_response(&translated_text, batch.len())?;
257        
258        // 组合结果
259        let result: Vec<(TextItem, String)> = batch.iter()
260            .zip(translations.iter())
261            .map(|(item, translation)| (item.clone(), translation.clone()))
262            .collect();
263        
264        Ok(result)
265    }
266    
267    /// 调用翻译API
268    #[cfg(feature = "async")]
269    async fn call_translation_api(&self, text: &str) -> TranslationResult<String> {
270        let mut attempts = 0;
271        let max_attempts = self.config.max_retries + 1;
272        
273        while attempts < max_attempts {
274            let request_body = serde_json::json!({
275                "text": text,
276                "source_lang": self.config.source_language,
277                "target_lang": self.config.target_language
278            });
279            
280            let mut request = self.http_client
281                .post(&self.config.api_url)
282                .json(&request_body);
283            
284            if let Some(ref api_key) = self.config.api_key {
285                request = request.header("Authorization", format!("Bearer {api_key}"));
286            }
287            
288            match request.send().await {
289                Ok(response) => {
290                    let status = response.status();
291                    if status.is_success() {
292                        let result: serde_json::Value = response.json().await?;
293                        if let Some(translated) = result.get("data").and_then(|d| d.as_str()) {
294                            return Ok(translated.to_string());
295                        } else if let Some(translated) = result.get("translated_text").and_then(|t| t.as_str()) {
296                            return Ok(translated.to_string());
297                        } else {
298                            return Err(TranslationError::ServiceError(
299                                "翻译响应格式无效".to_string()
300                            ));
301                        }
302                    } else {
303                        let error_msg = response.text().await.unwrap_or_else(|_| "未知错误".to_string());
304                        return Err(TranslationError::ServiceError(format!(
305                            "翻译API返回错误 {status}: {error_msg}"
306                        )));
307                    }
308                }
309                Err(e) => {
310                    attempts += 1;
311                    if attempts >= max_attempts {
312                        return Err(TranslationError::from(e));
313                    }
314                    
315                    tracing::warn!("翻译请求失败,正在重试 ({}/{}): {}", attempts, max_attempts, e);
316                    tokio::time::sleep(self.config.retry_delay()).await;
317                }
318            }
319        }
320        
321        Err(TranslationError::ServiceError("翻译请求重试失败".to_string()))
322    }
323    
324    /// 解析翻译响应
325    fn parse_translation_response(&self, response: &str, expected_count: usize) -> TranslationResult<Vec<String>> {
326        let parts: Vec<&str> = response.split("\n\n").collect();
327        
328        if self.config.use_indexing {
329            // 使用索引标记解析
330            let mut indexed_results = HashMap::new();
331            let index_regex = regex::Regex::new(r"^\[(\d+)\](.*)$").unwrap();
332            
333            for part in parts {
334                if let Some(captures) = index_regex.captures(part.trim()) {
335                    if let (Some(index_match), Some(text_match)) = (captures.get(1), captures.get(2)) {
336                        if let Ok(index) = index_match.as_str().parse::<usize>() {
337                            indexed_results.insert(index, text_match.as_str().trim().to_string());
338                        }
339                    }
340                }
341            }
342            
343            let mut results = Vec::with_capacity(expected_count);
344            for i in 0..expected_count {
345                results.push(indexed_results.get(&i).cloned().unwrap_or_default());
346            }
347            
348            Ok(results)
349        } else {
350            // 简单分割解析
351            let results: Vec<String> = parts.iter()
352                .map(|s| s.trim().to_string())
353                .collect();
354            
355            if results.len() != expected_count {
356                tracing::warn!("翻译结果数量不匹配: 期望 {}, 得到 {}", expected_count, results.len());
357            }
358            
359            Ok(results)
360        }
361    }
362    
363    /// 应用翻译到DOM
364    fn apply_translations_to_dom(
365        &self,
366        dom: RcDom,
367        translations: Vec<(TextItem, String)>,
368    ) -> TranslationResult<RcDom> {
369        for (item, translation) in translations {
370            if !translation.trim().is_empty() {
371                item.apply_translation(&translation)?;
372            }
373        }
374        
375        Ok(dom)
376    }
377    
378    /// 获取翻译统计信息
379    pub fn get_stats(&self) -> &TranslationStats {
380        &self.stats
381    }
382    
383    /// 重置统计信息
384    pub fn reset_stats(&mut self) {
385        self.stats = TranslationStats::default();
386    }
387}
388
/// Translation statistics.
#[derive(Debug, Default, Clone)]
pub struct TranslationStats {
    /// Number of texts collected from the DOM.
    pub texts_collected: usize,

    /// Number of texts remaining after filtering.
    pub texts_filtered: usize,

    /// Number of texts served from the cache.
    pub cache_hits: usize,

    /// Number of texts not found in the cache; these are the texts that get batched.
    pub cache_misses: usize,

    /// Number of translation batches created.
    pub batches_created: usize,

    /// Processing time.
    pub processing_time: std::time::Duration,

    /// Number of API calls.
    pub api_calls: usize,

    /// Number of errors.
    pub errors: usize,
}

impl TranslationStats {
    /// Fraction of cache lookups that were hits, in `0.0..=1.0`.
    ///
    /// Returns `0.0` when no lookups were recorded.
    pub fn cache_hit_rate(&self) -> f32 {
        let total = self.cache_hits + self.cache_misses;
        if total > 0 {
            self.cache_hits as f32 / total as f32
        } else {
            0.0
        }
    }

    /// Average number of texts per created batch.
    ///
    /// Only cache misses are batched, so the numerator is `cache_misses`
    /// rather than `texts_filtered`; using the latter would overstate the
    /// average whenever some texts were served from the cache.
    ///
    /// Returns `0.0` when no batches were created.
    pub fn average_batch_size(&self) -> f32 {
        if self.batches_created > 0 {
            self.cache_misses as f32 / self.batches_created as f32
        } else {
            0.0
        }
    }
}
437