1#![allow(dead_code)] use crate::config::TranslationConfig;
8use crate::error::{TranslationError, TranslationResult};
9use crate::pipeline::{TextCollector, TextFilter, BatchManager, TextItem};
10use crate::storage::CacheManager;
11use markup5ever_rcdom::RcDom;
12use std::collections::HashMap;
13
14#[allow(dead_code)] pub struct HtmlTranslator {
19 config: TranslationConfig,
21
22 text_collector: TextCollector,
24
25 text_filter: TextFilter,
27
28 batch_manager: BatchManager,
30
31 cache_manager: Option<std::sync::Arc<tokio::sync::Mutex<CacheManager>>>,
33
34 #[cfg(feature = "async")]
36 http_client: reqwest::Client,
37
38 stats: TranslationStats,
40}
41
42impl HtmlTranslator {
/// Builds a translator from `config`.
///
/// Validates the configuration, constructs the pipeline stages, opens the
/// cache when `config.enable_cache` is set and builds the HTTP client with
/// the configured timeout and user agent.
///
/// # Errors
///
/// Propagates validation and cache-initialisation failures, and returns
/// `TranslationError::ConfigError` when the HTTP client cannot be built.
#[cfg(feature = "async")]
#[allow(dead_code)]
pub async fn new(config: TranslationConfig) -> TranslationResult<Self> {
    config.validate()?;

    let text_collector = TextCollector::new();
    let text_filter = TextFilter::new(&config);
    let batch_manager = BatchManager::new(&config);

    // The cache is only opened on request; otherwise every lookup is skipped.
    let cache_manager = if config.enable_cache {
        let manager = CacheManager::new(&config).await?;
        Some(std::sync::Arc::new(tokio::sync::Mutex::new(manager)))
    } else {
        None
    };

    let client_builder = reqwest::ClientBuilder::new()
        .timeout(config.http_timeout())
        .user_agent(&config.user_agent);
    let http_client = match client_builder.build() {
        Ok(client) => client,
        Err(e) => {
            return Err(TranslationError::ConfigError(format!(
                "HTTP客户端初始化失败: {e}"
            )));
        }
    };

    Ok(Self {
        config,
        text_collector,
        text_filter,
        batch_manager,
        cache_manager,
        http_client,
        stats: TranslationStats::default(),
    })
}
75
/// Translates an HTML document given as a string and returns the translated
/// markup.
///
/// The input is parsed into a DOM, translated via `translate_dom` and
/// serialized back to a string.
///
/// # Errors
///
/// Propagates any parse, translation or serialization error.
#[cfg(feature = "async")]
#[allow(dead_code)]
pub async fn translate_html(&mut self, html: &str) -> TranslationResult<String> {
    // `html.len()` is the UTF-8 byte length of the input.
    tracing::info!("开始翻译HTML内容,长度: {} 字符", html.len());

    let parsed = self.parse_html(html)?;
    let translated = self.translate_dom(parsed).await?;
    let output = self.serialize_dom(translated)?;

    tracing::info!("HTML翻译完成");
    Ok(output)
}
94
/// Reads the HTML file at `input_path`, translates it and writes the result
/// to `output_path`.
///
/// The two paths may be of different types (for example a `&str` input and a
/// `PathBuf` output); each only has to implement `AsRef<Path>`.
///
/// # Errors
///
/// Returns an error when the input cannot be read as UTF-8 text, when the
/// translation fails, or when the output cannot be written.
#[cfg(feature = "async")]
pub async fn translate_file<P: AsRef<std::path::Path>>(
    &mut self,
    input_path: P,
    output_path: impl AsRef<std::path::Path>,
) -> TranslationResult<()> {
    // NOTE(review): `std::fs` calls are synchronous, so this async fn blocks
    // its thread during file I/O; consider an async file API if that matters.
    let html_content = std::fs::read_to_string(input_path.as_ref())?;
    let translated_content = self.translate_html(&html_content).await?;
    std::fs::write(output_path.as_ref(), translated_content)?;
    Ok(())
}
107
/// Translates every translatable text item of `dom` and returns the DOM.
///
/// Steps, in order: collect text items, filter them, look them up in the
/// cache, batch the misses, translate each batch sequentially, write the
/// results to the cache and apply them to the DOM.
///
/// The counters this method writes (`texts_collected`, `texts_filtered`,
/// `cache_hits`, `cache_misses`, `batches_created`, `processing_time`) are
/// cleared on entry and `processing_time` is recorded on every successful
/// exit, so the stats always describe the current call even when it returns
/// early because nothing needs translating.
///
/// # Errors
///
/// Propagates the first error from any stage; translations from batches that
/// already succeeded are discarded in that case.
#[cfg(feature = "async")]
pub async fn translate_dom(&mut self, dom: RcDom) -> TranslationResult<RcDom> {
    let start_time = std::time::Instant::now();

    // Clear per-run counters up front: the early returns below would
    // otherwise leave values from a previous run in place.
    self.stats.texts_collected = 0;
    self.stats.texts_filtered = 0;
    self.stats.cache_hits = 0;
    self.stats.cache_misses = 0;
    self.stats.batches_created = 0;
    self.stats.processing_time = std::time::Duration::default();

    let texts = self.text_collector.collect_from_dom(&dom)?;
    tracing::debug!("收集到 {} 个文本项", texts.len());
    self.stats.texts_collected = texts.len();

    if texts.is_empty() {
        self.stats.processing_time = start_time.elapsed();
        return Ok(dom);
    }

    let filtered_texts: Vec<_> = texts
        .into_iter()
        .filter(|item| self.text_filter.should_translate(&item.text))
        .collect();
    tracing::debug!("过滤后剩余 {} 个文本项", filtered_texts.len());
    self.stats.texts_filtered = filtered_texts.len();

    if filtered_texts.is_empty() {
        self.stats.processing_time = start_time.elapsed();
        return Ok(dom);
    }

    let (cached, uncached) = self.check_cache(&filtered_texts).await?;
    tracing::debug!("缓存命中 {} 项,需要翻译 {} 项", cached.len(), uncached.len());
    self.stats.cache_hits = cached.len();
    self.stats.cache_misses = uncached.len();

    // Only cache misses are batched and sent to the translation API.
    let batches = self.batch_manager.create_batches(uncached);
    tracing::debug!("创建了 {} 个翻译批次", batches.len());
    self.stats.batches_created = batches.len();

    // Start from the cache hits and append freshly translated items.
    let mut all_translations = cached;
    for batch in batches {
        let translations = self.translate_batch(&batch.items).await?;
        all_translations.extend(translations);
    }

    self.update_cache(&all_translations).await?;

    let updated_dom = self.apply_translations_to_dom(dom, all_translations)?;

    self.stats.processing_time = start_time.elapsed();
    tracing::info!("DOM翻译完成,耗时: {:?}", self.stats.processing_time);

    Ok(updated_dom)
}
163
164 fn parse_html(&self, html: &str) -> TranslationResult<RcDom> {
166 use html5ever::parse_document;
167 use html5ever::tendril::TendrilSink;
168 use markup5ever_rcdom::RcDom;
169
170 let dom = parse_document(RcDom::default(), Default::default())
171 .from_utf8()
172 .read_from(&mut html.as_bytes())
173 .map_err(|e| TranslationError::ParseError(format!("HTML解析失败: {e:?}")))?;
174
175 Ok(dom)
176 }
177
178 fn serialize_dom(&self, dom: RcDom) -> TranslationResult<String> {
180 use html5ever::serialize::{serialize, SerializeOpts};
181 use markup5ever_rcdom::SerializableHandle;
182 use std::io::Cursor;
183
184 let mut buffer = Vec::new();
185 let cursor = Cursor::new(&mut buffer);
186
187 serialize(cursor, &SerializableHandle::from(dom.document.clone()), SerializeOpts::default())
188 .map_err(|e| TranslationError::ParseError(format!("HTML序列化失败: {e:?}")))?;
189
190 String::from_utf8(buffer)
191 .map_err(|e| TranslationError::ParseError(format!("UTF-8转换失败: {e}")))
192 }
193
/// Splits `texts` into cache hits and cache misses.
///
/// Returns `(hits, misses)`, where each hit is paired with its cached
/// translation. When no cache is configured every text is reported as a miss.
/// The cache lock is held for the whole lookup loop.
///
/// # Errors
///
/// Propagates the first error returned by a cache lookup.
#[cfg(feature = "async")]
async fn check_cache(
    &self,
    texts: &[TextItem],
) -> TranslationResult<(Vec<(TextItem, String)>, Vec<TextItem>)> {
    let cache_arc = match &self.cache_manager {
        Some(arc) => arc,
        // Without a cache nothing can hit.
        None => return Ok((Vec::new(), texts.to_vec())),
    };

    let cache = cache_arc.lock().await;
    let mut hits = Vec::new();
    let mut misses = Vec::new();

    for item in texts {
        match cache.get(&item.text).await? {
            Some(translation) => hits.push((item.clone(), translation)),
            None => misses.push(item.clone()),
        }
    }

    Ok((hits, misses))
}
218
/// Stores `translations` in the cache, keyed by the source text.
///
/// Blank translations are not stored. `parse_translation_response` yields an
/// empty string for a segment it could not recover; caching that would make
/// `check_cache` report a hit on later runs while `apply_translations_to_dom`
/// ignores the empty value, so the text would never be translated again.
///
/// Does nothing when no cache is configured.
///
/// # Errors
///
/// Propagates the first error returned by a cache write.
#[cfg(feature = "async")]
async fn update_cache(&self, translations: &[(TextItem, String)]) -> TranslationResult<()> {
    if let Some(cache_arc) = &self.cache_manager {
        let mut cache = cache_arc.lock().await;
        for (item, translation) in translations {
            // Never persist a missing/failed translation as if it were a result.
            if translation.trim().is_empty() {
                continue;
            }
            cache.set(&item.text, translation.clone()).await?;
        }
    }
    Ok(())
}
230
/// Translates one batch of text items with a single API call.
///
/// The item texts are joined with a blank line (`"\n\n"`); when
/// `config.use_indexing` is set each text is prefixed with `[i]`, where `i`
/// is its position in the batch. The response is split back into segments
/// and paired with the items in order. If fewer segments than items come
/// back, the surplus items are left out of the result.
///
/// # Errors
///
/// Propagates API and response-parsing errors.
#[cfg(feature = "async")]
async fn translate_batch(&self, batch: &[TextItem]) -> TranslationResult<Vec<(TextItem, String)>> {
    if batch.is_empty() {
        return Ok(Vec::new());
    }

    let use_indexing = self.config.use_indexing;
    let mut segments: Vec<String> = Vec::with_capacity(batch.len());
    for (i, item) in batch.iter().enumerate() {
        let segment = if use_indexing {
            format!("[{}]{}", i, item.text)
        } else {
            item.text.clone()
        };
        segments.push(segment);
    }
    let payload = segments.join("\n\n");

    let raw_response = self.call_translation_api(&payload).await?;
    let translations = self.parse_translation_response(&raw_response, batch.len())?;

    // Pair items with translations positionally; `zip` stops at the shorter side.
    Ok(batch.iter().cloned().zip(translations).collect())
}
266
/// Sends `text` to the configured translation endpoint and returns the
/// translated string.
///
/// The request is a JSON POST with `text`, `source_lang` and `target_lang`,
/// plus a bearer `Authorization` header when an API key is configured. The
/// translation is read from the response's `data` field or, failing that,
/// from `translated_text`.
///
/// Only failures to send the request are retried, up to
/// `config.max_retries` additional attempts with `config.retry_delay()`
/// between them. A non-success HTTP status or a malformed body fails
/// immediately.
///
/// # Errors
///
/// Returns `TranslationError::ServiceError` for non-success statuses and
/// unrecognised response bodies, and the converted transport error once the
/// attempts are exhausted.
#[cfg(feature = "async")]
async fn call_translation_api(&self, text: &str) -> TranslationResult<String> {
    let max_attempts = self.config.max_retries + 1;

    // The payload is identical for every attempt.
    let request_body = serde_json::json!({
        "text": text,
        "source_lang": self.config.source_language,
        "target_lang": self.config.target_language
    });

    for attempt in 1..=max_attempts {
        let mut request = self
            .http_client
            .post(&self.config.api_url)
            .json(&request_body);

        if let Some(ref api_key) = self.config.api_key {
            request = request.header("Authorization", format!("Bearer {api_key}"));
        }

        let response = match request.send().await {
            Ok(response) => response,
            Err(e) => {
                if attempt >= max_attempts {
                    return Err(TranslationError::from(e));
                }

                tracing::warn!("翻译请求失败,正在重试 ({}/{}): {}", attempt, max_attempts, e);
                tokio::time::sleep(self.config.retry_delay()).await;
                continue;
            }
        };

        let status = response.status();
        if !status.is_success() {
            let error_msg = response.text().await.unwrap_or_else(|_| "未知错误".to_string());
            return Err(TranslationError::ServiceError(format!(
                "翻译API返回错误 {status}: {error_msg}"
            )));
        }

        let result: serde_json::Value = response.json().await?;
        // Prefer `data`, fall back to `translated_text`.
        let translated = result
            .get("data")
            .and_then(|d| d.as_str())
            .or_else(|| result.get("translated_text").and_then(|t| t.as_str()));

        return match translated {
            Some(translated) => Ok(translated.to_string()),
            None => Err(TranslationError::ServiceError(
                "翻译响应格式无效".to_string(),
            )),
        };
    }

    // Only reached if the attempt range was empty.
    Err(TranslationError::ServiceError("翻译请求重试失败".to_string()))
}
323
324 fn parse_translation_response(&self, response: &str, expected_count: usize) -> TranslationResult<Vec<String>> {
326 let parts: Vec<&str> = response.split("\n\n").collect();
327
328 if self.config.use_indexing {
329 let mut indexed_results = HashMap::new();
331 let index_regex = regex::Regex::new(r"^\[(\d+)\](.*)$").unwrap();
332
333 for part in parts {
334 if let Some(captures) = index_regex.captures(part.trim()) {
335 if let (Some(index_match), Some(text_match)) = (captures.get(1), captures.get(2)) {
336 if let Ok(index) = index_match.as_str().parse::<usize>() {
337 indexed_results.insert(index, text_match.as_str().trim().to_string());
338 }
339 }
340 }
341 }
342
343 let mut results = Vec::with_capacity(expected_count);
344 for i in 0..expected_count {
345 results.push(indexed_results.get(&i).cloned().unwrap_or_default());
346 }
347
348 Ok(results)
349 } else {
350 let results: Vec<String> = parts.iter()
352 .map(|s| s.trim().to_string())
353 .collect();
354
355 if results.len() != expected_count {
356 tracing::warn!("翻译结果数量不匹配: 期望 {}, 得到 {}", expected_count, results.len());
357 }
358
359 Ok(results)
360 }
361 }
362
363 fn apply_translations_to_dom(
365 &self,
366 dom: RcDom,
367 translations: Vec<(TextItem, String)>,
368 ) -> TranslationResult<RcDom> {
369 for (item, translation) in translations {
370 if !translation.trim().is_empty() {
371 item.apply_translation(&translation)?;
372 }
373 }
374
375 Ok(dom)
376 }
377
378 pub fn get_stats(&self) -> &TranslationStats {
380 &self.stats
381 }
382
383 pub fn reset_stats(&mut self) {
385 self.stats = TranslationStats::default();
386 }
387}
388
/// Counters and timing describing one translation run.
#[derive(Debug, Default, Clone)]
pub struct TranslationStats {
    /// Number of text items collected from the DOM.
    pub texts_collected: usize,

    /// Number of text items that remained after filtering.
    pub texts_filtered: usize,

    /// Number of filtered texts whose translation was found in the cache.
    pub cache_hits: usize,

    /// Number of filtered texts that were not in the cache and had to be
    /// sent for translation.
    pub cache_misses: usize,

    /// Number of batches built from the cache misses.
    pub batches_created: usize,

    /// Wall-clock time spent translating the DOM.
    pub processing_time: std::time::Duration,

    /// Number of API calls. NOTE(review): nothing in the visible translator
    /// code updates this field; confirm who maintains it.
    pub api_calls: usize,

    /// Number of errors. NOTE(review): nothing in the visible translator
    /// code updates this field; confirm who maintains it.
    pub errors: usize,
}

impl TranslationStats {
    /// Fraction of cache lookups that were hits, in `0.0..=1.0`.
    ///
    /// Returns `0.0` when no lookups were recorded.
    pub fn cache_hit_rate(&self) -> f32 {
        let total = self.cache_hits + self.cache_misses;
        if total > 0 {
            // Lossy `usize -> f32` conversion is acceptable for a ratio.
            self.cache_hits as f32 / total as f32
        } else {
            0.0
        }
    }

    /// Average number of texts per batch.
    ///
    /// Batches are built only from cache misses, so the average is
    /// `cache_misses / batches_created`. Dividing by the number of filtered
    /// texts would count cache hits that were never batched. Returns `0.0`
    /// when no batch was created.
    pub fn average_batch_size(&self) -> f32 {
        if self.batches_created > 0 {
            self.cache_misses as f32 / self.batches_created as f32
        } else {
            0.0
        }
    }
}
437