markdown_translator/
translator.rs

1//! 翻译服务核心模块
2//!
3//! 提供主要的翻译功能,包括并行处理、速率限制和智能文本分块。
4
5use crate::error::{Result, TranslationError};
6use crate::types::{
7    DeepLXRequest, DeepLXResponse, DpTransRequest, RetryConfig, TextSegment, TranslationConfig,
8};
9use reqwest::Client;
10use std::sync::Arc;
11use std::time::Duration;
12use tokio::sync::Semaphore;
13use tokio::time::sleep;
14
15/// 速率限制器
16///
17/// 用于控制API请求频率,防止超出服务提供商的速率限制。
18/// 支持并发请求和自适应延迟。
19#[derive(Clone)]
20pub struct RateLimiter {
21    /// 信号量,用于控制并发请求数量
22    semaphore: Arc<Semaphore>,
23    /// 请求间隔延迟
24    delay: Duration,
25}
26
27impl RateLimiter {
28    /// 创建新的速率限制器
29    ///
30    /// # 参数
31    ///
32    /// * `requests_per_second` - 每秒允许的最大请求数
33    ///
34    /// # 示例
35    ///
36    /// ```rust
37    /// use markdown_translator::RateLimiter;
38    ///
39    /// let limiter = RateLimiter::new(1.0); // 每秒1个请求
40    /// ```
41    pub fn new(requests_per_second: f64) -> Self {
42        // 允许更多并发,减少延迟
43        let permits = (requests_per_second * 2.0).ceil() as usize;
44        let delay = Duration::from_millis((500.0 / requests_per_second) as u64); // 减少延迟
45
46        Self {
47            semaphore: Arc::new(Semaphore::new(permits)),
48            delay,
49        }
50    }
51
52    /// 获取请求许可
53    ///
54    /// 在发起API请求前调用此方法,确保不超过配置的速率限制。
55    ///
56    /// # 返回
57    ///
58    /// * `Ok(())` - 成功获取许可
59    /// * `Err(TranslationError)` - 获取许可失败
60    pub async fn acquire(&self) -> Result<()> {
61        let _permit =
62            self.semaphore.acquire().await.map_err(|e| {
63                TranslationError::RateLimitError(format!("Rate limiter error: {}", e))
64            })?;
65        // 在并发环境下减少固定延迟
66        if self.delay > Duration::from_millis(100) {
67            sleep(self.delay).await;
68        }
69        Ok(())
70    }
71}
72
73/// 带指数退避的重试机制
74///
75/// 为API调用提供可靠的重试机制,在失败时按指数增长的延迟重试。
76///
77/// # 参数
78///
79/// * `operation` - 要执行的异步操作
80/// * `config` - 重试配置
81/// * `rate_limiter` - 速率限制器
82///
83/// # 返回
84///
85/// * `Ok(T)` - 操作成功的结果
86/// * `Err(TranslationError)` - 所有重试尝试失败后的错误
87pub async fn retry_with_backoff<F, Fut, T>(
88    mut operation: F,
89    config: &RetryConfig,
90    rate_limiter: &RateLimiter,
91) -> Result<T>
92where
93    F: FnMut() -> Fut,
94    Fut: std::future::Future<Output = Result<T>>,
95{
96    let mut delay = config.initial_delay_ms;
97
98    for attempt in 0..=config.max_retries {
99        rate_limiter.acquire().await?;
100
101        match operation().await {
102            Ok(result) => return Ok(result),
103            Err(e) if attempt == config.max_retries => return Err(e),
104            Err(e) => {
105                eprintln!(
106                    "Attempt {} failed: {}. Retrying in {}ms...",
107                    attempt + 1,
108                    e,
109                    delay
110                );
111                sleep(Duration::from_millis(delay)).await;
112                delay = std::cmp::min(
113                    (delay as f64 * config.backoff_multiplier) as u64,
114                    config.max_delay_ms,
115                );
116            }
117        }
118    }
119
120    unreachable!()
121}
122
123/// 翻译服务主类
124///
125/// 提供完整的翻译功能,包括文本分块、并行处理、代码块跳过等高级特性。
126/// 支持多种翻译API,内置速率限制和错误恢复机制。
127///
128/// # 示例
129///
130/// ```rust
131/// use markdown_translator::{TranslationService, TranslationConfig};
132///
133/// #[tokio::main]
134/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
135///     let config = TranslationConfig::default();
136///     let service = TranslationService::new(config);
137///     
138///     let result = service.translate("Hello, world!").await?;
139///     println!("Translation: {}", result);
140///     
141///     Ok(())
142/// }
143/// ```
144#[derive(Clone)]
145pub struct TranslationService {
146    /// HTTP客户端,用于API调用
147    client: Client,
148    /// 速率限制器
149    rate_limiter: RateLimiter,
150    /// 翻译配置
151    config: TranslationConfig,
152}
153
154impl TranslationService {
155    /// 创建新的翻译服务实例
156    ///
157    /// # 参数
158    ///
159    /// * `config` - 翻译配置,包含API地址、语言设置等
160    ///
161    /// # 示例
162    ///
163    /// ```rust
164    /// use markdown_translator::{TranslationService, TranslationConfig};
165    ///
166    /// let config = TranslationConfig {
167    ///     enabled: true,
168    ///     source_lang: "en".to_string(),
169    ///     target_lang: "zh".to_string(),
170    ///     deeplx_api_url: "http://localhost:1188/translate".to_string(),
171    ///     max_requests_per_second: 1.0,
172    ///     max_text_length: 3000,
173    ///     max_paragraphs_per_request: 10,
174    /// };
175    ///
176    /// let service = TranslationService::new(config);
177    /// ```
178    pub fn new(config: TranslationConfig) -> Self {
179        let client = Client::builder()
180            .timeout(std::time::Duration::from_secs(30))
181            .pool_idle_timeout(std::time::Duration::from_secs(30))
182            .pool_max_idle_per_host(5)
183            .tcp_keepalive(std::time::Duration::from_secs(60))
184            .http1_title_case_headers()
185            .http2_keep_alive_interval(None)
186            .user_agent("Mozilla/5.0 (compatible; MarkdownDownloader/1.0)")
187            .build()
188            .unwrap_or_else(|e| {
189                eprintln!("Failed to create optimized client: {}, using default", e);
190                Client::new()
191            });
192
193        Self {
194            client,
195            rate_limiter: RateLimiter::new(config.max_requests_per_second),
196            config,
197        }
198    }
199
200    /// 翻译文本
201    ///
202    /// 主要的翻译接口,支持智能分块、并行处理和代码块跳过。
203    ///
204    /// # 参数
205    ///
206    /// * `text` - 要翻译的文本,支持Markdown格式
207    ///
208    /// # 返回
209    ///
210    /// * `Ok(String)` - 翻译后的文本
211    /// * `Err(TranslationError)` - 翻译过程中的错误
212    ///
213    /// # 特性
214    ///
215    /// - 自动检测并跳过代码块
216    /// - 智能文本分块,支持长文档
217    /// - 并行处理多个文本块
218    /// - 保持Markdown格式
219    ///
220    /// # 示例
221    ///
222    /// ```rust
223    /// use markdown_translator::{TranslationService, TranslationConfig};
224    ///
225    /// #[tokio::main]
226    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
227    ///     let config = TranslationConfig::default();
228    ///     let service = TranslationService::new(config);
229    ///     
230    ///     let markdown = r#"
231    ///     # Hello World
232    ///     
233    ///     This is a markdown document.
234    ///     
235    ///     ```rust
236    ///     fn main() {
237    ///         println!("Hello, world!");
238    ///     }
239    ///     ```
240    ///     "#;
241    ///     
242    ///     let translated = service.translate(markdown).await?;
243    ///     println!("{}", translated);
244    ///     
245    ///     Ok(())
246    /// }
247    /// ```
248    pub async fn translate(&self, text: &str) -> Result<String> {
249        if !self.config.enabled {
250            return Ok(text.to_string());
251        }
252
253        println!("文本总长度: {} 字符", text.len());
254
255        if text.len() <= self.config.max_text_length {
256            println!("文本较短,直接翻译");
257            return self.translate_chunk(text).await;
258        }
259
260        let chunks = self.split_text_into_chunks(text);
261        println!("文本较长,分为 {} 块进行翻译", chunks.len());
262
263        let mut translated_chunks = Vec::new();
264
265        // 并行处理所有块
266        let mut futures = Vec::new();
267
268        for (i, chunk) in chunks.iter().enumerate() {
269            println!("准备翻译第 {} 块,长度: {} 字符", i + 1, chunk.len());
270
271            if self.is_code_block_chunk(chunk) {
272                // 代码块直接返回结果
273                let result = chunk
274                    .strip_prefix("__CODE_BLOCK__")
275                    .unwrap_or(chunk)
276                    .to_string();
277                futures.push(Box::pin(async move { Ok(result) })
278                    as std::pin::Pin<
279                        Box<dyn std::future::Future<Output = Result<String>> + Send>,
280                    >);
281            } else {
282                // 翻译任务
283                let chunk_clone = chunk.clone();
284                let translator_clone = self.clone();
285                futures.push(Box::pin(async move {
286                    translator_clone.translate_chunk(&chunk_clone).await
287                })
288                    as std::pin::Pin<
289                        Box<dyn std::future::Future<Output = Result<String>> + Send>,
290                    >);
291            }
292        }
293
294        // 并发执行所有翻译任务,但有并发限制
295        let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(5)); // 最多5个并发请求
296        let mut handles = Vec::new();
297
298        for (i, future) in futures.into_iter().enumerate() {
299            let semaphore_clone = semaphore.clone();
300            let handle = tokio::spawn(async move {
301                let _permit = semaphore_clone.acquire().await.unwrap();
302                println!("开始翻译第 {} 块", i + 1);
303                let result = future.await;
304                println!("完成翻译第 {} 块", i + 1);
305                result
306            });
307            handles.push(handle);
308        }
309
310        // 收集所有结果
311        for handle in handles {
312            let result = handle
313                .await
314                .map_err(|e| TranslationError::Custom(e.to_string()))??;
315            translated_chunks.push(result);
316        }
317
318        Ok(translated_chunks.join("\n\n"))
319    }
320
321    fn split_text_into_chunks(&self, text: &str) -> Vec<String> {
322        let mut chunks = Vec::new();
323        let max_length = self.config.max_text_length;
324
325        if text.len() <= max_length {
326            chunks.push(text.to_string());
327            return chunks;
328        }
329
330        let protected_sections = self.identify_code_blocks(text);
331        let segments = self.split_by_code_blocks(text, &protected_sections);
332
333        let mut current_chunk = String::new();
334
335        for segment in segments {
336            if segment.is_code_block {
337                // 代码块需要特殊处理 - 直接作为独立块处理,不与其他内容合并
338                if !current_chunk.is_empty() {
339                    chunks.push(current_chunk.clone());
340                    current_chunk.clear();
341                }
342                // 给代码块添加特殊标记,便于后续识别
343                chunks.push(format!("__CODE_BLOCK__{}", segment.content));
344            } else {
345                let paragraphs = self.split_text_by_empty_lines(&segment.content);
346
347                for paragraph in paragraphs {
348                    if paragraph.trim().is_empty() {
349                        continue;
350                    }
351
352                    let potential_length = if current_chunk.is_empty() {
353                        paragraph.len()
354                    } else {
355                        current_chunk.len() + 2 + paragraph.len()
356                    };
357
358                    if potential_length <= max_length {
359                        if !current_chunk.is_empty() {
360                            current_chunk.push_str("\n\n");
361                        }
362                        current_chunk.push_str(&paragraph);
363                    } else {
364                        if !current_chunk.is_empty() {
365                            chunks.push(current_chunk.clone());
366                            current_chunk.clear();
367                        }
368
369                        if paragraph.len() > max_length {
370                            let sub_chunks = self.split_long_paragraph(&paragraph, max_length);
371                            chunks.extend(sub_chunks);
372                        } else {
373                            current_chunk = paragraph;
374                        }
375                    }
376                }
377            }
378        }
379
380        if !current_chunk.is_empty() {
381            chunks.push(current_chunk);
382        }
383
384        if chunks.is_empty() {
385            chunks.push(text.to_string());
386        }
387
388        chunks
389    }
390
391    fn identify_code_blocks(&self, text: &str) -> Vec<(usize, usize)> {
392        let mut code_blocks = Vec::new();
393        let mut in_code_block = false;
394        let mut current_start = 0;
395
396        let lines: Vec<&str> = text.lines().collect();
397        let mut char_pos = 0;
398
399        for (_i, line) in lines.iter().enumerate() {
400            if line.starts_with("```") {
401                if in_code_block {
402                    let end_pos = char_pos + line.len();
403                    code_blocks.push((current_start, end_pos));
404                    in_code_block = false;
405                } else {
406                    current_start = char_pos;
407                    in_code_block = true;
408                }
409            }
410            char_pos += line.len() + 1;
411        }
412
413        if in_code_block {
414            code_blocks.push((current_start, text.len()));
415        }
416
417        code_blocks
418    }
419
420    fn split_by_code_blocks(&self, text: &str, code_blocks: &[(usize, usize)]) -> Vec<TextSegment> {
421        let mut segments = Vec::new();
422        let mut last_end = 0;
423
424        for &(start, end) in code_blocks {
425            if start > last_end {
426                let content = text[last_end..start].to_string();
427                if !content.trim().is_empty() {
428                    segments.push(TextSegment {
429                        content,
430                        is_code_block: false,
431                    });
432                }
433            }
434
435            let content = text[start..end].to_string();
436            segments.push(TextSegment {
437                content,
438                is_code_block: true,
439            });
440
441            last_end = end;
442        }
443
444        if last_end < text.len() {
445            let content = text[last_end..].to_string();
446            if !content.trim().is_empty() {
447                segments.push(TextSegment {
448                    content,
449                    is_code_block: false,
450                });
451            }
452        }
453
454        if segments.is_empty() {
455            segments.push(TextSegment {
456                content: text.to_string(),
457                is_code_block: false,
458            });
459        }
460
461        segments
462    }
463
464    fn split_text_by_empty_lines(&self, text: &str) -> Vec<String> {
465        let max_length = self.config.max_text_length;
466
467        if text.len() <= max_length {
468            return vec![text.to_string()];
469        }
470
471        let paragraphs: Vec<&str> = text.split("\n\n").collect();
472        let mut result = Vec::new();
473        let mut current_group = Vec::new();
474        let mut current_length = 0;
475
476        for paragraph in paragraphs {
477            let paragraph = paragraph.trim();
478            if paragraph.is_empty() {
479                continue;
480            }
481
482            let para_len = paragraph.len();
483
484            let potential_length = if current_group.is_empty() {
485                para_len
486            } else {
487                current_length + 2 + para_len
488            };
489
490            if potential_length <= max_length {
491                current_group.push(paragraph);
492                current_length = potential_length;
493            } else {
494                if !current_group.is_empty() {
495                    result.push(current_group.join("\n\n"));
496                    current_group.clear();
497                }
498
499                if para_len > max_length {
500                    let sub_parts = self.split_long_paragraph(paragraph, max_length);
501                    result.extend(sub_parts);
502                    current_length = 0;
503                } else {
504                    current_group.push(paragraph);
505                    current_length = para_len;
506                }
507            }
508        }
509
510        if !current_group.is_empty() {
511            result.push(current_group.join("\n\n"));
512        }
513
514        result
515    }
516
517    fn split_long_paragraph(&self, paragraph: &str, max_length: usize) -> Vec<String> {
518        let mut chunks = Vec::new();
519        let mut start = 0;
520
521        while start < paragraph.len() {
522            let end = std::cmp::min(start + max_length, paragraph.len());
523            let mut actual_end = end;
524
525            if end < paragraph.len() {
526                for i in (start..end).rev() {
527                    let ch = paragraph.chars().nth(i).unwrap_or(' ');
528                    if ch == '.' || ch == '!' || ch == '?' || ch == '。' || ch == '!' || ch == '?'
529                    {
530                        actual_end = i + 1;
531                        break;
532                    }
533                }
534
535                if actual_end == end {
536                    for i in (start..end).rev() {
537                        let ch = paragraph.chars().nth(i).unwrap_or(' ');
538                        if ch == ' ' || ch == '\n' || ch == '\t' {
539                            actual_end = i + 1;
540                            break;
541                        }
542                    }
543                }
544
545                if actual_end == end && end - start < max_length / 2 {
546                    actual_end = end;
547                }
548            }
549
550            let chunk = paragraph[start..actual_end].trim().to_string();
551            if !chunk.is_empty() {
552                chunks.push(chunk);
553            }
554
555            start = actual_end;
556        }
557
558        chunks
559    }
560
561    async fn translate_chunk(&self, text: &str) -> Result<String> {
562        println!("发送翻译请求到: {}", self.config.deeplx_api_url);
563        println!("翻译文本长度: {} 字符", text.len());
564
565        let retry_config = RetryConfig::default();
566        let client = &self.client;
567        let config = &self.config;
568        let text_clone = text.to_string();
569
570        let result = retry_with_backoff(
571            || {
572                let client = client.clone();
573                let config = config.clone();
574                let text = text_clone.clone();
575
576                Box::pin(async move {
577                    let response = if config.deeplx_api_url.contains("dptrans") {
578                        println!("使用dptrans API格式请求");
579
580                        let request = DpTransRequest {
581                            text: text.clone(),
582                            source_lang: if config.source_lang == "auto" {
583                                "auto".to_string()
584                            } else {
585                                config.source_lang.clone()
586                            },
587                            target_lang: config.target_lang.clone(),
588                        };
589
590                        client
591                            .post(&config.deeplx_api_url)
592                            .header("Content-Type", "application/json")
593                            .header("Accept", "application/json, text/plain, */*")
594                            .header(
595                                "User-Agent",
596                                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
597                            )
598                            .json(&request)
599                            .send()
600                            .await
601                            .map_err(|e| {
602                                TranslationError::Custom(format!("DeepLX网络请求失败: {}", e))
603                            })?
604                    } else {
605                        println!("使用标准DeepLX API格式请求");
606
607                        let request = DeepLXRequest {
608                            text: text.clone(),
609                            source_lang: config.source_lang.clone(),
610                            target_lang: config.target_lang.clone(),
611                        };
612
613                        client
614                            .post(&config.deeplx_api_url)
615                            .header("Content-Type", "application/json")
616                            .header("Accept", "application/json")
617                            .json(&request)
618                            .send()
619                            .await
620                            .map_err(|e| {
621                                TranslationError::Custom(format!("DeepLX网络请求失败: {}", e))
622                            })?
623                    };
624
625                    let status = response.status();
626                    println!("DeepLX响应状态: {}", status);
627
628                    if response.status().is_success() {
629                        let response_text = response.text().await.map_err(|e| {
630                            TranslationError::Custom(format!("读取响应文本失败: {}", e))
631                        })?;
632
633                        if let Ok(result) = serde_json::from_str::<DeepLXResponse>(&response_text) {
634                            if result.code == 200 {
635                                if result.data.is_empty() {
636                                    Err(TranslationError::Custom(
637                                        "DeepLX返回了空的翻译结果".to_string(),
638                                    ))
639                                } else {
640                                    Ok(result.data)
641                                }
642                            } else {
643                                Err(TranslationError::ApiError {
644                                    code: result.code,
645                                    message: format!("DeepLX翻译失败,返回代码: {}", result.code),
646                                })
647                            }
648                        } else {
649                            if response_text.trim().is_empty() {
650                                Err(TranslationError::Custom(
651                                    "API返回了空的翻译结果".to_string(),
652                                ))
653                            } else if response_text.starts_with("{") {
654                                if let Ok(json_value) =
655                                    serde_json::from_str::<serde_json::Value>(&response_text)
656                                {
657                                    if let Some(translated) = json_value
658                                        .get("translated_text")
659                                        .or_else(|| json_value.get("result"))
660                                        .or_else(|| json_value.get("translation"))
661                                        .or_else(|| json_value.get("data"))
662                                        .and_then(|v| v.as_str())
663                                    {
664                                        Ok(translated.to_string())
665                                    } else {
666                                        Err(TranslationError::ParseError(format!(
667                                            "无法从JSON响应中提取翻译结果: {}",
668                                            response_text
669                                        )))
670                                    }
671                                } else {
672                                    Err(TranslationError::ParseError(format!(
673                                        "无法解析JSON响应: {}",
674                                        response_text
675                                    )))
676                                }
677                            } else {
678                                println!("假设响应是纯文本翻译结果");
679                                Ok(response_text)
680                            }
681                        }
682                    } else {
683                        let error_text = response
684                            .text()
685                            .await
686                            .unwrap_or_else(|_| "无法读取错误信息".to_string());
687                        Err(TranslationError::ApiError {
688                            code: status.as_u16() as i32,
689                            message: format!("DeepLX API请求失败: {} - {}", status, error_text),
690                        })
691                    }
692                })
693            },
694            &retry_config,
695            &self.rate_limiter,
696        )
697        .await?;
698
699        Ok(result)
700    }
701
702    /// 检测chunk是否为代码块
703    fn is_code_block_chunk(&self, chunk: &str) -> bool {
704        chunk.starts_with("__CODE_BLOCK__") || chunk.trim_start().starts_with("```")
705    }
706}