markdown_translator/
translator.rs

1//! 翻译服务核心模块
2//! 
3//! 提供主要的翻译功能,包括并行处理、速率限制和智能文本分块。
4
5use crate::types::{TranslationConfig, DeepLXRequest, DeepLXResponse, DpTransRequest, RetryConfig, TextSegment};
6use crate::error::{Result, TranslationError};
7use reqwest::Client;
8use std::sync::Arc;
9use std::time::Duration;
10use tokio::sync::Semaphore;
11use tokio::time::sleep;
12
13/// 速率限制器
14/// 
15/// 用于控制API请求频率,防止超出服务提供商的速率限制。
16/// 支持并发请求和自适应延迟。
17#[derive(Clone)]
18pub struct RateLimiter {
19    /// 信号量,用于控制并发请求数量
20    semaphore: Arc<Semaphore>,
21    /// 请求间隔延迟
22    delay: Duration,
23}
24
25impl RateLimiter {
26    /// 创建新的速率限制器
27    /// 
28    /// # 参数
29    /// 
30    /// * `requests_per_second` - 每秒允许的最大请求数
31    /// 
32    /// # 示例
33    /// 
34    /// ```rust
35    /// use markdown_translator::RateLimiter;
36    /// 
37    /// let limiter = RateLimiter::new(1.0); // 每秒1个请求
38    /// ```
39    pub fn new(requests_per_second: f64) -> Self {
40        // 允许更多并发,减少延迟
41        let permits = (requests_per_second * 2.0).ceil() as usize;
42        let delay = Duration::from_millis((500.0 / requests_per_second) as u64); // 减少延迟
43
44        Self {
45            semaphore: Arc::new(Semaphore::new(permits)),
46            delay,
47        }
48    }
49
50    /// 获取请求许可
51    /// 
52    /// 在发起API请求前调用此方法,确保不超过配置的速率限制。
53    /// 
54    /// # 返回
55    /// 
56    /// * `Ok(())` - 成功获取许可
57    /// * `Err(TranslationError)` - 获取许可失败
58    pub async fn acquire(&self) -> Result<()> {
59        let _permit = self.semaphore.acquire().await
60            .map_err(|e| TranslationError::RateLimitError(format!("Rate limiter error: {}", e)))?;
61        // 在并发环境下减少固定延迟
62        if self.delay > Duration::from_millis(100) {
63            sleep(self.delay).await;
64        }
65        Ok(())
66    }
67}
68
69/// 带指数退避的重试机制
70/// 
71/// 为API调用提供可靠的重试机制,在失败时按指数增长的延迟重试。
72/// 
73/// # 参数
74/// 
75/// * `operation` - 要执行的异步操作
76/// * `config` - 重试配置
77/// * `rate_limiter` - 速率限制器
78/// 
79/// # 返回
80/// 
81/// * `Ok(T)` - 操作成功的结果
82/// * `Err(TranslationError)` - 所有重试尝试失败后的错误
83pub async fn retry_with_backoff<F, Fut, T>(
84    mut operation: F,
85    config: &RetryConfig,
86    rate_limiter: &RateLimiter,
87) -> Result<T>
88where
89    F: FnMut() -> Fut,
90    Fut: std::future::Future<Output = Result<T>>,
91{
92    let mut delay = config.initial_delay_ms;
93
94    for attempt in 0..=config.max_retries {
95        rate_limiter.acquire().await?;
96
97        match operation().await {
98            Ok(result) => return Ok(result),
99            Err(e) if attempt == config.max_retries => return Err(e),
100            Err(e) => {
101                eprintln!("Attempt {} failed: {}. Retrying in {}ms...", attempt + 1, e, delay);
102                sleep(Duration::from_millis(delay)).await;
103                delay = std::cmp::min(
104                    (delay as f64 * config.backoff_multiplier) as u64,
105                    config.max_delay_ms,
106                );
107            }
108        }
109    }
110
111    unreachable!()
112}
113
114/// 翻译服务主类
115/// 
116/// 提供完整的翻译功能,包括文本分块、并行处理、代码块跳过等高级特性。
117/// 支持多种翻译API,内置速率限制和错误恢复机制。
118/// 
119/// # 示例
120/// 
121/// ```rust
122/// use markdown_translator::{TranslationService, TranslationConfig};
123/// 
124/// #[tokio::main]
125/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
126///     let config = TranslationConfig::default();
127///     let service = TranslationService::new(config);
128///     
129///     let result = service.translate("Hello, world!").await?;
130///     println!("Translation: {}", result);
131///     
132///     Ok(())
133/// }
134/// ```
135#[derive(Clone)]
136pub struct TranslationService {
137    /// HTTP客户端,用于API调用
138    client: Client,
139    /// 速率限制器
140    rate_limiter: RateLimiter,
141    /// 翻译配置
142    config: TranslationConfig,
143}
144
145impl TranslationService {
146    /// 创建新的翻译服务实例
147    /// 
148    /// # 参数
149    /// 
150    /// * `config` - 翻译配置,包含API地址、语言设置等
151    /// 
152    /// # 示例
153    /// 
154    /// ```rust
155    /// use markdown_translator::{TranslationService, TranslationConfig};
156    /// 
157    /// let config = TranslationConfig {
158    ///     enabled: true,
159    ///     source_lang: "en".to_string(),
160    ///     target_lang: "zh".to_string(),
161    ///     deeplx_api_url: "http://localhost:1188/translate".to_string(),
162    ///     max_requests_per_second: 1.0,
163    ///     max_text_length: 3000,
164    ///     max_paragraphs_per_request: 10,
165    /// };
166    /// 
167    /// let service = TranslationService::new(config);
168    /// ```
169    pub fn new(config: TranslationConfig) -> Self {
170        let client = Client::builder()
171            .timeout(std::time::Duration::from_secs(30))
172            .pool_idle_timeout(std::time::Duration::from_secs(30))
173            .pool_max_idle_per_host(5)
174            .tcp_keepalive(std::time::Duration::from_secs(60))
175            .http1_title_case_headers()
176            .http2_keep_alive_interval(None)
177            .user_agent("Mozilla/5.0 (compatible; MarkdownDownloader/1.0)")
178            .build()
179            .unwrap_or_else(|e| {
180                eprintln!("Failed to create optimized client: {}, using default", e);
181                Client::new()
182            });
183            
184        Self {
185            client,
186            rate_limiter: RateLimiter::new(config.max_requests_per_second),
187            config,
188        }
189    }
190
191    /// 翻译文本
192    /// 
193    /// 主要的翻译接口,支持智能分块、并行处理和代码块跳过。
194    /// 
195    /// # 参数
196    /// 
197    /// * `text` - 要翻译的文本,支持Markdown格式
198    /// 
199    /// # 返回
200    /// 
201    /// * `Ok(String)` - 翻译后的文本
202    /// * `Err(TranslationError)` - 翻译过程中的错误
203    /// 
204    /// # 特性
205    /// 
206    /// - 自动检测并跳过代码块
207    /// - 智能文本分块,支持长文档
208    /// - 并行处理多个文本块
209    /// - 保持Markdown格式
210    /// 
211    /// # 示例
212    /// 
213    /// ```rust
214    /// use markdown_translator::{TranslationService, TranslationConfig};
215    /// 
216    /// #[tokio::main]
217    /// async fn main() -> Result<(), Box<dyn std::error::Error>> {
218    ///     let config = TranslationConfig::default();
219    ///     let service = TranslationService::new(config);
220    ///     
221    ///     let markdown = r#"
222    ///     # Hello World
223    ///     
224    ///     This is a markdown document.
225    ///     
226    ///     ```rust
227    ///     fn main() {
228    ///         println!("Hello, world!");
229    ///     }
230    ///     ```
231    ///     "#;
232    ///     
233    ///     let translated = service.translate(markdown).await?;
234    ///     println!("{}", translated);
235    ///     
236    ///     Ok(())
237    /// }
238    /// ```
239    pub async fn translate(&self, text: &str) -> Result<String> {
240        if !self.config.enabled {
241            return Ok(text.to_string());
242        }
243
244        println!("文本总长度: {} 字符", text.len());
245
246        if text.len() <= self.config.max_text_length {
247            println!("文本较短,直接翻译");
248            return self.translate_chunk(text).await;
249        }
250
251        let chunks = self.split_text_into_chunks(text);
252        println!("文本较长,分为 {} 块进行翻译", chunks.len());
253
254        let mut translated_chunks = Vec::new();
255
256        // 并行处理所有块
257        let mut futures = Vec::new();
258        
259        for (i, chunk) in chunks.iter().enumerate() {
260            println!("准备翻译第 {} 块,长度: {} 字符", i + 1, chunk.len());
261            
262            if self.is_code_block_chunk(chunk) {
263                // 代码块直接返回结果
264                let result = chunk.strip_prefix("__CODE_BLOCK__").unwrap_or(chunk).to_string();
265                futures.push(Box::pin(async move { Ok(result) }) as std::pin::Pin<Box<dyn std::future::Future<Output = Result<String>> + Send>>);
266            } else {
267                // 翻译任务
268                let chunk_clone = chunk.clone();
269                let translator_clone = self.clone();
270                futures.push(Box::pin(async move {
271                    translator_clone.translate_chunk(&chunk_clone).await
272                }) as std::pin::Pin<Box<dyn std::future::Future<Output = Result<String>> + Send>>);
273            }
274        }
275
276        // 并发执行所有翻译任务,但有并发限制
277        let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(5)); // 最多5个并发请求
278        let mut handles = Vec::new();
279        
280        for (i, future) in futures.into_iter().enumerate() {
281            let semaphore_clone = semaphore.clone();
282            let handle = tokio::spawn(async move {
283                let _permit = semaphore_clone.acquire().await.unwrap();
284                println!("开始翻译第 {} 块", i + 1);
285                let result = future.await;
286                println!("完成翻译第 {} 块", i + 1);
287                result
288            });
289            handles.push(handle);
290        }
291
292        // 收集所有结果
293        for handle in handles {
294            let result = handle.await.map_err(|e| TranslationError::Custom(e.to_string()))??;
295            translated_chunks.push(result);
296        }
297
298        Ok(translated_chunks.join("\n\n"))
299    }
300
301    fn split_text_into_chunks(&self, text: &str) -> Vec<String> {
302        let mut chunks = Vec::new();
303        let max_length = self.config.max_text_length;
304
305        if text.len() <= max_length {
306            chunks.push(text.to_string());
307            return chunks;
308        }
309
310        let protected_sections = self.identify_code_blocks(text);
311        let segments = self.split_by_code_blocks(text, &protected_sections);
312
313        let mut current_chunk = String::new();
314
315        for segment in segments {
316            if segment.is_code_block {
317                // 代码块需要特殊处理 - 直接作为独立块处理,不与其他内容合并
318                if !current_chunk.is_empty() {
319                    chunks.push(current_chunk.clone());
320                    current_chunk.clear();
321                }
322                // 给代码块添加特殊标记,便于后续识别
323                chunks.push(format!("__CODE_BLOCK__{}", segment.content));
324            } else {
325                let paragraphs = self.split_text_by_empty_lines(&segment.content);
326                
327                for paragraph in paragraphs {
328                    if paragraph.trim().is_empty() {
329                        continue;
330                    }
331
332                    let potential_length = if current_chunk.is_empty() {
333                        paragraph.len()
334                    } else {
335                        current_chunk.len() + 2 + paragraph.len()
336                    };
337
338                    if potential_length <= max_length {
339                        if !current_chunk.is_empty() {
340                            current_chunk.push_str("\n\n");
341                        }
342                        current_chunk.push_str(&paragraph);
343                    } else {
344                        if !current_chunk.is_empty() {
345                            chunks.push(current_chunk.clone());
346                            current_chunk.clear();
347                        }
348
349                        if paragraph.len() > max_length {
350                            let sub_chunks = self.split_long_paragraph(&paragraph, max_length);
351                            chunks.extend(sub_chunks);
352                        } else {
353                            current_chunk = paragraph;
354                        }
355                    }
356                }
357            }
358        }
359
360        if !current_chunk.is_empty() {
361            chunks.push(current_chunk);
362        }
363
364        if chunks.is_empty() {
365            chunks.push(text.to_string());
366        }
367
368        chunks
369    }
370
371    fn identify_code_blocks(&self, text: &str) -> Vec<(usize, usize)> {
372        let mut code_blocks = Vec::new();
373        let mut in_code_block = false;
374        let mut current_start = 0;
375        
376        let lines: Vec<&str> = text.lines().collect();
377        let mut char_pos = 0;
378        
379        for (_i, line) in lines.iter().enumerate() {
380            if line.starts_with("```") {
381                if in_code_block {
382                    let end_pos = char_pos + line.len();
383                    code_blocks.push((current_start, end_pos));
384                    in_code_block = false;
385                } else {
386                    current_start = char_pos;
387                    in_code_block = true;
388                }
389            }
390            char_pos += line.len() + 1;
391        }
392        
393        if in_code_block {
394            code_blocks.push((current_start, text.len()));
395        }
396        
397        code_blocks
398    }
399
400    fn split_by_code_blocks(&self, text: &str, code_blocks: &[(usize, usize)]) -> Vec<TextSegment> {
401        let mut segments = Vec::new();
402        let mut last_end = 0;
403        
404        for &(start, end) in code_blocks {
405            if start > last_end {
406                let content = text[last_end..start].to_string();
407                if !content.trim().is_empty() {
408                    segments.push(TextSegment {
409                        content,
410                        is_code_block: false,
411                    });
412                }
413            }
414            
415            let content = text[start..end].to_string();
416            segments.push(TextSegment {
417                content,
418                is_code_block: true,
419            });
420            
421            last_end = end;
422        }
423        
424        if last_end < text.len() {
425            let content = text[last_end..].to_string();
426            if !content.trim().is_empty() {
427                segments.push(TextSegment {
428                    content,
429                    is_code_block: false,
430                });
431            }
432        }
433        
434        if segments.is_empty() {
435            segments.push(TextSegment {
436                content: text.to_string(),
437                is_code_block: false,
438            });
439        }
440        
441        segments
442    }
443
444    fn split_text_by_empty_lines(&self, text: &str) -> Vec<String> {
445        let max_length = self.config.max_text_length;
446        
447        if text.len() <= max_length {
448            return vec![text.to_string()];
449        }
450        
451        let paragraphs: Vec<&str> = text.split("\n\n").collect();
452        let mut result = Vec::new();
453        let mut current_group = Vec::new();
454        let mut current_length = 0;
455        
456        for paragraph in paragraphs {
457            let paragraph = paragraph.trim();
458            if paragraph.is_empty() {
459                continue;
460            }
461            
462            let para_len = paragraph.len();
463            
464            let potential_length = if current_group.is_empty() {
465                para_len
466            } else {
467                current_length + 2 + para_len
468            };
469            
470            if potential_length <= max_length {
471                current_group.push(paragraph);
472                current_length = potential_length;
473            } else {
474                if !current_group.is_empty() {
475                    result.push(current_group.join("\n\n"));
476                    current_group.clear();
477                }
478                
479                if para_len > max_length {
480                    let sub_parts = self.split_long_paragraph(paragraph, max_length);
481                    result.extend(sub_parts);
482                    current_length = 0;
483                } else {
484                    current_group.push(paragraph);
485                    current_length = para_len;
486                }
487            }
488        }
489        
490        if !current_group.is_empty() {
491            result.push(current_group.join("\n\n"));
492        }
493        
494        result
495    }
496
497    fn split_long_paragraph(&self, paragraph: &str, max_length: usize) -> Vec<String> {
498        let mut chunks = Vec::new();
499        let mut start = 0;
500
501        while start < paragraph.len() {
502            let end = std::cmp::min(start + max_length, paragraph.len());
503            let mut actual_end = end;
504
505            if end < paragraph.len() {
506                for i in (start..end).rev() {
507                    let ch = paragraph.chars().nth(i).unwrap_or(' ');
508                    if ch == '.' || ch == '!' || ch == '?' || ch == '。' || ch == '!' || ch == '?' {
509                        actual_end = i + 1;
510                        break;
511                    }
512                }
513
514                if actual_end == end {
515                    for i in (start..end).rev() {
516                        let ch = paragraph.chars().nth(i).unwrap_or(' ');
517                        if ch == ' ' || ch == '\n' || ch == '\t' {
518                            actual_end = i + 1;
519                            break;
520                        }
521                    }
522                }
523
524                if actual_end == end && end - start < max_length / 2 {
525                    actual_end = end;
526                }
527            }
528
529            let chunk = paragraph[start..actual_end].trim().to_string();
530            if !chunk.is_empty() {
531                chunks.push(chunk);
532            }
533
534            start = actual_end;
535        }
536
537        chunks
538    }
539
540    async fn translate_chunk(&self, text: &str) -> Result<String> {
541        println!("发送翻译请求到: {}", self.config.deeplx_api_url);
542        println!("翻译文本长度: {} 字符", text.len());
543
544        let retry_config = RetryConfig::default();
545        let client = &self.client;
546        let config = &self.config;
547        let text_clone = text.to_string();
548
549        let result = retry_with_backoff(
550            || {
551                let client = client.clone();
552                let config = config.clone();
553                let text = text_clone.clone();
554
555                Box::pin(async move {
556                    let response = if config.deeplx_api_url.contains("dptrans") {
557                        println!("使用dptrans API格式请求");
558
559                        let request = DpTransRequest {
560                            text: text.clone(),
561                            source_lang: if config.source_lang == "auto" { "auto".to_string() } else { config.source_lang.clone() },
562                            target_lang: config.target_lang.clone(),
563                        };
564
565                        client
566                            .post(&config.deeplx_api_url)
567                            .header("Content-Type", "application/json")
568                            .header("Accept", "application/json, text/plain, */*")
569                            .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
570                            .json(&request)
571                            .send()
572                            .await
573                            .map_err(|e| {
574                                TranslationError::Custom(format!("DeepLX网络请求失败: {}", e))
575                            })?
576                    } else {
577                        println!("使用标准DeepLX API格式请求");
578
579                        let request = DeepLXRequest {
580                            text: text.clone(),
581                            source_lang: config.source_lang.clone(),
582                            target_lang: config.target_lang.clone(),
583                        };
584
585                        client
586                            .post(&config.deeplx_api_url)
587                            .header("Content-Type", "application/json")
588                            .header("Accept", "application/json")
589                            .json(&request)
590                            .send()
591                            .await
592                            .map_err(|e| {
593                                TranslationError::Custom(format!("DeepLX网络请求失败: {}", e))
594                            })?
595                    };
596
597                    let status = response.status();
598                    println!("DeepLX响应状态: {}", status);
599
600                    if response.status().is_success() {
601                        let response_text = response
602                            .text()
603                            .await
604                            .map_err(|e| TranslationError::Custom(format!("读取响应文本失败: {}", e)))?;
605
606                        if let Ok(result) = serde_json::from_str::<DeepLXResponse>(&response_text) {
607                            if result.code == 200 {
608                                if result.data.is_empty() {
609                                    Err(TranslationError::Custom("DeepLX返回了空的翻译结果".to_string()))
610                                } else {
611                                    Ok(result.data)
612                                }
613                            } else {
614                                Err(TranslationError::ApiError {
615                                    code: result.code,
616                                    message: format!("DeepLX翻译失败,返回代码: {}", result.code)
617                                })
618                            }
619                        } else {
620                            if response_text.trim().is_empty() {
621                                Err(TranslationError::Custom("API返回了空的翻译结果".to_string()))
622                            } else if response_text.starts_with("{") {
623                                if let Ok(json_value) = serde_json::from_str::<serde_json::Value>(&response_text) {
624                                    if let Some(translated) = json_value
625                                        .get("translated_text")
626                                        .or_else(|| json_value.get("result"))
627                                        .or_else(|| json_value.get("translation"))
628                                        .or_else(|| json_value.get("data"))
629                                        .and_then(|v| v.as_str())
630                                    {
631                                        Ok(translated.to_string())
632                                    } else {
633                                        Err(TranslationError::ParseError(format!(
634                                            "无法从JSON响应中提取翻译结果: {}",
635                                            response_text
636                                        )))
637                                    }
638                                } else {
639                                    Err(TranslationError::ParseError(format!("无法解析JSON响应: {}", response_text)))
640                                }
641                            } else {
642                                println!("假设响应是纯文本翻译结果");
643                                Ok(response_text)
644                            }
645                        }
646                    } else {
647                        let error_text = response
648                            .text()
649                            .await
650                            .unwrap_or_else(|_| "无法读取错误信息".to_string());
651                        Err(TranslationError::ApiError {
652                            code: status.as_u16() as i32,
653                            message: format!("DeepLX API请求失败: {} - {}", status, error_text)
654                        })
655                    }
656                })
657            },
658            &retry_config,
659            &self.rate_limiter,
660        )
661        .await?;
662
663        Ok(result)
664    }
665
666    /// 检测chunk是否为代码块
667    fn is_code_block_chunk(&self, chunk: &str) -> bool {
668        chunk.starts_with("__CODE_BLOCK__") || chunk.trim_start().starts_with("```")
669    }
670}