html_translation_lib/pipeline/
batch.rs

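//! Batch management module.
//!
//! Groups the text collected for translation into batches so that fewer,
//! better-sized API calls are needed.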
4
5use crate::config::TranslationConfig;
6use crate::pipeline::collector::TextItem;
7
/// Groups collected text items into batches for translation requests.
///
/// Build one with `BatchManager::new` (limits taken from a
/// `TranslationConfig`) or `BatchManager::default()`.
#[derive(Debug, Clone)]
pub struct BatchManager {
    /// Maximum number of text items placed in one batch.
    batch_size: usize,

    /// Upper bound on the summed text length of one batch. Only consulted
    /// when `smart_batching` is enabled.
    max_chars_per_batch: usize,

    /// When `true`, batches honour `max_chars_per_batch` in addition to
    /// `batch_size`; when `false`, items are split by count alone.
    smart_batching: bool,
}
19
20impl BatchManager {
21    /// 创建新的批次管理器
22    pub fn new(config: &TranslationConfig) -> Self {
23        Self {
24            batch_size: config.batch_size,
25            max_chars_per_batch: 8000, // API通常有字符限制
26            smart_batching: true,
27        }
28    }
29    
30    /// 创建翻译批次
31    pub fn create_batches(&self, items: Vec<TextItem>) -> Vec<Batch> {
32        if items.is_empty() {
33            return Vec::new();
34        }
35        
36        if self.smart_batching {
37            self.create_smart_batches(items)
38        } else {
39            self.create_simple_batches(items)
40        }
41    }
42    
43    /// 创建智能批次(考虑字符数限制)
44    fn create_smart_batches(&self, items: Vec<TextItem>) -> Vec<Batch> {
45        let mut batches = Vec::new();
46        let mut current_batch = Vec::new();
47        let mut current_chars = 0;
48        
49        for item in items {
50            let item_chars = item.text.len();
51            
52            // 检查是否需要开始新批次
53            if !current_batch.is_empty() && 
54               (current_batch.len() >= self.batch_size || 
55                current_chars + item_chars > self.max_chars_per_batch) {
56                
57                batches.push(Batch::new(
58                    std::mem::take(&mut current_batch),
59                    current_chars,
60                    BatchType::Smart,
61                    BatchPriority::Normal,
62                ));
63                current_chars = 0;
64            }
65            
66            current_batch.push(item);
67            current_chars += item_chars;
68        }
69        
70        // 添加最后一个批次
71        if !current_batch.is_empty() {
72            batches.push(Batch::new(
73                current_batch,
74                current_chars,
75                BatchType::Smart,
76                BatchPriority::Normal,
77            ));
78        }
79        
80        // 优化批次优先级
81        self.optimize_batch_priorities(&mut batches);
82        
83        batches
84    }
85    
86    /// 创建简单批次(仅按数量分割)
87    fn create_simple_batches(&self, items: Vec<TextItem>) -> Vec<Batch> {
88        items
89            .chunks(self.batch_size)
90            .map(|chunk| {
91                let chars = chunk.iter().map(|item| item.text.len()).sum();
92                Batch::new(
93                    chunk.to_vec(),
94                    chars,
95                    BatchType::Simple,
96                    BatchPriority::Normal,
97                )
98            })
99            .collect()
100    }
101    
102    /// 优化批次优先级
103    fn optimize_batch_priorities(&self, batches: &mut [Batch]) {
104        for batch in batches.iter_mut() {
105            // 基于内容类型设置优先级
106            let has_title = batch.items.iter().any(|item| {
107                matches!(item.text_type, crate::pipeline::collector::TextType::Title)
108            });
109            
110            let has_important_attrs = batch.items.iter().any(|item| {
111                matches!(item.text_type, 
112                    crate::pipeline::collector::TextType::ImageAlt |
113                    crate::pipeline::collector::TextType::FormLabel
114                )
115            });
116            
117            if has_title {
118                batch.priority = BatchPriority::High;
119            } else if has_important_attrs {
120                batch.priority = BatchPriority::Medium;
121            }
122            
123            // 小批次优先处理
124            if batch.items.len() <= 5 {
125                batch.priority = match batch.priority {
126                    BatchPriority::Low => BatchPriority::Medium,
127                    BatchPriority::Normal => BatchPriority::High,
128                    other => other,
129                };
130            }
131        }
132        
133        // 按优先级排序
134        batches.sort_by(|a, b| b.priority.cmp(&a.priority));
135    }
136    
137    /// 估算批次处理时间
138    pub fn estimate_processing_time(&self, batches: &[Batch]) -> std::time::Duration {
139        let base_time_per_batch = std::time::Duration::from_millis(500); // 基础API延迟
140        let time_per_char = std::time::Duration::from_nanos(100); // 每字符处理时间
141        
142        let total_time: std::time::Duration = batches
143            .iter()
144            .map(|batch| {
145                base_time_per_batch + time_per_char * batch.estimated_chars as u32
146            })
147            .sum();
148        
149        total_time
150    }
151}
152
153/// 翻译批次
154#[derive(Debug, Clone)]
155pub struct Batch {
156    /// 批次中的文本项
157    pub items: Vec<TextItem>,
158    
159    /// 估计的字符数
160    pub estimated_chars: usize,
161    
162    /// 批次类型
163    pub batch_type: BatchType,
164    
165    /// 批次优先级
166    pub priority: BatchPriority,
167    
168    /// 创建时间
169    pub created_at: std::time::Instant,
170}
171
172impl Batch {
173    /// 创建新批次
174    pub fn new(
175        items: Vec<TextItem>,
176        estimated_chars: usize,
177        batch_type: BatchType,
178        priority: BatchPriority,
179    ) -> Self {
180        Self {
181            items,
182            estimated_chars,
183            batch_type,
184            priority,
185            created_at: std::time::Instant::now(),
186        }
187    }
188    
189    /// 获取批次大小
190    pub fn size(&self) -> usize {
191        self.items.len()
192    }
193    
194    /// 是否为空批次
195    pub fn is_empty(&self) -> bool {
196        self.items.is_empty()
197    }
198    
199    /// 获取批次的平均文本长度
200    pub fn average_text_length(&self) -> f32 {
201        if self.items.is_empty() {
202            0.0
203        } else {
204            self.estimated_chars as f32 / self.items.len() as f32
205        }
206    }
207    
208    /// 拆分批次(如果太大)
209    pub fn split_if_needed(&self, max_size: usize, max_chars: usize) -> Vec<Batch> {
210        if self.items.len() <= max_size && self.estimated_chars <= max_chars {
211            return vec![self.clone()];
212        }
213        
214        let mut result = Vec::new();
215        let mut current_items = Vec::new();
216        let mut current_chars = 0;
217        
218        for item in &self.items {
219            let item_chars = item.text.len();
220            
221            if (current_items.len() >= max_size || current_chars + item_chars > max_chars)
222                && !current_items.is_empty() {
223                    result.push(Batch::new(
224                        std::mem::take(&mut current_items),
225                        current_chars,
226                        self.batch_type,
227                        self.priority,
228                    ));
229                    current_chars = 0;
230                }
231            
232            current_items.push(item.clone());
233            current_chars += item_chars;
234        }
235        
236        if !current_items.is_empty() {
237            result.push(Batch::new(
238                current_items,
239                current_chars,
240                self.batch_type,
241                self.priority,
242            ));
243        }
244        
245        result
246    }
247}
248
/// Strategy that produced a batch.
///
/// Derives `Hash` alongside `Eq` so the type can be used as a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BatchType {
    /// Split by item count only.
    Simple,

    /// Split by item count and by the per-batch length budget.
    Smart,

    /// Grouped with content relatedness in mind.
    Optimized,
}
261
/// Scheduling priority of a batch.
///
/// The derived ordering follows declaration order, so
/// `Low < Normal < Medium < High < Urgent`. Code that sorts batches by
/// priority relies on this; do not reorder the variants. Note that `Normal`
/// ranks below `Medium`.
///
/// Derives `Hash` alongside `Eq` so the type can be used as a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum BatchPriority {
    /// Low priority.
    Low,

    /// Normal priority.
    Normal,

    /// Medium priority.
    Medium,

    /// High priority.
    High,

    /// Urgent priority.
    Urgent,
}
280
281impl Default for BatchManager {
282    fn default() -> Self {
283        Self {
284            batch_size: 20,
285            max_chars_per_batch: 8000,
286            smart_batching: true,
287        }
288    }
289}