html_translation_lib/pipeline/batch.rs

use crate::config::TranslationConfig;
use crate::pipeline::collector::TextItem;

/// Groups collected text items into translation batches, bounded by item
/// count and total character count.
pub struct BatchManager {
    /// Maximum number of items per batch.
    batch_size: usize,

    /// Maximum total characters per batch.
    max_chars_per_batch: usize,

    /// Whether to use character-aware (smart) batching.
    smart_batching: bool,
}

impl BatchManager {
    /// Creates a manager using the batch size from the translation config.
    pub fn new(config: &TranslationConfig) -> Self {
        Self {
            batch_size: config.batch_size,
            max_chars_per_batch: 8000,
            smart_batching: true,
        }
    }

    /// Groups items into batches, using smart batching when enabled.
    pub fn create_batches(&self, items: Vec<TextItem>) -> Vec<Batch> {
        if items.is_empty() {
            return Vec::new();
        }

        if self.smart_batching {
            self.create_smart_batches(items)
        } else {
            self.create_simple_batches(items)
        }
    }

    /// Builds batches bounded by both item count and character count.
    fn create_smart_batches(&self, items: Vec<TextItem>) -> Vec<Batch> {
        let mut batches = Vec::new();
        let mut current_batch = Vec::new();
        let mut current_chars = 0;

        for item in items {
            let item_chars = item.text.len();

            // Flush the current batch before adding an item that would
            // exceed either the item limit or the character limit.
            if !current_batch.is_empty()
                && (current_batch.len() >= self.batch_size
                    || current_chars + item_chars > self.max_chars_per_batch)
            {
                batches.push(Batch::new(
                    std::mem::take(&mut current_batch),
                    current_chars,
                    BatchType::Smart,
                    BatchPriority::Normal,
                ));
                current_chars = 0;
            }

            current_batch.push(item);
            current_chars += item_chars;
        }

        if !current_batch.is_empty() {
            batches.push(Batch::new(
                current_batch,
                current_chars,
                BatchType::Smart,
                BatchPriority::Normal,
            ));
        }

        self.optimize_batch_priorities(&mut batches);

        batches
    }

    /// Builds fixed-size batches without a character limit.
    fn create_simple_batches(&self, items: Vec<TextItem>) -> Vec<Batch> {
        items
            .chunks(self.batch_size)
            .map(|chunk| {
                let chars = chunk.iter().map(|item| item.text.len()).sum();
                Batch::new(
                    chunk.to_vec(),
                    chars,
                    BatchType::Simple,
                    BatchPriority::Normal,
                )
            })
            .collect()
    }

    /// Raises the priority of batches that contain titles or important
    /// attributes, then sorts batches from highest to lowest priority.
    fn optimize_batch_priorities(&self, batches: &mut [Batch]) {
        for batch in batches.iter_mut() {
            let has_title = batch.items.iter().any(|item| {
                matches!(item.text_type, crate::pipeline::collector::TextType::Title)
            });

            let has_important_attrs = batch.items.iter().any(|item| {
                matches!(
                    item.text_type,
                    crate::pipeline::collector::TextType::ImageAlt
                        | crate::pipeline::collector::TextType::FormLabel
                )
            });

            if has_title {
                batch.priority = BatchPriority::High;
            } else if has_important_attrs {
                batch.priority = BatchPriority::Medium;
            }

            // Small batches are quick to process, so bump them up one level.
            if batch.items.len() <= 5 {
                batch.priority = match batch.priority {
                    BatchPriority::Low => BatchPriority::Medium,
                    BatchPriority::Normal => BatchPriority::High,
                    other => other,
                };
            }
        }

        batches.sort_by(|a, b| b.priority.cmp(&a.priority));
    }

    /// Estimates total processing time as a fixed per-batch overhead (500 ms)
    /// plus a per-character cost (100 ns), e.g. roughly 500.8 ms for a batch
    /// of 8,000 characters.
    pub fn estimate_processing_time(&self, batches: &[Batch]) -> std::time::Duration {
        let base_time_per_batch = std::time::Duration::from_millis(500);
        let time_per_char = std::time::Duration::from_nanos(100);

        let total_time: std::time::Duration = batches
            .iter()
            .map(|batch| {
                base_time_per_batch + time_per_char * batch.estimated_chars as u32
            })
            .sum();

        total_time
    }
}

/// A group of text items scheduled as a single translation request.
#[derive(Debug, Clone)]
pub struct Batch {
    /// The text items in this batch.
    pub items: Vec<TextItem>,

    /// Estimated total character count of all item texts.
    pub estimated_chars: usize,

    /// How this batch was created.
    pub batch_type: BatchType,

    /// Scheduling priority of this batch.
    pub priority: BatchPriority,

    /// When this batch was created.
    pub created_at: std::time::Instant,
}

impl Batch {
    pub fn new(
        items: Vec<TextItem>,
        estimated_chars: usize,
        batch_type: BatchType,
        priority: BatchPriority,
    ) -> Self {
        Self {
            items,
            estimated_chars,
            batch_type,
            priority,
            created_at: std::time::Instant::now(),
        }
    }

    pub fn size(&self) -> usize {
        self.items.len()
    }

    pub fn is_empty(&self) -> bool {
        self.items.is_empty()
    }

    pub fn average_text_length(&self) -> f32 {
        if self.items.is_empty() {
            0.0
        } else {
            self.estimated_chars as f32 / self.items.len() as f32
        }
    }

    /// Splits this batch into smaller batches when it exceeds either the item
    /// limit or the character limit; otherwise returns a clone of the batch
    /// as the only element.
    pub fn split_if_needed(&self, max_size: usize, max_chars: usize) -> Vec<Batch> {
        if self.items.len() <= max_size && self.estimated_chars <= max_chars {
            return vec![self.clone()];
        }

        let mut result = Vec::new();
        let mut current_items = Vec::new();
        let mut current_chars = 0;

        for item in &self.items {
            let item_chars = item.text.len();

            if (current_items.len() >= max_size || current_chars + item_chars > max_chars)
                && !current_items.is_empty()
            {
                result.push(Batch::new(
                    std::mem::take(&mut current_items),
                    current_chars,
                    self.batch_type,
                    self.priority,
                ));
                current_chars = 0;
            }

            current_items.push(item.clone());
            current_chars += item_chars;
        }

        if !current_items.is_empty() {
            result.push(Batch::new(
                current_items,
                current_chars,
                self.batch_type,
                self.priority,
            ));
        }

        result
    }
}

/// How a batch was constructed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BatchType {
    /// Fixed-size chunks with no character limit.
    Simple,

    /// Bounded by both item count and character count.
    Smart,

    /// Batches produced by additional optimization (not created in this module).
    Optimized,
}

/// Scheduling priority; the derived ordering follows declaration order,
/// so `Low < Normal < Medium < High < Urgent`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum BatchPriority {
    Low,
    Normal,
    Medium,
    High,
    Urgent,
}

impl Default for BatchManager {
    fn default() -> Self {
        Self {
            batch_size: 20,
            max_chars_per_batch: 8000,
            smart_batching: true,
        }
    }
}
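
// A minimal sketch of unit tests for the batching types. Only code paths
// that do not require constructing a `TextItem` are exercised, since that
// type's full definition lives in `pipeline::collector` and is not assumed
// here.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_input_produces_no_batches() {
        let manager = BatchManager::default();
        assert!(manager.create_batches(Vec::new()).is_empty());
    }

    #[test]
    fn priority_ordering_follows_declaration_order() {
        assert!(BatchPriority::Urgent > BatchPriority::High);
        assert!(BatchPriority::High > BatchPriority::Medium);
        assert!(BatchPriority::Medium > BatchPriority::Normal);
        assert!(BatchPriority::Normal > BatchPriority::Low);
    }

    #[test]
    fn empty_batch_reports_zero_average_length() {
        let batch = Batch::new(Vec::new(), 0, BatchType::Simple, BatchPriority::Normal);
        assert!(batch.is_empty());
        assert_eq!(batch.size(), 0);
        assert_eq!(batch.average_text_length(), 0.0);
    }

    #[test]
    fn batch_within_limits_is_not_split() {
        let batch = Batch::new(Vec::new(), 0, BatchType::Smart, BatchPriority::Normal);
        assert_eq!(batch.split_if_needed(10, 1_000).len(), 1);
    }
}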