1use thiserror::Error;
7
/// Tunable parameters for a text processing run.
///
/// `ProcessorConfig::validate` enforces these invariants: `chunk_size` is
/// non-zero, `overlap_size` is strictly smaller than `chunk_size`, and
/// `max_threads`, when set, is non-zero.
///
/// `PartialEq`/`Eq` are derived so two configurations can be compared by
/// value (for example in `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ProcessorConfig {
    /// Size of a single chunk; must be greater than zero to pass validation.
    /// NOTE(review): presumably measured in bytes — confirm against the chunker.
    pub chunk_size: usize,

    /// Input-size threshold related to parallel processing.
    /// NOTE(review): presumably inputs smaller than this are handled
    /// sequentially — confirm against the code that reads this field.
    pub parallel_threshold: usize,

    /// Optional cap on the number of threads; `Some(0)` is rejected by
    /// validation.
    /// NOTE(review): `None` presumably leaves the choice to the runtime — confirm.
    pub max_threads: Option<usize>,

    /// Overlap between chunks; must be strictly smaller than `chunk_size` to
    /// pass validation.
    pub overlap_size: usize,
}
23
24impl Default for ProcessorConfig {
25 fn default() -> Self {
26 Self {
27 chunk_size: 256 * 1024, parallel_threshold: 1024 * 1024, max_threads: None, overlap_size: 256, }
32 }
33}
34
35impl ProcessorConfig {
36 pub fn builder() -> ProcessorConfigBuilder {
38 ProcessorConfigBuilder::new()
39 }
40
41 pub fn small_text() -> Self {
43 Self {
44 chunk_size: 8 * 1024, parallel_threshold: usize::MAX, overlap_size: 64, ..Default::default()
48 }
49 }
50
51 pub fn large_text() -> Self {
53 Self {
54 chunk_size: 512 * 1024, parallel_threshold: 512 * 1024, max_threads: None, overlap_size: 512, }
59 }
60
61 pub fn streaming() -> Self {
63 Self {
64 chunk_size: 32 * 1024, parallel_threshold: 256 * 1024, max_threads: Some(2), overlap_size: 128, }
69 }
70
71 pub fn validate(&self) -> Result<(), ProcessingError> {
73 if self.chunk_size == 0 {
74 return Err(ProcessingError::InvalidConfig {
75 reason: "Chunk size must be greater than 0".to_string(),
76 });
77 }
78
79 if self.overlap_size >= self.chunk_size {
80 return Err(ProcessingError::InvalidConfig {
81 reason: "Overlap size must be less than chunk size".to_string(),
82 });
83 }
84
85 if let Some(threads) = self.max_threads {
86 if threads == 0 {
87 return Err(ProcessingError::InvalidConfig {
88 reason: "Max threads must be greater than 0".to_string(),
89 });
90 }
91 }
92
93 Ok(())
94 }
95}
96
97#[derive(Debug, Error)]
99pub enum ProcessingError {
100 #[error("Text too large for processing: {size} bytes (max: {max} bytes)")]
102 TextTooLarge { size: usize, max: usize },
103
104 #[error("Invalid configuration: {reason}")]
106 InvalidConfig { reason: String },
107
108 #[error("Parallel processing failed")]
110 ParallelError {
111 #[source]
112 source: Box<dyn std::error::Error + Send + Sync>,
113 },
114
115 #[error("Invalid UTF-8 in text at position {position}")]
117 Utf8Error { position: usize },
118
119 #[error("Failed to calculate chunk boundaries: {reason}")]
121 ChunkingError { reason: String },
122
123 #[error("Failed to find UTF-8 boundary at position {position}")]
125 Utf8BoundaryError { position: usize },
126
127 #[error("Failed to find word boundary near position {position}")]
129 WordBoundaryError { position: usize },
130
131 #[error("Invalid chunk boundaries: start={start}, end={end}, next={next}")]
133 InvalidChunkBoundaries {
134 start: usize,
135 end: usize,
136 next: usize,
137 },
138
139 #[error("Memory allocation failed: {reason}")]
141 AllocationError { reason: String },
142
143 #[error("I/O operation failed")]
145 IoError {
146 #[from]
147 source: std::io::Error,
148 },
149
150 #[error("Language rules processing failed: {reason}")]
152 LanguageRulesError { reason: String },
153
154 #[error("Other error: {0}")]
156 Other(String),
157}
158
159pub type ProcessingResult<T> = Result<T, ProcessingError>;
161
162#[derive(Debug, Clone)]
164pub struct ProcessorConfigBuilder {
165 config: ProcessorConfig,
166}
167
168impl ProcessorConfigBuilder {
169 pub fn new() -> Self {
171 Self {
172 config: ProcessorConfig::default(),
173 }
174 }
175
176 pub fn chunk_size(mut self, size: usize) -> Self {
178 self.config.chunk_size = size;
179 self
180 }
181
182 pub fn parallel_threshold(mut self, threshold: usize) -> Self {
184 self.config.parallel_threshold = threshold;
185 self
186 }
187
188 pub fn max_threads(mut self, threads: Option<usize>) -> Self {
190 self.config.max_threads = threads;
191 self
192 }
193
194 pub fn overlap_size(mut self, size: usize) -> Self {
196 self.config.overlap_size = size;
197 self
198 }
199
200 pub fn build(self) -> ProcessingResult<ProcessorConfig> {
202 self.config.validate()?;
203 Ok(self.config)
204 }
205
206 pub fn build_unchecked(self) -> ProcessorConfig {
208 self.config
209 }
210}
211
212impl Default for ProcessorConfigBuilder {
213 fn default() -> Self {
214 Self::new()
215 }
216}
217
/// Timing and volume counters describing one processing run.
///
/// All fields default to zero.
#[derive(Debug, Clone, Default)]
pub struct ProcessingMetrics {
    /// Total elapsed time, in microseconds.
    pub total_time_us: u64,

    /// Time attributed to chunking (microseconds, per the `_us` suffix).
    pub chunking_time_us: u64,

    /// Time attributed to the parallel phase (microseconds, per the `_us`
    /// suffix).
    pub parallel_time_us: u64,

    /// Time attributed to merging (microseconds, per the `_us` suffix).
    pub merge_time_us: u64,

    /// Number of chunks.
    pub chunk_count: usize,

    /// Number of threads; values of 0 or 1 make `parallel_efficiency`
    /// return 1.0.
    pub thread_count: usize,

    /// Number of bytes processed.
    pub bytes_processed: usize,

    /// Number of boundaries found.
    pub boundaries_found: usize,
}

impl ProcessingMetrics {
    /// Throughput as `bytes_processed` / (1024 * 1024) per second of
    /// `total_time_us`.
    ///
    /// Returns `0.0` when `total_time_us` is zero, which avoids dividing by
    /// zero.
    pub fn throughput_mbps(&self) -> f64 {
        const BYTES_PER_UNIT: f64 = 1024.0 * 1024.0;
        const MICROS_PER_SECOND: f64 = 1_000_000.0;

        match self.total_time_us {
            0 => 0.0,
            elapsed_us => {
                let volume = self.bytes_processed as f64 / BYTES_PER_UNIT;
                let elapsed_s = elapsed_us as f64 / MICROS_PER_SECOND;
                volume / elapsed_s
            }
        }
    }

    /// Ratio of the ideal time (`total_time_us / thread_count`) to the
    /// measured `parallel_time_us`, capped at `1.0`.
    ///
    /// Returns `1.0` when `thread_count` is 0 or 1, or when
    /// `parallel_time_us` is zero.
    pub fn parallel_efficiency(&self) -> f64 {
        let has_parallel_sample = self.thread_count > 1 && self.parallel_time_us != 0;
        if !has_parallel_sample {
            return 1.0;
        }

        let ideal_us = self.total_time_us as f64 / self.thread_count as f64;
        let ratio = ideal_us / self.parallel_time_us as f64;
        ratio.min(1.0)
    }
}
271
/// Settings for a worker thread pool.
///
/// Only compiled when the `parallel` feature is enabled.
#[cfg(feature = "parallel")]
#[derive(Debug, Clone)]
pub struct ThreadPoolConfig {
    /// Number of threads in the pool.
    pub num_threads: usize,

    /// Optional per-thread stack size.
    /// NOTE(review): `None` presumably means the platform default — confirm
    /// against the code that builds the pool.
    pub stack_size: Option<usize>,

    /// Prefix used when naming the threads.
    pub thread_name_prefix: String,
}

#[cfg(feature = "parallel")]
impl Default for ThreadPoolConfig {
    /// Uses the value of `num_cpus::get()` as the thread count, no explicit
    /// stack size, and the `"sakurs-worker"` name prefix.
    fn default() -> Self {
        let num_threads = num_cpus::get();

        ThreadPoolConfig {
            thread_name_prefix: String::from("sakurs-worker"),
            stack_size: None,
            num_threads,
        }
    }
}
296
#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration carries the documented sizes and validates.
    #[test]
    fn test_default_config() {
        let defaults = ProcessorConfig::default();

        assert_eq!(defaults.chunk_size, 256 * 1024);
        assert_eq!(defaults.parallel_threshold, 1024 * 1024);
        assert!(defaults.validate().is_ok());
    }

    /// Each invalid configuration below breaks exactly one validation rule.
    #[test]
    fn test_config_validation() {
        let invalid_configs = [
            // Zero-sized chunks.
            ProcessorConfig {
                chunk_size: 0,
                ..Default::default()
            },
            // Overlap larger than the chunk itself.
            ProcessorConfig {
                chunk_size: 1024,
                overlap_size: 2048,
                ..Default::default()
            },
            // A thread cap of zero.
            ProcessorConfig {
                max_threads: Some(0),
                ..Default::default()
            },
        ];

        for candidate in &invalid_configs {
            assert!(candidate.validate().is_err());
        }
    }

    /// Spot-checks one distinguishing field (two for `small_text`) per preset.
    #[test]
    fn test_preset_configs() {
        let small_preset = ProcessorConfig::small_text();
        assert_eq!(small_preset.chunk_size, 8 * 1024);
        assert_eq!(small_preset.parallel_threshold, usize::MAX);

        let large_preset = ProcessorConfig::large_text();
        assert_eq!(large_preset.chunk_size, 512 * 1024);

        let streaming_preset = ProcessorConfig::streaming();
        assert_eq!(streaming_preset.max_threads, Some(2));
    }

    /// Throughput and parallel efficiency for hand-computed inputs.
    #[test]
    fn test_processing_metrics() {
        const TEN_MEBI: usize = 10 * 1024 * 1024;
        const ONE_SECOND_US: u64 = 1_000_000;

        let sequential = ProcessingMetrics {
            total_time_us: ONE_SECOND_US,
            bytes_processed: TEN_MEBI,
            ..Default::default()
        };
        // 10 * 1024 * 1024 bytes in one second is exactly 10.0.
        assert_eq!(sequential.throughput_mbps(), 10.0);

        let parallel = ProcessingMetrics {
            parallel_time_us: 300_000,
            thread_count: 4,
            total_time_us: ONE_SECOND_US,
            bytes_processed: TEN_MEBI,
            ..Default::default()
        };
        // Ideal time is 250_000 us against 300_000 us measured, about 0.83.
        assert!(parallel.parallel_efficiency() > 0.8);
    }
}