1use crate::performance::{start_timer, update_compression_ratio};
4use crate::types::{FlateDecodeParams, LZWDecodeParams, PdfStream, StreamFilter};
5use serde::{Deserialize, Serialize};
6use std::collections::HashMap;
7use std::io::Write;
8
/// Tunable settings consumed by [`AdvancedCompressor`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressionConfig {
    /// Effort level; `Adaptive` makes `compress_stream` try several candidate algorithms.
    pub level: CompressionLevel,
    /// Ordered candidate list used in adaptive mode; overwritten by
    /// `AdvancedCompressor::optimize_for_content`.
    pub algorithm_preference: Vec<CompressionAlgorithm>,
    /// Not read by any code visible in this file.
    /// NOTE(review): presumably a minimum-gain cut-off for adaptive mode — confirm.
    pub adaptive_threshold: f64,
    /// Streams whose length is below this many bytes are returned uncompressed.
    pub min_size_for_compression: usize,
    /// Not read by any code visible in this file.
    pub enable_multi_pass: bool,
    /// Not read by any code visible in this file.
    pub enable_dictionary_optimization: bool,
    /// Not read by any code visible in this file.
    pub enable_predictor_optimization: bool,
}
19
/// Requested compression effort.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum CompressionLevel {
    /// NOTE(review): the name suggests pass-through; confirm against `compress_stream`.
    None,
    /// Mapped to `flate2::Compression::fast()` by the Flate encoder.
    Fast,
    /// The level chosen by `CompressionConfig::default()`.
    Balanced,
    /// Mapped to `flate2::Compression::best()` by the Flate encoder.
    Best,
    /// Makes `compress_stream` try every candidate algorithm and keep the smallest output.
    Adaptive,
}
28
/// Identifies a compression scheme.
///
/// Only `Flate`, `LZW` and `RunLength` have encoders in this file; every other
/// variant is rejected by `AdvancedCompressor::standard_compress`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum CompressionAlgorithm {
    /// zlib/deflate, emitted with a `FlateDecode` filter.
    Flate,
    /// LZW, emitted with an `LZWDecode` filter.
    LZW,
    /// Byte run-length coding, emitted with a `RunLengthDecode` filter.
    RunLength,
    /// No encoder in this file.
    CCITT,
    /// No encoder in this file.
    JBIG2,
    /// No encoder in this file; selected for image content.
    DCT,
    /// No encoder in this file; offered as an adaptive candidate for image content.
    JPX,
    /// Free-form tag; `Custom(0)` marks results that were passed through uncompressed.
    Custom(u8),
}
40
/// Outcome of compressing one stream.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressionResult {
    /// Input length in bytes.
    pub original_size: usize,
    /// Output length in bytes.
    pub compressed_size: usize,
    /// `compressed_size / original_size`; smaller is better, `1.0` for pass-through.
    pub ratio: f64,
    /// Algorithm that produced the output.
    pub algorithm: CompressionAlgorithm,
    /// Decode filters a reader must apply; empty for pass-through results.
    pub filters: Vec<StreamFilter>,
    /// Set by `compress_stream` from its timer; the individual encoders leave it at 0.
    pub processing_time_ms: u64,
    /// Value computed by `calculate_quality_score`, never above `1.0`.
    pub quality_score: f64,
}
51
/// Running totals accumulated by `AdvancedCompressor::update_stats`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CompressionStats {
    /// Sum of `original_size` over all recorded results.
    pub total_original_bytes: u64,
    /// Sum of `compressed_size` over all recorded results.
    pub total_compressed_bytes: u64,
    /// `total_compressed_bytes / total_original_bytes`.
    pub overall_ratio: f64,
    /// Per-algorithm aggregates, keyed by the algorithm of each recorded result.
    pub algorithm_performance: HashMap<CompressionAlgorithm, AlgorithmStats>,
    /// Smoothed compression ratio per content type, keyed by the `Debug` name of `ContentType`.
    pub content_type_ratios: HashMap<String, f64>,
}
60
/// Aggregated results for a single algorithm.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct AlgorithmStats {
    /// Number of recorded results.
    pub uses: u64,
    /// Sum of input sizes in bytes.
    pub total_original: u64,
    /// Sum of output sizes in bytes.
    pub total_compressed: u64,
    /// `total_compressed / total_original` (size-weighted, not a mean of per-stream ratios).
    pub average_ratio: f64,
    /// Not written by `update_stats` in this file; stays at its default of 0.0.
    pub average_time_ms: f64,
    /// Smallest per-stream ratio seen so far.
    pub best_ratio: f64,
    /// Largest per-stream ratio seen so far.
    pub worst_ratio: f64,
}
71
72impl Default for CompressionConfig {
73 fn default() -> Self {
74 Self {
75 level: CompressionLevel::Balanced,
76 algorithm_preference: vec![
77 CompressionAlgorithm::Flate,
78 CompressionAlgorithm::LZW,
79 CompressionAlgorithm::RunLength,
80 ],
81 adaptive_threshold: 0.1,
82 min_size_for_compression: 100,
83 enable_multi_pass: true,
84 enable_dictionary_optimization: true,
85 enable_predictor_optimization: true,
86 }
87 }
88}
89
/// Stream compressor that classifies content, picks an algorithm and records statistics.
pub struct AdvancedCompressor {
    /// Active settings; `algorithm_preference` may be rewritten by `optimize_for_content`.
    config: CompressionConfig,
    /// Totals updated after every stream that is actually compressed.
    stats: CompressionStats,
    /// Classifies stream bytes into a `ContentType`.
    content_analyzer: ContentAnalyzer,
    /// Supplies the algorithm ranking used by `optimize_for_content`.
    adaptive_engine: AdaptiveCompressionEngine,
}
96
97impl AdvancedCompressor {
98 pub fn new(config: CompressionConfig) -> Self {
99 Self {
100 config,
101 stats: CompressionStats::default(),
102 content_analyzer: ContentAnalyzer::new(),
103 adaptive_engine: AdaptiveCompressionEngine::new(),
104 }
105 }
106
107 pub fn compress_stream(&mut self, stream: &PdfStream) -> Result<CompressionResult, String> {
108 let timer = start_timer("stream_compression");
109
110 if stream.data.len() < self.config.min_size_for_compression {
111 return Ok(CompressionResult {
112 original_size: stream.data.len(),
113 compressed_size: stream.data.len(),
114 ratio: 1.0,
115 algorithm: CompressionAlgorithm::Custom(0),
116 filters: vec![],
117 processing_time_ms: timer.finish(),
118 quality_score: 1.0,
119 });
120 }
121
122 let data_bytes = stream
123 .data
124 .as_bytes()
125 .ok_or_else(|| "Cannot access lazy stream data".to_string())?;
126 let content_type = self.content_analyzer.analyze_content(data_bytes);
127 let best_algorithm = self.select_optimal_algorithm(data_bytes, &content_type);
128
129 let result = match self.config.level {
130 CompressionLevel::Adaptive => self.adaptive_compress(stream, &content_type),
131 _ => self.standard_compress(stream, best_algorithm),
132 }?;
133
134 self.update_stats(&result, &content_type);
135 update_compression_ratio(result.ratio);
136
137 let elapsed = timer.finish();
138 Ok(CompressionResult {
139 processing_time_ms: elapsed,
140 ..result
141 })
142 }
143
144 fn select_optimal_algorithm(
145 &self,
146 data: &[u8],
147 content_type: &ContentType,
148 ) -> CompressionAlgorithm {
149 match content_type {
150 ContentType::Text => CompressionAlgorithm::Flate,
151 ContentType::Image => CompressionAlgorithm::DCT,
152 ContentType::Vector => CompressionAlgorithm::Flate,
153 ContentType::Binary => CompressionAlgorithm::LZW,
154 ContentType::Structured => CompressionAlgorithm::Flate,
155 ContentType::Unknown => {
156 if self.has_repetitive_patterns(data) {
157 CompressionAlgorithm::LZW
158 } else {
159 CompressionAlgorithm::Flate
160 }
161 }
162 }
163 }
164
165 fn has_repetitive_patterns(&self, data: &[u8]) -> bool {
166 if data.len() < 1000 {
167 return false;
168 }
169
170 let mut byte_counts = [0u32; 256];
171 for &byte in data.iter().take(1000) {
172 byte_counts[byte as usize] += 1;
173 }
174
175 let max_count = byte_counts.iter().max().unwrap_or(&0);
176 *max_count > 100
177 }
178
179 fn standard_compress(
180 &self,
181 stream: &PdfStream,
182 algorithm: CompressionAlgorithm,
183 ) -> Result<CompressionResult, String> {
184 let data_bytes = stream
185 .data
186 .as_bytes()
187 .ok_or_else(|| "Cannot access lazy stream data".to_string())?;
188 match algorithm {
189 CompressionAlgorithm::Flate => self.compress_flate(data_bytes),
190 CompressionAlgorithm::LZW => self.compress_lzw(data_bytes),
191 CompressionAlgorithm::RunLength => self.compress_run_length(data_bytes),
192 _ => Err(format!(
193 "Unsupported compression algorithm: {:?}",
194 algorithm
195 )),
196 }
197 }
198
199 fn adaptive_compress(
200 &mut self,
201 stream: &PdfStream,
202 content_type: &ContentType,
203 ) -> Result<CompressionResult, String> {
204 let candidates = self.get_algorithm_candidates(content_type);
205 let mut best_result = None;
206 let mut best_ratio = f64::INFINITY;
207
208 for algorithm in candidates {
209 if let Ok(result) = self.standard_compress(stream, algorithm) {
210 if result.ratio < best_ratio {
211 best_ratio = result.ratio;
212 best_result = Some(result);
213 }
214 }
215 }
216
217 best_result.ok_or_else(|| "No compression algorithm succeeded".to_string())
218 }
219
220 fn get_algorithm_candidates(&self, content_type: &ContentType) -> Vec<CompressionAlgorithm> {
221 let mut candidates = self.config.algorithm_preference.clone();
222
223 match content_type {
224 ContentType::Image => {
225 candidates.insert(0, CompressionAlgorithm::DCT);
226 candidates.insert(1, CompressionAlgorithm::JPX);
227 }
228 ContentType::Text | ContentType::Structured => {
229 candidates.insert(0, CompressionAlgorithm::Flate);
230 }
231 ContentType::Binary => {
232 candidates.insert(0, CompressionAlgorithm::LZW);
233 }
234 _ => {}
235 }
236
237 candidates
238 }
239
240 fn compress_flate(&self, data: &[u8]) -> Result<CompressionResult, String> {
241 use flate2::write::ZlibEncoder;
242 use flate2::Compression;
243 use std::io::Write;
244
245 let compression_level = match self.config.level {
246 CompressionLevel::Fast => Compression::fast(),
247 CompressionLevel::Best => Compression::best(),
248 _ => Compression::default(),
249 };
250
251 let mut encoder = ZlibEncoder::new(Vec::new(), compression_level);
252 encoder.write_all(data).map_err(|e| e.to_string())?;
253 let compressed = encoder.finish().map_err(|e| e.to_string())?;
254
255 Ok(CompressionResult {
256 original_size: data.len(),
257 compressed_size: compressed.len(),
258 ratio: compressed.len() as f64 / data.len() as f64,
259 algorithm: CompressionAlgorithm::Flate,
260 filters: vec![StreamFilter::FlateDecode(FlateDecodeParams::default())],
261 processing_time_ms: 0,
262 quality_score: self.calculate_quality_score(data, &compressed),
263 })
264 }
265
266 fn compress_lzw(&self, data: &[u8]) -> Result<CompressionResult, String> {
267 let compressed = encode_lzw_pdf(data, true)?;
268
269 Ok(CompressionResult {
270 original_size: data.len(),
271 compressed_size: compressed.len(),
272 ratio: compressed.len() as f64 / data.len() as f64,
273 algorithm: CompressionAlgorithm::LZW,
274 filters: vec![StreamFilter::LZWDecode(LZWDecodeParams {
275 early_change: Some(true),
276 ..LZWDecodeParams::default()
277 })],
278 processing_time_ms: 0,
279 quality_score: self.calculate_quality_score(data, &compressed),
280 })
281 }
282
283 fn compress_run_length(&self, data: &[u8]) -> Result<CompressionResult, String> {
284 let mut compressed = Vec::new();
285 let mut i = 0;
286
287 while i < data.len() {
288 let current_byte = data[i];
289 let mut run_length = 1;
290
291 while i + run_length < data.len()
292 && data[i + run_length] == current_byte
293 && run_length < 128
294 {
295 run_length += 1;
296 }
297
298 if run_length > 1 {
299 compressed.push((257 - run_length) as u8);
300 compressed.push(current_byte);
301 i += run_length;
302 } else {
303 let mut literal_run = 0;
304 let start_i = i;
305
306 while i < data.len() && literal_run < 128 {
307 if i + 1 < data.len() && data[i] == data[i + 1] {
308 if literal_run > 0 {
309 break;
310 }
311 run_length = 2;
312 while i + run_length < data.len()
313 && data[i + run_length] == data[i]
314 && run_length < 128
315 {
316 run_length += 1;
317 }
318 if run_length > 2 {
319 break;
320 }
321 }
322 i += 1;
323 literal_run += 1;
324 }
325
326 if literal_run > 0 {
327 compressed.push((literal_run - 1) as u8);
328 compressed.extend_from_slice(&data[start_i..start_i + literal_run]);
329 }
330 }
331 }
332
333 compressed.push(128);
334
335 Ok(CompressionResult {
336 original_size: data.len(),
337 compressed_size: compressed.len(),
338 ratio: compressed.len() as f64 / data.len() as f64,
339 algorithm: CompressionAlgorithm::RunLength,
340 filters: vec![StreamFilter::RunLengthDecode],
341 processing_time_ms: 0,
342 quality_score: self.calculate_quality_score(data, &compressed),
343 })
344 }
345
346 fn calculate_quality_score(&self, original: &[u8], compressed: &[u8]) -> f64 {
347 let ratio = compressed.len() as f64 / original.len() as f64;
348 let compression_efficiency = 1.0 - ratio;
349
350 let entropy_original = self.calculate_entropy(original);
351 let theoretical_limit = entropy_original / 8.0;
352
353 let efficiency_score = if theoretical_limit > 0.0 {
354 compression_efficiency / theoretical_limit
355 } else {
356 0.0
357 };
358
359 efficiency_score.min(1.0)
360 }
361
362 fn calculate_entropy(&self, data: &[u8]) -> f64 {
363 let mut counts = [0u32; 256];
364 for &byte in data {
365 counts[byte as usize] += 1;
366 }
367
368 let length = data.len() as f64;
369 let mut entropy = 0.0;
370
371 for &count in &counts {
372 if count > 0 {
373 let p = count as f64 / length;
374 entropy -= p * p.log2();
375 }
376 }
377
378 entropy
379 }
380
381 fn update_stats(&mut self, result: &CompressionResult, content_type: &ContentType) {
382 self.stats.total_original_bytes += result.original_size as u64;
383 self.stats.total_compressed_bytes += result.compressed_size as u64;
384 self.stats.overall_ratio =
385 self.stats.total_compressed_bytes as f64 / self.stats.total_original_bytes as f64;
386
387 let algo_stats = self
388 .stats
389 .algorithm_performance
390 .entry(result.algorithm)
391 .or_default();
392
393 algo_stats.uses += 1;
394 algo_stats.total_original += result.original_size as u64;
395 algo_stats.total_compressed += result.compressed_size as u64;
396 algo_stats.average_ratio =
397 algo_stats.total_compressed as f64 / algo_stats.total_original as f64;
398
399 if algo_stats.uses == 1 {
400 algo_stats.best_ratio = result.ratio;
401 algo_stats.worst_ratio = result.ratio;
402 } else {
403 algo_stats.best_ratio = algo_stats.best_ratio.min(result.ratio);
404 algo_stats.worst_ratio = algo_stats.worst_ratio.max(result.ratio);
405 }
406
407 let content_type_key = format!("{:?}", content_type);
408 let content_ratio = self
409 .stats
410 .content_type_ratios
411 .entry(content_type_key)
412 .or_insert(0.0);
413 *content_ratio = (*content_ratio + result.ratio) / 2.0;
414 }
415
    /// Returns the statistics accumulated so far.
    pub fn get_stats(&self) -> &CompressionStats {
        &self.stats
    }
419
    /// Feeds labelled samples to the adaptive engine, then replaces the configured
    /// algorithm preference with the engine's current ranking.
    pub fn optimize_for_content(&mut self, content_samples: &[(Vec<u8>, ContentType)]) {
        self.adaptive_engine.train(content_samples);
        // Must run after training so the ranking reflects the updated engine state.
        self.update_algorithm_preferences();
    }
424
425 fn update_algorithm_preferences(&mut self) {
426 let recommendations = self.adaptive_engine.get_recommendations();
427 self.config.algorithm_preference = recommendations;
428 }
429}
430
431fn encode_lzw_pdf(data: &[u8], early_change: bool) -> Result<Vec<u8>, String> {
432 use lzw::BitWriter;
433
434 let mut compressed = Vec::new();
435 {
436 let mut writer = lzw::MsbWriter::new(&mut compressed);
437 let clear_code: u16 = 256;
438 let end_code: u16 = 257;
439 let mut code_size: u8 = 9;
440 let max_code_size: u8 = 12;
441 let mut next_code: u16 = 258;
442
443 let mut dict: std::collections::HashMap<Vec<u8>, u16> = std::collections::HashMap::new();
444 for i in 0u16..=255 {
445 dict.insert(vec![i as u8], i);
446 }
447
448 writer
449 .write_bits(clear_code, code_size)
450 .map_err(|e| format!("LZW write error: {}", e))?;
451
452 let mut w: Vec<u8> = Vec::new();
453 for &k in data {
454 let mut w_plus = w.clone();
455 w_plus.push(k);
456 if dict.contains_key(&w_plus) {
457 w = w_plus;
458 continue;
459 }
460
461 if !w.is_empty() {
462 let code = *dict.get(&w).ok_or("LZW missing code")?;
463 writer
464 .write_bits(code, code_size)
465 .map_err(|e| format!("LZW write error: {}", e))?;
466 } else {
467 writer
468 .write_bits(k as u16, code_size)
469 .map_err(|e| format!("LZW write error: {}", e))?;
470 }
471
472 if next_code < (1u16 << max_code_size) {
473 dict.insert(w_plus, next_code);
474 next_code += 1;
475 let offset = if early_change { 1 } else { 0 };
476 let threshold = (1u16 << code_size) - 1 - offset;
477 if next_code == threshold && code_size < max_code_size {
478 code_size += 1;
479 }
480 } else {
481 writer
482 .write_bits(clear_code, code_size)
483 .map_err(|e| format!("LZW write error: {}", e))?;
484 dict.clear();
485 for i in 0u16..=255 {
486 dict.insert(vec![i as u8], i);
487 }
488 code_size = 9;
489 next_code = 258;
490 }
491
492 w.clear();
493 w.push(k);
494 }
495
496 if !w.is_empty() {
497 let code = *dict.get(&w).ok_or("LZW missing final code")?;
498 writer
499 .write_bits(code, code_size)
500 .map_err(|e| format!("LZW write error: {}", e))?;
501 }
502
503 writer
504 .write_bits(end_code, code_size)
505 .map_err(|e| format!("LZW write error: {}", e))?;
506 writer
507 .flush()
508 .map_err(|e| format!("LZW flush error: {}", e))?;
509 }
510
511 Ok(compressed)
512}
513
#[cfg(test)]
mod tests {
    use super::*;

    /// Encoding with early change and decoding through the filter pipeline must
    /// reproduce the input.
    #[test]
    fn test_lzw_encode_decode_roundtrip() {
        let plaintext = b"TOBEORNOTTOBEORTOBEORNOT";

        let encoded = encode_lzw_pdf(plaintext, true).expect("LZW encode failed");

        let decode_params = LZWDecodeParams {
            early_change: Some(true),
            ..LZWDecodeParams::default()
        };
        let pipeline = vec![StreamFilter::LZWDecode(decode_params)];
        let roundtripped =
            crate::filters::decode_stream(&encoded, &pipeline).expect("LZW decode failed");

        assert_eq!(roundtripped, plaintext);
    }
}
531
/// Coarse classification of a stream payload, used to pick compression algorithms.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ContentType {
    /// Mostly ASCII bytes, or contains a PDF text operator.
    Text,
    /// Starts with a known image file signature.
    Image,
    /// Dense in PDF path-construction and painting operators.
    Vector,
    /// Fallback classification when no other heuristic matches.
    Binary,
    /// Dense in PDF object and dictionary delimiters.
    Structured,
    /// Never produced by `ContentAnalyzer::analyze_content` in this file.
    Unknown,
}
541
542pub struct ContentAnalyzer {
543 text_patterns: Vec<&'static [u8]>,
544 image_signatures: Vec<&'static [u8]>,
545}
546
547impl Default for ContentAnalyzer {
548 fn default() -> Self {
549 Self::new()
550 }
551}
552
553impl ContentAnalyzer {
554 pub fn new() -> Self {
555 Self {
556 text_patterns: vec![b"BT", b"ET", b"Tf", b"TJ", b"Tj"],
557 image_signatures: vec![
558 b"\xFF\xD8\xFF", b"\x89PNG\r\n\x1A\n", b"GIF87a", b"GIF89a", ],
563 }
564 }
565
566 pub fn analyze_content(&self, data: &[u8]) -> ContentType {
567 if self.is_image_content(data) {
568 ContentType::Image
569 } else if self.is_text_content(data) {
570 ContentType::Text
571 } else if self.is_vector_content(data) {
572 ContentType::Vector
573 } else if self.is_structured_content(data) {
574 ContentType::Structured
575 } else {
576 ContentType::Binary
577 }
578 }
579
580 fn is_image_content(&self, data: &[u8]) -> bool {
581 self.image_signatures
582 .iter()
583 .any(|sig| data.starts_with(sig))
584 }
585
586 fn is_text_content(&self, data: &[u8]) -> bool {
587 if data.is_empty() {
588 return false;
589 }
590
591 let ascii_count = data.iter().take(1000).filter(|&&b| b.is_ascii()).count();
592 let ratio = ascii_count as f64 / data.len().min(1000) as f64;
593
594 ratio > 0.8
595 || self
596 .text_patterns
597 .iter()
598 .any(|pattern| data.windows(pattern.len()).any(|window| window == *pattern))
599 }
600
601 fn is_vector_content(&self, data: &[u8]) -> bool {
602 let vector_ops: &[&[u8]] = &[
603 b"m ", b"l ", b"c ", b"v ", b"y ", b"h ", b"re ", b"S ", b"s ", b"f ", b"F ", b"B ",
604 ];
605 let matches = vector_ops
606 .iter()
607 .map(|op| {
608 data.windows(op.len())
609 .filter(|window| *window == *op)
610 .count()
611 })
612 .sum::<usize>();
613
614 matches > data.len() / 100
615 }
616
617 fn is_structured_content(&self, data: &[u8]) -> bool {
618 let structured_markers: &[&[u8]] = &[b"<<", b">>", b"[", b"]", b"/", b"obj", b"endobj"];
619 let matches = structured_markers
620 .iter()
621 .map(|marker| {
622 data.windows(marker.len())
623 .filter(|window| *window == *marker)
624 .count()
625 })
626 .sum::<usize>();
627
628 matches > data.len() / 50
629 }
630}
631
632pub struct AdaptiveCompressionEngine {
633 algorithm_scores: HashMap<CompressionAlgorithm, f64>,
634 content_type_preferences: HashMap<ContentType, Vec<CompressionAlgorithm>>,
635}
636
637impl Default for AdaptiveCompressionEngine {
638 fn default() -> Self {
639 Self::new()
640 }
641}
642
643impl AdaptiveCompressionEngine {
644 pub fn new() -> Self {
645 let mut algorithm_scores = HashMap::new();
646 algorithm_scores.insert(CompressionAlgorithm::Flate, 0.8);
647 algorithm_scores.insert(CompressionAlgorithm::LZW, 0.7);
648 algorithm_scores.insert(CompressionAlgorithm::RunLength, 0.5);
649 algorithm_scores.insert(CompressionAlgorithm::DCT, 0.9);
650 algorithm_scores.insert(CompressionAlgorithm::JPX, 0.95);
651
652 let mut content_type_preferences = HashMap::new();
653 content_type_preferences.insert(
654 ContentType::Text,
655 vec![CompressionAlgorithm::Flate, CompressionAlgorithm::LZW],
656 );
657 content_type_preferences.insert(
658 ContentType::Image,
659 vec![
660 CompressionAlgorithm::DCT,
661 CompressionAlgorithm::JPX,
662 CompressionAlgorithm::Flate,
663 ],
664 );
665 content_type_preferences.insert(
666 ContentType::Vector,
667 vec![CompressionAlgorithm::Flate, CompressionAlgorithm::LZW],
668 );
669 content_type_preferences.insert(
670 ContentType::Binary,
671 vec![CompressionAlgorithm::LZW, CompressionAlgorithm::Flate],
672 );
673 content_type_preferences.insert(ContentType::Structured, vec![CompressionAlgorithm::Flate]);
674
675 Self {
676 algorithm_scores,
677 content_type_preferences,
678 }
679 }
680
681 pub fn train(&mut self, samples: &[(Vec<u8>, ContentType)]) {
682 for (data, content_type) in samples {
683 self.evaluate_algorithms_for_content(data, content_type);
684 }
685 }
686
687 fn evaluate_algorithms_for_content(&mut self, _data: &[u8], content_type: &ContentType) {
688 if let Some(preferences) = self.content_type_preferences.get_mut(content_type) {
689 preferences.sort_by(|a, b| {
690 let score_a = self.algorithm_scores.get(a).unwrap_or(&0.0);
691 let score_b = self.algorithm_scores.get(b).unwrap_or(&0.0);
692 score_b
693 .partial_cmp(score_a)
694 .unwrap_or(std::cmp::Ordering::Equal)
695 });
696 }
697 }
698
699 pub fn get_recommendations(&self) -> Vec<CompressionAlgorithm> {
700 let mut algorithms: Vec<_> = self.algorithm_scores.iter().collect();
701 algorithms.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));
702 algorithms.into_iter().map(|(algo, _)| *algo).collect()
703 }
704}
705
706pub fn create_optimal_compressor() -> AdvancedCompressor {
707 let config = CompressionConfig {
708 level: CompressionLevel::Adaptive,
709 algorithm_preference: vec![
710 CompressionAlgorithm::Flate,
711 CompressionAlgorithm::LZW,
712 CompressionAlgorithm::DCT,
713 CompressionAlgorithm::RunLength,
714 ],
715 adaptive_threshold: 0.05,
716 min_size_for_compression: 50,
717 enable_multi_pass: true,
718 enable_dictionary_optimization: true,
719 enable_predictor_optimization: true,
720 };
721
722 AdvancedCompressor::new(config)
723}