use crate::error::Result;
use std::collections::HashMap;
use std::time::{Duration, Instant};
/// Content-aware compressor that selects a compression strategy per
/// detected PDF content type and accumulates usage statistics.
pub struct IntelligentCompressor {
    /// Strategy lookup table keyed by detected content type.
    strategies: HashMap<ContentType, CompressionStrategy>,
    /// Aggregate and per-type counters updated on every `compress` call.
    stats: CompressionStats,
}
/// Broad classification of a byte buffer, used to pick a compression
/// strategy.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ContentType {
    Text,
    VectorGraphics,
    ImageJpeg,
    ImagePng,
    ImageUncompressed,
    FontData,
    Metadata,
    ContentStream,
    CrossReference,
    Unknown,
}

impl ContentType {
    /// Heuristically classify `data`: binary magic numbers first, then the
    /// fraction of printable ASCII combined with PDF-specific text markers.
    /// Buffers shorter than 4 bytes are always `Unknown`.
    pub fn analyze(data: &[u8]) -> Self {
        if data.len() < 4 {
            return ContentType::Unknown;
        }
        // Binary signatures: JPEG SOI, PNG header, BMP header.
        if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
            return ContentType::ImageJpeg;
        }
        if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
            return ContentType::ImagePng;
        }
        if data.starts_with(&[0x42, 0x4D]) {
            return ContentType::ImageUncompressed;
        }
        // OpenType ("OTTO") and TrueType (0x00010000) font signatures.
        if data.len() > 8
            && (data.starts_with(b"OTTO") || data.starts_with(&[0x00, 0x01, 0x00, 0x00]))
        {
            return ContentType::FontData;
        }
        let printable = data
            .iter()
            .filter(|&&b| b.is_ascii_graphic() || b.is_ascii_whitespace())
            .count();
        let printable_ratio = printable as f32 / data.len() as f32;
        if printable_ratio > 0.8 {
            // Mostly text: refine using PDF-specific markers.
            if Self::has_pdf_operators(data) {
                ContentType::ContentStream
            } else if Self::has_metadata_patterns(data) {
                ContentType::Metadata
            } else {
                ContentType::Text
            }
        } else if printable_ratio > 0.4 {
            ContentType::VectorGraphics
        } else if Self::has_xref_patterns(data) {
            ContentType::CrossReference
        } else {
            ContentType::Unknown
        }
    }
    /// True when `data` contains common PDF content-stream operators.
    fn has_pdf_operators(data: &[u8]) -> bool {
        let text = String::from_utf8_lossy(data);
        [" Td", " Tj", " re", " l", "BT", "ET"]
            .iter()
            .any(|&op| text.contains(op))
    }
    /// True when `data` contains common PDF dictionary metadata keys.
    fn has_metadata_patterns(data: &[u8]) -> bool {
        let text = String::from_utf8_lossy(data);
        ["/Type", "/Creator", "/Producer", "/Title"]
            .iter()
            .any(|&key| text.contains(key))
    }
    /// True when `data` looks like a PDF cross-reference table.
    fn has_xref_patterns(data: &[u8]) -> bool {
        let text = String::from_utf8_lossy(data);
        text.starts_with("xref") || text.contains(" 0 n") || text.contains(" f ")
    }
}
/// How a particular content type should be compressed.
#[derive(Debug, Clone)]
pub struct CompressionStrategy {
    /// Algorithm to apply.
    pub algorithm: CompressionAlgorithm,
    /// Algorithm-specific effort level (flate2: 0–9).
    pub level: u32,
    /// Optional window-size hint in bytes.
    pub window_size: Option<u32>,
    /// Optional substitution dictionary for `Dictionary` compression.
    pub dictionary: Option<Vec<u8>>,
    /// When false, data is passed through untouched.
    pub should_compress: bool,
}
/// Supported compression algorithms.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum CompressionAlgorithm {
    None,
    Flate,
    RunLength,
    LZW,
    Dictionary,
}
impl Default for CompressionStrategy {
    /// Balanced flate compression at level 6.
    fn default() -> Self {
        Self {
            algorithm: CompressionAlgorithm::Flate,
            level: 6,
            window_size: None,
            dictionary: None,
            should_compress: true,
        }
    }
}
impl CompressionStrategy {
    /// Flate at maximum effort (level 9) with a 32 KiB window.
    pub fn max_compression() -> Self {
        Self {
            level: 9,
            window_size: Some(32768),
            ..Self::default()
        }
    }
    /// Flate at minimum effort (level 1) with a 1 KiB window.
    pub fn fast_compression() -> Self {
        Self {
            level: 1,
            window_size: Some(1024),
            ..Self::default()
        }
    }
    /// Pass-through strategy: store the data uncompressed.
    pub fn no_compression() -> Self {
        Self {
            algorithm: CompressionAlgorithm::None,
            level: 0,
            should_compress: false,
            ..Self::default()
        }
    }
    /// Dictionary-based substitution using the supplied dictionary bytes.
    pub fn with_dictionary(dictionary: Vec<u8>) -> Self {
        Self {
            algorithm: CompressionAlgorithm::Dictionary,
            dictionary: Some(dictionary),
            ..Self::default()
        }
    }
}
impl IntelligentCompressor {
/// Build a compressor with the default per-content-type strategy table.
pub fn new() -> Self {
    // Small helper: a Flate strategy at the given level/window.
    let flate = |level: u32, window_size: Option<u32>| CompressionStrategy {
        algorithm: CompressionAlgorithm::Flate,
        level,
        window_size,
        dictionary: None,
        should_compress: true,
    };
    let strategies = HashMap::from([
        (ContentType::Text, CompressionStrategy::max_compression()),
        (ContentType::VectorGraphics, flate(7, Some(16384))),
        (ContentType::ImageJpeg, CompressionStrategy::no_compression()),
        (ContentType::ImagePng, flate(3, Some(8192))),
        (
            ContentType::ImageUncompressed,
            CompressionStrategy::max_compression(),
        ),
        (ContentType::FontData, flate(5, Some(4096))),
        (ContentType::Metadata, flate(4, Some(2048))),
        (ContentType::ContentStream, CompressionStrategy::default()),
        (ContentType::CrossReference, flate(8, Some(1024))),
        (ContentType::Unknown, CompressionStrategy::fast_compression()),
    ]);
    Self {
        strategies,
        stats: CompressionStats::default(),
    }
}
/// Compress `data` using the strategy registered for `content_type`,
/// falling back to the default strategy for unregistered types.
///
/// Buffers of 100 bytes or fewer, and content types whose strategy has
/// `should_compress == false`, are passed through unchanged. Statistics
/// are updated on every call, including pass-throughs.
pub fn compress(&mut self, data: Vec<u8>, content_type: ContentType) -> Result<CompressedData> {
    let start = Instant::now();
    let original_size = data.len();
    let default_strategy = CompressionStrategy::default();
    let strategy = self
        .strategies
        .get(&content_type)
        .unwrap_or(&default_strategy);
    let result = if strategy.should_compress && original_size > 100 {
        self.compress_with_strategy(&data, strategy)?
    } else {
        // Pass-through: keep the original bytes, marked as uncompressed.
        CompressedData {
            data,
            algorithm: CompressionAlgorithm::None,
            original_size,
            compressed_size: original_size,
            compression_time: Duration::ZERO,
        }
    };
    // Measure elapsed time exactly once so the aggregate and per-type
    // stats record the same duration (previously `start.elapsed()` was
    // called twice, producing slightly different values).
    let elapsed = start.elapsed();
    self.stats.total_operations += 1;
    self.stats.total_original_size += original_size;
    self.stats.total_compressed_size += result.compressed_size;
    self.stats.total_compression_time += elapsed;
    let type_stats = self.stats.by_content_type.entry(content_type).or_default();
    type_stats.operations += 1;
    type_stats.original_size += original_size;
    type_stats.compressed_size += result.compressed_size;
    type_stats.compression_time += elapsed;
    Ok(result)
}
/// Dispatch `data` to the algorithm named by `strategy` and wrap the
/// result with timing and size metadata.
fn compress_with_strategy(
    &self,
    data: &[u8],
    strategy: &CompressionStrategy,
) -> Result<CompressedData> {
    let start = Instant::now();
    let original_size = data.len();
    let compressed = match strategy.algorithm {
        CompressionAlgorithm::None => data.to_vec(),
        CompressionAlgorithm::Flate => self.compress_flate(data, strategy.level)?,
        CompressionAlgorithm::RunLength => self.compress_run_length(data)?,
        CompressionAlgorithm::LZW => self.compress_lzw(data)?,
        CompressionAlgorithm::Dictionary => self.compress_with_dictionary(data, strategy)?,
    };
    // Take the length before moving the buffer into the result; the
    // previous code cloned the entire compressed buffer just to read it.
    let compressed_size = compressed.len();
    Ok(CompressedData {
        data: compressed,
        algorithm: strategy.algorithm,
        original_size,
        compressed_size,
        compression_time: start.elapsed(),
    })
}
/// Zlib-compress `data` at the given flate2 `level` (0–9).
fn compress_flate(&self, data: &[u8], level: u32) -> Result<Vec<u8>> {
    use flate2::{write::ZlibEncoder, Compression};
    use std::io::Write;
    let mut zlib = ZlibEncoder::new(Vec::new(), Compression::new(level));
    zlib.write_all(data)?;
    Ok(zlib.finish()?)
}
/// Run-length encode `data` as a flat sequence of (count, value) pairs.
///
/// Every run — including single bytes — is emitted as a pair so the
/// stream is unambiguous and `decompress_run_length` (which decodes
/// fixed pairs) can invert it exactly. The previous implementation
/// emitted runs shorter than 3 as raw literal bytes, which the pair
/// decoder could not distinguish from counts, corrupting data on
/// round-trip. The trade-off is that non-repetitive data may expand up
/// to 2x; RunLength is not used by any default strategy.
fn compress_run_length(&self, data: &[u8]) -> Result<Vec<u8>> {
    let mut compressed = Vec::new();
    let mut i = 0;
    while i < data.len() {
        let current = data[i];
        let mut count = 1;
        // Cap runs at 255 so the count fits in one byte.
        while i + count < data.len() && data[i + count] == current && count < 255 {
            count += 1;
        }
        compressed.push(count as u8);
        compressed.push(current);
        i += count;
    }
    Ok(compressed)
}
fn compress_lzw(&self, data: &[u8]) -> Result<Vec<u8>> {
let mut dictionary: HashMap<Vec<u8>, u16> = HashMap::new();
let mut result = Vec::new();
let mut dict_size = 256u16;
for i in 0..256 {
dictionary.insert(vec![i as u8], i as u16);
}
let mut current = Vec::new();
for &byte in data {
let mut next = current.clone();
next.push(byte);
if dictionary.contains_key(&next) {
current = next;
} else {
if let Some(&code) = dictionary.get(¤t) {
result.push((code >> 8) as u8);
result.push(code as u8);
}
if dict_size < u16::MAX {
dictionary.insert(next, dict_size);
dict_size += 1;
}
current = vec![byte];
}
}
if !current.is_empty() {
if let Some(&code) = dictionary.get(¤t) {
result.push((code >> 8) as u8);
result.push(code as u8);
}
}
Ok(result)
}
/// "Compress" `data` by substituting dictionary-derived patterns with
/// single-byte markers; falls back to plain flate when the strategy has
/// no dictionary.
///
/// NOTE(review): the substitution is not a reversible encoding — the
/// single-byte markers can collide with ordinary data bytes, and
/// replacement only happens on pattern-aligned chunk boundaries, so
/// matches straddling a boundary are missed and `decompress` cannot
/// restore the original bytes. Treat this path as experimental; confirm
/// whether any caller relies on it with a real dictionary.
fn compress_with_dictionary(
    &self,
    data: &[u8],
    strategy: &CompressionStrategy,
) -> Result<Vec<u8>> {
    if let Some(ref dictionary) = strategy.dictionary {
        let mut result = data.to_vec();
        // Each 16-byte dictionary chunk contributes one candidate pattern.
        for (i, dict_entry) in dictionary.chunks(16).enumerate() {
            if dict_entry.len() >= 4 {
                // Use at most the first 8 bytes of the chunk as the pattern.
                let pattern = &dict_entry[0..dict_entry.len().min(8)];
                let replacement = vec![(i % 256) as u8];
                if pattern.len() > replacement.len() {
                    // Rebuild the buffer, replacing only chunks exactly
                    // equal to the pattern (alignment-sensitive).
                    result = result
                        .chunks(pattern.len())
                        .flat_map(|chunk| {
                            if chunk == pattern {
                                replacement.clone()
                            } else {
                                chunk.to_vec()
                            }
                        })
                        .collect();
                }
            }
        }
        Ok(result)
    } else {
        // No dictionary supplied: ordinary flate at the strategy's level.
        self.compress_flate(data, strategy.level)
    }
}
/// Reverse a previous `compress` call according to the recorded algorithm.
pub fn decompress(&self, compressed_data: &CompressedData) -> Result<Vec<u8>> {
    match compressed_data.algorithm {
        CompressionAlgorithm::None => Ok(compressed_data.data.clone()),
        CompressionAlgorithm::Flate => self.decompress_flate(&compressed_data.data),
        CompressionAlgorithm::RunLength => self.decompress_run_length(&compressed_data.data),
        CompressionAlgorithm::LZW => self.decompress_lzw(&compressed_data.data),
        CompressionAlgorithm::Dictionary => {
            // NOTE(review): this only matches compress_with_dictionary's
            // no-dictionary fallback (plain flate). Data produced with an
            // actual dictionary is not inverted here and will likely fail
            // to inflate — confirm the Dictionary path is never used with
            // a real dictionary before relying on round-trips.
            self.decompress_flate(&compressed_data.data)
        }
    }
}
/// Inflate zlib-compressed `data`, enforcing a 256 MB output cap as a
/// guard against decompression bombs.
fn decompress_flate(&self, data: &[u8]) -> Result<Vec<u8>> {
    use flate2::read::ZlibDecoder;
    use std::io::Read;
    const MAX_DECOMPRESSED: usize = 256 * 1024 * 1024;
    let mut decoder = ZlibDecoder::new(data);
    let mut decompressed = Vec::new();
    let mut buffer = [0u8; 16384];
    loop {
        let n = decoder.read(&mut buffer)?;
        if n == 0 {
            break;
        }
        // Check the cap before growing the output buffer.
        if decompressed.len() + n > MAX_DECOMPRESSED {
            return Err(crate::error::PdfError::CompressionError(format!(
                "Decompressed size exceeds {} MB limit",
                MAX_DECOMPRESSED / (1024 * 1024)
            )));
        }
        decompressed.extend_from_slice(&buffer[..n]);
    }
    Ok(decompressed)
}
/// Decode a run-length stream of (count, value) byte pairs; a trailing
/// odd byte is passed through unchanged.
fn decompress_run_length(&self, data: &[u8]) -> Result<Vec<u8>> {
    let mut decompressed = Vec::new();
    for pair in data.chunks(2) {
        match pair {
            // Expand the (count, value) pair into `count` copies of value.
            [count, value] => {
                decompressed.extend(std::iter::repeat(*value).take(*count as usize));
            }
            // Leftover single byte at the end of the stream.
            [byte] => decompressed.push(*byte),
            _ => unreachable!("chunks(2) yields 1- or 2-byte slices"),
        }
    }
    Ok(decompressed)
}
/// Placeholder: LZW decoding is not implemented, so this always returns
/// a CompressionError. Keep `compress_lzw` output out of round-trip
/// paths until a decoder lands.
fn decompress_lzw(&self, _data: &[u8]) -> Result<Vec<u8>> {
    Err(crate::error::PdfError::CompressionError(
        "LZW decompression not fully implemented".to_string(),
    ))
}
/// Read-only view of the accumulated compression statistics.
pub fn stats(&self) -> &CompressionStats {
    &self.stats
}
/// Reset all accumulated statistics to their defaults.
pub fn reset_stats(&mut self) {
    self.stats = CompressionStats::default();
}
/// Compress `data` once and report size/ratio/timing without keeping the
/// compressed bytes; errors are captured as strings instead of returned.
/// Note: this updates the compressor's statistics like a normal
/// `compress` call.
pub fn test_compression(
    &mut self,
    data: &[u8],
    content_type: ContentType,
) -> CompressionTestResult {
    let original_size = data.len();
    let start = Instant::now();
    let result = self.compress(data.to_vec(), content_type);
    let test_time = start.elapsed();
    let test_result = match result {
        Ok(compressed) => Ok(CompressionSuccess {
            compressed_size: compressed.compressed_size,
            // Guard against 0/0 = NaN for empty input; by convention an
            // empty buffer has ratio 1.0, matching
            // CompressedData::compression_ratio.
            compression_ratio: if original_size == 0 {
                1.0
            } else {
                compressed.compressed_size as f64 / original_size as f64
            },
            algorithm_used: compressed.algorithm,
        }),
        Err(e) => Err(e.to_string()),
    };
    CompressionTestResult {
        original_size,
        compression_time: test_time,
        result: test_result,
    }
}
}
impl Default for IntelligentCompressor {
    /// Equivalent to `IntelligentCompressor::new()`.
    fn default() -> Self {
        Self::new()
    }
}
/// Result of a single compression operation, carrying the output bytes
/// plus enough metadata to decompress and to compute metrics.
#[derive(Debug, Clone)]
pub struct CompressedData {
    /// Compressed (or passed-through) bytes.
    pub data: Vec<u8>,
    /// Algorithm actually applied; `None` means `data` is the original.
    pub algorithm: CompressionAlgorithm,
    /// Input size in bytes.
    pub original_size: usize,
    /// Output size in bytes (equals `data.len()`).
    pub compressed_size: usize,
    /// Wall-clock time spent compressing (zero for pass-through).
    pub compression_time: Duration,
}
impl CompressedData {
    /// Compressed/original size ratio; defined as 1.0 for empty input.
    pub fn compression_ratio(&self) -> f64 {
        match self.original_size {
            0 => 1.0,
            n => self.compressed_size as f64 / n as f64,
        }
    }
    /// Bytes saved; negative when compression expanded the data.
    pub fn space_saved(&self) -> isize {
        self.original_size as isize - self.compressed_size as isize
    }
    /// Input throughput in MB/s; 0.0 when no time was recorded.
    pub fn throughput_mbps(&self) -> f64 {
        let secs = self.compression_time.as_secs_f64();
        if secs == 0.0 {
            return 0.0;
        }
        (self.original_size as f64 / (1024.0 * 1024.0)) / secs
    }
}
/// Aggregate counters across all `compress` calls, plus a per-type
/// breakdown.
#[derive(Debug, Clone, Default)]
pub struct CompressionStats {
    /// Total number of compress operations (including pass-throughs).
    pub total_operations: u64,
    /// Sum of input sizes in bytes.
    pub total_original_size: usize,
    /// Sum of output sizes in bytes.
    pub total_compressed_size: usize,
    /// Total wall-clock time spent in `compress`.
    pub total_compression_time: Duration,
    /// Per-content-type counters.
    pub by_content_type: HashMap<ContentType, ContentTypeStats>,
}
impl CompressionStats {
    /// Overall compressed/original ratio; 1.0 before any data was seen.
    pub fn compression_ratio(&self) -> f64 {
        match self.total_original_size {
            0 => 1.0,
            n => self.total_compressed_size as f64 / n as f64,
        }
    }
    /// Total bytes saved; negative if compression expanded the data.
    pub fn total_space_saved(&self) -> isize {
        self.total_original_size as isize - self.total_compressed_size as isize
    }
    /// Average input throughput in MB/s; 0.0 before any timed work.
    pub fn average_throughput_mbps(&self) -> f64 {
        let secs = self.total_compression_time.as_secs_f64();
        if secs == 0.0 {
            return 0.0;
        }
        (self.total_original_size as f64 / (1024.0 * 1024.0)) / secs
    }
    /// Content type with the lowest (best) compression ratio, if any.
    pub fn best_compression_type(&self) -> Option<(ContentType, f64)> {
        self.by_content_type
            .iter()
            .map(|(&content_type, stats)| (content_type, stats.compression_ratio()))
            .min_by(|(_, a), (_, b)| a.total_cmp(b))
    }
    /// Human-readable multi-line report of the accumulated statistics.
    pub fn summary(&self) -> String {
        let saved_mb = self.total_space_saved() as f64 / (1024.0 * 1024.0);
        let best = match self.best_compression_type() {
            Some((t, r)) => format!("{:?} ({:.1}%)", t, (1.0 - r) * 100.0),
            None => "None".to_string(),
        };
        format!(
            "Compression Stats:\n\
             - Total Operations: {}\n\
             - Original Size: {:.1} MB\n\
             - Compressed Size: {:.1} MB\n\
             - Space Saved: {:.1} MB\n\
             - Compression Ratio: {:.1}%\n\
             - Average Throughput: {:.1} MB/s\n\
             - Best Content Type: {}\n\
             - Content Types Processed: {}",
            self.total_operations,
            self.total_original_size as f64 / (1024.0 * 1024.0),
            self.total_compressed_size as f64 / (1024.0 * 1024.0),
            saved_mb,
            self.compression_ratio() * 100.0,
            self.average_throughput_mbps(),
            best,
            self.by_content_type.len()
        )
    }
}
/// Per-content-type compression counters.
#[derive(Debug, Clone, Default)]
pub struct ContentTypeStats {
    /// Number of compress operations recorded for this content type.
    pub operations: u64,
    /// Total input bytes for this content type.
    pub original_size: usize,
    /// Total output bytes for this content type.
    pub compressed_size: usize,
    /// Total wall-clock time spent compressing this content type.
    pub compression_time: Duration,
}
impl ContentTypeStats {
    /// Compressed/original ratio; 1.0 when no bytes were recorded.
    pub fn compression_ratio(&self) -> f64 {
        match self.original_size {
            0 => 1.0,
            n => self.compressed_size as f64 / n as f64,
        }
    }
}
/// Outcome of `IntelligentCompressor::test_compression`: timing plus
/// either success metrics or a stringified error.
#[derive(Debug)]
pub struct CompressionTestResult {
    /// Input size in bytes.
    pub original_size: usize,
    /// Wall-clock time the trial compression took.
    pub compression_time: Duration,
    /// Success metrics, or the error message on failure.
    pub result: std::result::Result<CompressionSuccess, String>,
}
/// Metrics for a successful trial compression.
#[derive(Debug)]
pub struct CompressionSuccess {
    /// Output size in bytes.
    pub compressed_size: usize,
    /// Compressed/original size ratio (lower is better).
    pub compression_ratio: f64,
    /// Algorithm actually applied.
    pub algorithm_used: CompressionAlgorithm,
}
#[cfg(test)]
mod tests {
    use super::*;
    // Magic-byte and text-heuristic classification.
    #[test]
    fn test_content_type_analysis() {
        let jpeg_data = vec![0xFF, 0xD8, 0xFF, 0xE0];
        assert_eq!(ContentType::analyze(&jpeg_data), ContentType::ImageJpeg);
        let png_data = vec![0x89, 0x50, 0x4E, 0x47];
        assert_eq!(ContentType::analyze(&png_data), ContentType::ImagePng);
        let text_data = b"This is some text content with PDF operators BT ET Td Tj";
        assert_eq!(ContentType::analyze(text_data), ContentType::ContentStream);
        let metadata = b"/Type /Catalog /Creator (Test)";
        assert_eq!(ContentType::analyze(metadata), ContentType::Metadata);
    }
    // Constructor presets carry the expected algorithm/level/flags.
    #[test]
    fn test_compression_strategy_creation() {
        let strategy = CompressionStrategy::default();
        assert_eq!(strategy.algorithm, CompressionAlgorithm::Flate);
        assert_eq!(strategy.level, 6);
        assert!(strategy.should_compress);
        let max_strategy = CompressionStrategy::max_compression();
        assert_eq!(max_strategy.level, 9);
        let fast_strategy = CompressionStrategy::fast_compression();
        assert_eq!(fast_strategy.level, 1);
        let no_compression = CompressionStrategy::no_compression();
        assert!(!no_compression.should_compress);
    }
    // The default strategy table covers the key content types.
    #[test]
    fn test_intelligent_compressor_creation() {
        let compressor = IntelligentCompressor::new();
        assert!(compressor.strategies.contains_key(&ContentType::Text));
        assert!(compressor.strategies.contains_key(&ContentType::ImageJpeg));
        assert!(compressor.strategies.contains_key(&ContentType::FontData));
    }
    // Text input compresses (or at worst stays the same size).
    #[test]
    fn test_compression_basic() {
        let mut compressor = IntelligentCompressor::new();
        let text_data = b"Hello, World! This is some test text data.".to_vec();
        let result = compressor.compress(text_data.clone(), ContentType::Text);
        assert!(result.is_ok());
        let compressed = result.unwrap();
        assert_eq!(compressed.original_size, text_data.len());
        assert!(compressed.compressed_size <= text_data.len());
    }
    // JPEG is already compressed, so it must pass through untouched.
    #[test]
    fn test_jpeg_no_compression() {
        let mut compressor = IntelligentCompressor::new();
        let jpeg_data = vec![0xFF, 0xD8, 0xFF, 0xE0, 1, 2, 3, 4];
        let result = compressor.compress(jpeg_data.clone(), ContentType::ImageJpeg);
        assert!(result.is_ok());
        let compressed = result.unwrap();
        assert_eq!(compressed.algorithm, CompressionAlgorithm::None);
        assert_eq!(compressed.compressed_size, jpeg_data.len());
    }
    // RLE on repetitive data should not expand it.
    #[test]
    fn test_run_length_compression() {
        let compressor = IntelligentCompressor::new();
        let data = vec![1, 1, 1, 1, 2, 3, 3, 3];
        let result = compressor.compress_run_length(&data);
        assert!(result.is_ok());
        let compressed = result.unwrap();
        assert!(compressed.len() <= data.len());
    }
    // Aggregate and per-type stats are updated on each compress call.
    #[test]
    fn test_compression_stats() {
        let mut compressor = IntelligentCompressor::new();
        let text1 = b"First text document".to_vec();
        let text2 = b"Second text document".to_vec();
        let _ = compressor.compress(text1, ContentType::Text);
        let _ = compressor.compress(text2, ContentType::Text);
        let stats = compressor.stats();
        assert_eq!(stats.total_operations, 2);
        assert!(stats.total_original_size > 0);
        assert!(stats.by_content_type.contains_key(&ContentType::Text));
    }
    // Derived metrics (ratio, savings, throughput) from fixed fields.
    #[test]
    fn test_compressed_data_metrics() {
        let compressed = CompressedData {
            data: vec![1, 2, 3],
            algorithm: CompressionAlgorithm::Flate,
            original_size: 10,
            compressed_size: 5,
            compression_time: Duration::from_millis(10),
        };
        assert_eq!(compressed.compression_ratio(), 0.5);
        assert_eq!(compressed.space_saved(), 5);
        assert!(compressed.throughput_mbps() > 0.0);
    }
    // Round-trip through the flate path restores the original bytes.
    #[test]
    fn test_decompression_basic() {
        let mut compressor = IntelligentCompressor::new();
        let original_data = b"Test data for compression and decompression".to_vec();
        let compressed = compressor
            .compress(original_data.clone(), ContentType::Text)
            .unwrap();
        let decompressed = compressor.decompress(&compressed).unwrap();
        assert_eq!(decompressed, original_data);
    }
    // test_compression reports sane metrics for a successful trial.
    #[test]
    fn test_compression_test_result() {
        let mut compressor = IntelligentCompressor::new();
        let test_data = b"Test data for compression testing".as_slice();
        let result = compressor.test_compression(test_data, ContentType::Text);
        assert!(result.result.is_ok());
        assert_eq!(result.original_size, test_data.len());
        if let Ok(success) = &result.result {
            assert!(success.compression_ratio <= 1.0);
        }
        assert!(result.compression_time > Duration::ZERO);
    }
    // summary() mentions the operation count and type breakdown.
    #[test]
    fn test_statistics_summary() {
        let mut compressor = IntelligentCompressor::new();
        let _ = compressor.compress(b"Hello World".to_vec(), ContentType::Text);
        let _ = compressor.compress(vec![0xFF, 0xD8, 0xFF, 0xE0], ContentType::ImageJpeg);
        let stats = compressor.stats();
        let summary = stats.summary();
        assert!(summary.contains("Total Operations: 2"));
        assert!(summary.contains("Content Types Processed:"));
    }
    // Per-type ratio computed from manually populated counters.
    #[test]
    fn test_content_type_stats() {
        let mut stats = ContentTypeStats::default();
        stats.operations = 5;
        stats.original_size = 1000;
        stats.compressed_size = 600;
        assert_eq!(stats.compression_ratio(), 0.6);
    }
}