use crate::decoders::StreamDecoder;
use crate::error::{Error, Result};
use flate2::read::{DeflateDecoder, ZlibDecoder};
use std::io::Read;
/// Default ceiling on the number of bytes a single FlateDecode stream may
/// decompress to: 256 MiB.
pub const DEFAULT_MAX_DECOMPRESSED_BYTES: u64 = 256 * 1024 * 1024;

/// Resolves the decompression cap, in bytes.
///
/// Reads the `PDF_OXIDE_MAX_DECOMPRESS_MB` environment variable and interprets
/// it as a whole number of megabytes (surrounding whitespace is ignored).
/// Falls back to [`DEFAULT_MAX_DECOMPRESSED_BYTES`] when the variable is unset,
/// not valid Unicode, or not a non-negative integer.
fn effective_limit() -> u64 {
    const BYTES_PER_MB: u64 = 1024 * 1024;

    std::env::var("PDF_OXIDE_MAX_DECOMPRESS_MB")
        .ok()
        .and_then(|raw| raw.trim().parse::<u64>().ok())
        // Saturate rather than multiply directly: a huge value would otherwise
        // panic in debug builds or wrap around (possibly to a tiny or zero cap)
        // in release builds.
        .map(|mb| mb.saturating_mul(BYTES_PER_MB))
        .unwrap_or(DEFAULT_MAX_DECOMPRESSED_BYTES)
}
/// Heuristic used to decide whether partially decompressed bytes are worth
/// keeping.
///
/// Returns `true` when `output` contains any of a fixed set of PDF byte
/// markers, or when at least 80% of its bytes are printable ASCII, tab,
/// line feed or carriage return. Empty input is always rejected.
fn looks_like_real_stream(output: &[u8]) -> bool {
    const MARKERS: &[&[u8]] = &[
        b"BT", b"ET", b"Tj", b"TJ", b"Tm", b"Td", b"stream", b"endobj", b"%PDF-",
    ];

    if output.is_empty() {
        return false;
    }

    // Any single marker occurring anywhere in the buffer is enough.
    let has_marker = MARKERS
        .iter()
        .any(|marker| output.windows(marker.len()).any(|window| window == *marker));
    if has_marker {
        return true;
    }

    // Otherwise require a printable ratio of at least 4/5, compared in
    // integers to avoid floating point.
    let printable_count = output
        .iter()
        .filter(|&&byte| matches!(byte, b'\t' | b'\n' | b'\r' | 0x20..=0x7E))
        .count();
    printable_count * 5 >= output.len() * 4
}
#[inline]
fn check_limit(output: &[u8], limit: u64) -> Result<()> {
if output.len() as u64 >= limit {
return Err(Error::Decode(format!(
"FlateDecode output reached the {} MB safety limit; \
stream may be a flate bomb or an unusually large image",
limit / (1024 * 1024)
)));
}
Ok(())
}
/// Stream decoder for the PDF `FlateDecode` filter (zlib / raw deflate data).
///
/// Every decompression attempt made by `decode` is capped at
/// `max_decompressed_bytes`.
pub struct FlateDecoder {
/// Cap, in bytes, on decompressed output. Decoding fails once the output
/// length reaches this value.
pub max_decompressed_bytes: u64,
}
impl Default for FlateDecoder {
    /// Builds a decoder whose output cap is whatever `effective_limit()`
    /// returns at the time of the call.
    fn default() -> Self {
        let max_decompressed_bytes = effective_limit();
        Self {
            max_decompressed_bytes,
        }
    }
}
impl FlateDecoder {
pub fn with_limit(limit: u64) -> Self {
Self {
max_decompressed_bytes: limit,
}
}
}
impl StreamDecoder for FlateDecoder {
    /// Decompresses a FlateDecode stream, falling back through a series of
    /// recovery strategies when the data is damaged.
    ///
    /// Strategies are tried in this order, and the first one that yields
    /// acceptable output wins:
    ///
    /// 1. Standard zlib.
    /// 2. Raw deflate over the whole input.
    /// 3. Raw deflate after skipping the first two bytes (a presumed corrupt
    ///    zlib header).
    /// 4. Zlib with the compression-method nibble of the first byte forced
    ///    to 8, tried only when that nibble is not already 8.
    /// 5. Raw deflate at each remaining byte offset below 20, accepted only
    ///    if the output contains a PDF text operator.
    ///
    /// For strategies 1-4, output produced before a decoder error is kept as a
    /// partial recovery when `looks_like_real_stream` accepts it.
    ///
    /// # Errors
    ///
    /// Returns `Error::Decode` when any accepted output has reached
    /// `max_decompressed_bytes`, or when every strategy fails.
    fn decode(&self, input: &[u8]) -> Result<Vec<u8>> {
        let limit = self.max_decompressed_bytes;
        let mut output = Vec::new();

        // `looks_like_real_stream` rejects empty buffers itself, so the partial
        // recovery checks below need no separate emptiness test.

        // Strategy 1: well-formed zlib stream.
        let zlib_err = match read_capped(ZlibDecoder::new(input), limit, &mut output) {
            Ok(_) => {
                check_limit(&output, limit)?;
                return Ok(output);
            },
            Err(e) => e,
        };
        if looks_like_real_stream(&output) {
            check_limit(&output, limit)?;
            log::warn!(
                "FlateDecode partial recovery: extracted {} bytes before corruption: {}",
                output.len(),
                zlib_err
            );
            return Ok(output);
        }

        // Strategy 2: raw deflate without any zlib framing.
        log::info!("Zlib decode failed, trying raw deflate");
        let deflate_err = match read_capped(DeflateDecoder::new(input), limit, &mut output) {
            Ok(_) => {
                check_limit(&output, limit)?;
                log::info!("Raw deflate recovery succeeded: {} bytes", output.len());
                return Ok(output);
            },
            Err(e) => e,
        };
        if looks_like_real_stream(&output) {
            check_limit(&output, limit)?;
            log::warn!(
                "Raw deflate partial recovery: extracted {} bytes before error",
                output.len()
            );
            return Ok(output);
        }

        // Strategy 3: raw deflate after dropping a two-byte zlib header.
        if input.len() > 2 {
            log::info!("Trying deflate after skipping potential corrupt zlib header");
            match read_capped(DeflateDecoder::new(&input[2..]), limit, &mut output) {
                Ok(_) => {
                    check_limit(&output, limit)?;
                    log::info!(
                        "Deflate with header skip succeeded: {} bytes",
                        output.len()
                    );
                    return Ok(output);
                },
                Err(_) if looks_like_real_stream(&output) => {
                    check_limit(&output, limit)?;
                    log::warn!(
                        "Deflate with header skip partial recovery: {} bytes",
                        output.len()
                    );
                    return Ok(output);
                },
                Err(_) => {},
            }
        }

        // Strategy 4: repair the compression-method nibble (low 4 bits of the
        // first byte) to 8 and retry as zlib.
        if input.len() >= 2 {
            let first_byte = input[0];
            let compression_method = first_byte & 0x0F;
            if compression_method != 8 {
                log::info!(
                    "Detected invalid compression method {} in header byte 0x{:02x}, trying with corrected header",
                    compression_method,
                    first_byte
                );
                let mut corrected = input.to_vec();
                corrected[0] = (first_byte & 0xF0) | 0x08;
                match read_capped(ZlibDecoder::new(&corrected[..]), limit, &mut output) {
                    Ok(_) if !output.is_empty() => {
                        check_limit(&output, limit)?;
                        log::info!(
                            "Header correction recovery succeeded: {} bytes",
                            output.len()
                        );
                        return Ok(output);
                    },
                    Err(_) if looks_like_real_stream(&output) => {
                        check_limit(&output, limit)?;
                        log::warn!(
                            "Header correction partial recovery: {} bytes",
                            output.len()
                        );
                        return Ok(output);
                    },
                    _ => {
                        log::info!("Header correction failed");
                    },
                }
            }
        }

        // Strategy 5: scan the first few offsets for a decodable deflate
        // stream. Offsets 0 and 2 were already covered by strategies 2 and 3.
        log::info!("Trying brute-force scan for valid deflate data");
        let max_offset = std::cmp::min(20, input.len());
        for offset in (1..max_offset).filter(|&candidate| candidate != 2) {
            let attempt = read_capped(DeflateDecoder::new(&input[offset..]), limit, &mut output);
            if output.is_empty() {
                continue;
            }
            check_limit(&output, limit)?;

            match (attempt.is_ok(), has_text_operators(&output)) {
                (true, true) => {
                    log::info!(
                        "Brute-force deflate recovery succeeded at offset {}: {} bytes (validated PDF content)",
                        offset,
                        output.len()
                    );
                    return Ok(output);
                },
                (false, true) => {
                    log::warn!(
                        "Brute-force partial recovery at offset {}: {} bytes (validated PDF content)",
                        offset,
                        output.len()
                    );
                    return Ok(output);
                },
                (true, false) => {
                    log::info!(
                        "Brute-force at offset {} produced {} bytes but no valid PDF operators - trying next offset",
                        offset,
                        output.len()
                    );
                },
                (false, false) => {
                    log::info!(
                        "Partial recovery at offset {} but no valid PDF operators - trying next offset",
                        offset
                    );
                },
            }
        }

        log::error!(
            "All FlateDecode recovery strategies failed. Zlib: {}, Deflate: {}",
            zlib_err,
            deflate_err
        );
        log::error!(
            "Stream labeled as FlateDecode but cannot be decompressed - this violates PDF spec"
        );
        Err(Error::Decode(format!(
            "FlateDecode decompression failed: stream is labeled as compressed but all decompression attempts failed. \
             This violates PDF Spec ISO 32000-1:2008, Section 7.3.8.2. \
             Zlib error: {}, Deflate error: {}. Compressed size: {} bytes.",
            zlib_err,
            deflate_err,
            input.len()
        )))
    }

    /// Returns the PDF filter name handled by this decoder.
    fn name(&self) -> &str {
        "FlateDecode"
    }
}

/// Clears `output`, then reads from `reader` into it until end of stream, an
/// error, or `limit` bytes, whichever comes first.
///
/// Bytes read before an error remain in `output` so callers can attempt a
/// partial recovery.
fn read_capped<R: Read>(reader: R, limit: u64, output: &mut Vec<u8>) -> std::io::Result<usize> {
    output.clear();
    let mut capped = reader.take(limit);
    capped.read_to_end(output)
}

/// Reports whether `data` contains any of the two-byte PDF text operators
/// `BT`, `ET`, `Tj`, `TJ`, `Tm` or `Td`.
///
/// Works on raw bytes, so no UTF-8 copy of the buffer is made. All operators
/// are ASCII, so this matches exactly what a search over a lossily decoded
/// string would find.
fn has_text_operators(data: &[u8]) -> bool {
    const OPERATORS: &[&[u8]] = &[b"BT", b"ET", b"Tj", b"TJ", b"Tm", b"Td"];
    data.windows(2)
        .any(|pair| OPERATORS.iter().any(|operator| pair == *operator))
}
#[cfg(test)]
mod tests {
    use super::*;
    use flate2::write::ZlibEncoder;
    use flate2::Compression;
    use std::io::Write;

    // --- looks_like_real_stream heuristic ---

    #[test]
    fn looks_like_real_stream_rejects_repeating_garbage() {
        let garbage = b"P\xffj!}\xef\xbd\xbd\xef\xbd\xbd\xef\xbd\xbd\xef\xbd\xbd\xef\xbd\xbd\xef\xbd\xbd\xef\xbd\xbd\xef\xbd\xbd".repeat(4);
        assert!(
            !looks_like_real_stream(&garbage),
            "misaligned-deflate garbage must be rejected as a partial recovery"
        );
    }

    #[test]
    fn looks_like_real_stream_accepts_content_stream_operators() {
        let real = b"BT /F1 12 Tf 100 700 Td (hello) Tj ET";
        assert!(looks_like_real_stream(real));
    }

    #[test]
    fn looks_like_real_stream_accepts_ascii_only_object_stream() {
        let object_stream = b"1 0 obj\n<< /Length 42 >>\nstream\nhello world\nendstream\nendobj\n";
        assert!(looks_like_real_stream(object_stream));
    }

    #[test]
    fn looks_like_real_stream_rejects_empty() {
        assert!(!looks_like_real_stream(&[]));
    }

    // --- round-trip decoding ---

    #[test]
    fn test_flate_decode_simple() {
        let decoder = FlateDecoder::default();
        let original = b"Hello, FlateDecode!";
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(original).unwrap();
        let compressed = encoder.finish().unwrap();
        let decoded = decoder.decode(&compressed).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_flate_decode_empty() {
        let decoder = FlateDecoder::default();
        let original = b"";
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(original).unwrap();
        let compressed = encoder.finish().unwrap();
        let decoded = decoder.decode(&compressed).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_flate_decode_large_data() {
        let decoder = FlateDecoder::default();
        let original = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ".repeat(1000);
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(&original).unwrap();
        let compressed = encoder.finish().unwrap();
        let decoded = decoder.decode(&compressed).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_flate_decode_invalid_data() {
        let decoder = FlateDecoder::default();
        let invalid = b"This is not zlib compressed data";
        let result = decoder.decode(invalid);
        assert!(result.is_err());
        if let Err(e) = result {
            let error_msg = format!("{}", e);
            assert!(error_msg.contains("FlateDecode decompression failed"));
        }
    }

    #[test]
    fn test_flate_decoder_name() {
        let decoder = FlateDecoder::default();
        assert_eq!(decoder.name(), "FlateDecode");
    }

    // --- output cap enforcement ---

    #[test]
    fn test_flate_bomb_rejected() {
        let large = vec![0u8; DEFAULT_MAX_DECOMPRESSED_BYTES as usize];
        let result = check_limit(&large, DEFAULT_MAX_DECOMPRESSED_BYTES);
        assert!(result.is_err());
        let msg = format!("{}", result.unwrap_err());
        assert!(msg.contains("safety limit"));
    }

    #[test]
    fn test_check_limit_below_threshold() {
        let small = vec![0u8; 1024];
        assert!(check_limit(&small, DEFAULT_MAX_DECOMPRESSED_BYTES).is_ok());
    }

    #[test]
    fn test_custom_limit_accepts_data_within_limit() {
        let original = b"x".repeat(512);
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(&original).unwrap();
        let compressed = encoder.finish().unwrap();
        let decoder = FlateDecoder::with_limit(1024);
        let decoded = decoder.decode(&compressed).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_custom_limit_rejects_data_over_limit() {
        let original = b"x".repeat(100);
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(&original).unwrap();
        let compressed = encoder.finish().unwrap();
        let decoder = FlateDecoder::with_limit(10);
        let result = decoder.decode(&compressed);
        assert!(result.is_err(), "expected rejection when output exceeds custom limit");
    }

    #[test]
    fn test_bomb_error_does_not_expose_internal_symbol_name() {
        let large = vec![0u8; DEFAULT_MAX_DECOMPRESSED_BYTES as usize];
        let result = check_limit(&large, DEFAULT_MAX_DECOMPRESSED_BYTES);
        assert!(result.is_err());
        let msg = format!("{}", result.unwrap_err());
        assert!(
            !msg.contains("MAX_DECOMPRESSED_BYTES"),
            "error message must not reference internal symbol names: {msg}"
        );
    }

    // --- environment override ---

    #[test]
    fn test_effective_limit_env_variable() {
        // SAFETY (applies to every block below): mutating the process
        // environment is only sound while no other thread accesses it outside
        // of `std::env`. NOTE(review): other tests in this module read this
        // variable through `FlateDecoder::default()` and the harness may run
        // them concurrently, so they can observe the temporary override.
        // Confirm that is acceptable or serialise these tests.
        unsafe { std::env::remove_var("PDF_OXIDE_MAX_DECOMPRESS_MB") };
        assert_eq!(effective_limit(), DEFAULT_MAX_DECOMPRESSED_BYTES);
        unsafe { std::env::set_var("PDF_OXIDE_MAX_DECOMPRESS_MB", "64") };
        assert_eq!(effective_limit(), 64 * 1024 * 1024);
        unsafe { std::env::set_var("PDF_OXIDE_MAX_DECOMPRESS_MB", "not_a_number") };
        assert_eq!(effective_limit(), DEFAULT_MAX_DECOMPRESSED_BYTES);
        unsafe { std::env::remove_var("PDF_OXIDE_MAX_DECOMPRESS_MB") };
    }
}