use super::objects::{PdfDictionary, PdfObject};
use super::{ParseError, ParseOptions, ParseResult};
#[cfg(feature = "compression")]
use flate2::read::ZlibDecoder;
use std::io::Read;
/// Hard cap on the size of any decompressed stream (256 MB); guards against
/// decompression bombs that would otherwise exhaust memory.
const MAX_DECOMPRESSED_SIZE: usize = 256 * 1024 * 1024;
/// Maximum allowed output/input expansion before a stream is treated as a
/// suspected decompression bomb.
const MAX_COMPRESSION_RATIO: usize = 1000;
/// Read from `reader` until EOF, failing as soon as more than `max_bytes`
/// would be buffered. Used to cap decompressed output and defuse
/// decompression bombs before they exhaust memory.
fn read_to_end_limited<R: Read>(reader: &mut R, max_bytes: usize) -> std::io::Result<Vec<u8>> {
    let mut out = Vec::new();
    let mut chunk = [0u8; 16384];
    loop {
        let n = reader.read(&mut chunk)?;
        if n == 0 {
            // EOF reached within the budget.
            return Ok(out);
        }
        if out.len() + n > max_bytes {
            let msg = format!(
                "Decompressed size exceeds limit of {} bytes ({} MB). Possible decompression bomb.",
                max_bytes,
                max_bytes / (1024 * 1024)
            );
            return Err(std::io::Error::new(std::io::ErrorKind::Other, msg));
        }
        out.extend_from_slice(&chunk[..n]);
    }
}
/// Reject decoded output that expands its input by more than
/// `MAX_COMPRESSION_RATIO`. A zero-length input is never flagged (the ratio
/// is undefined there).
fn check_compression_ratio(input_size: usize, output_size: usize) -> Result<(), std::io::Error> {
    if input_size == 0 {
        return Ok(());
    }
    let ratio = output_size / input_size;
    if ratio <= MAX_COMPRESSION_RATIO {
        return Ok(());
    }
    Err(std::io::Error::new(
        std::io::ErrorKind::Other,
        format!(
            "Suspicious compression ratio {}:1 (input={}B, output={}B). Max allowed ratio is {}:1.",
            ratio, input_size, output_size, MAX_COMPRESSION_RATIO
        ),
    ))
}
use super::filter_impls::ccitt::decode_ccitt;
use super::filter_impls::dct::decode_dct;
use super::filter_impls::jbig2::decode_jbig2;
pub use super::filter_impls::ccitt::decode_ccitt as decode_ccitt_public;
pub use super::filter_impls::dct::{parse_jpeg_info, JpegColorSpace, JpegInfo};
pub use super::filter_impls::jbig2::decode_jbig2 as decode_jbig2_public;
/// Stream filter types a PDF `/Filter` entry can name.
///
/// The enum is a plain fieldless discriminant, so `Copy`, `Eq` and `Hash`
/// are derived in addition to the original traits (clippy:
/// `derive_partial_eq_without_eq`); this lets callers compare without
/// borrowing and use filters as map/set keys. Purely additive, so existing
/// callers are unaffected.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Filter {
    /// `/ASCIIHexDecode`
    ASCIIHexDecode,
    /// `/ASCII85Decode`
    ASCII85Decode,
    /// `/LZWDecode`
    LZWDecode,
    /// `/FlateDecode`
    FlateDecode,
    /// `/RunLengthDecode`
    RunLengthDecode,
    /// `/CCITTFaxDecode`
    CCITTFaxDecode,
    /// `/JBIG2Decode`
    JBIG2Decode,
    /// `/DCTDecode`
    DCTDecode,
    /// `/JPXDecode` (not yet implemented by the decoders in this module)
    JPXDecode,
    /// `/Crypt` (not yet implemented by the decoders in this module)
    Crypt,
}
impl Filter {
    /// Look up the `Filter` variant for a PDF filter name as it appears in a
    /// `/Filter` entry (without the leading slash). Unknown names yield
    /// `None`.
    pub fn from_name(name: &str) -> Option<Self> {
        let table: [(&str, Filter); 10] = [
            ("ASCIIHexDecode", Filter::ASCIIHexDecode),
            ("ASCII85Decode", Filter::ASCII85Decode),
            ("LZWDecode", Filter::LZWDecode),
            ("FlateDecode", Filter::FlateDecode),
            ("RunLengthDecode", Filter::RunLengthDecode),
            ("CCITTFaxDecode", Filter::CCITTFaxDecode),
            ("JBIG2Decode", Filter::JBIG2Decode),
            ("DCTDecode", Filter::DCTDecode),
            ("JPXDecode", Filter::JPXDecode),
            ("Crypt", Filter::Crypt),
        ];
        table
            .into_iter()
            .find(|(candidate, _)| *candidate == name)
            .map(|(_, filter)| filter)
    }
}
/// Decode a stream's raw bytes by applying every filter named in the
/// dictionary's `/Filter` entry, in order. `/DecodeParms` supplies optional
/// per-filter parameters. A stream without a `/Filter` entry passes through
/// unchanged; a `/Filter` value that is neither a name nor an array of
/// names is a syntax error.
pub fn decode_stream(
    data: &[u8],
    dict: &PdfDictionary,
    _options: &ParseOptions,
) -> ParseResult<Vec<u8>> {
    let filter_names: Vec<&str> = match dict.get("Filter") {
        None => return Ok(data.to_vec()),
        Some(PdfObject::Name(name)) => vec![name.as_str()],
        Some(PdfObject::Array(array)) => {
            let mut names = Vec::with_capacity(array.0.len());
            for obj in &array.0 {
                match obj {
                    PdfObject::Name(name) => names.push(name.as_str()),
                    _ => {
                        return Err(ParseError::SyntaxError {
                            position: 0,
                            message: "Invalid filter in array".to_string(),
                        })
                    }
                }
            }
            names
        }
        Some(_) => {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "Invalid Filter type".to_string(),
            })
        }
    };
    let params_obj = dict.get("DecodeParms");
    let mut decoded = data.to_vec();
    for (index, name) in filter_names.iter().enumerate() {
        let filter = Filter::from_name(name).ok_or_else(|| ParseError::SyntaxError {
            position: 0,
            message: format!("Unknown filter: {name}"),
        })?;
        decoded = apply_filter_with_params(&decoded, filter, get_filter_params(params_obj, index))?;
    }
    Ok(decoded)
}
#[allow(dead_code)]
pub(crate) fn apply_filter(data: &[u8], filter: Filter) -> ParseResult<Vec<u8>> {
match filter {
Filter::FlateDecode => decode_flate(data),
Filter::ASCIIHexDecode => decode_ascii_hex(data),
Filter::ASCII85Decode => decode_ascii85(data),
Filter::LZWDecode => decode_lzw(data, None),
Filter::RunLengthDecode => decode_run_length(data),
Filter::CCITTFaxDecode => decode_ccitt(data, None),
Filter::JBIG2Decode => decode_jbig2(data, None),
Filter::DCTDecode => decode_dct(data),
_ => Err(ParseError::SyntaxError {
position: 0,
message: format!("Filter {filter:?} not yet implemented"),
}),
}
}
/// Decode a FlateDecode stream, falling back through progressively more
/// aggressive recovery strategies for corrupt data:
///
/// 1. standard zlib, 2. raw deflate, 3. both with 1–5 leading bytes
///    skipped, 4. both with 1–10 trailing bytes truncated, 5. gzip,
/// 6. partial zlib (keep what decoded before truncation), 7. zlib with a
///    guessed PNG predictor.
///
/// If everything fails, returns an EMPTY vec instead of an error — callers
/// deliberately get best-effort behavior for damaged PDFs. The order of
/// strategies matters; do not reorder.
#[cfg(feature = "compression")]
fn decode_flate(data: &[u8]) -> ParseResult<Vec<u8>> {
    // Fast path: well-formed zlib stream.
    if let Ok(result) = try_standard_zlib_decode(data) {
        return Ok(result);
    }
    // Some writers emit raw deflate without the zlib wrapper.
    if let Ok(result) = try_raw_deflate_decode(data) {
        return Ok(result);
    }
    // Garbage bytes before the stream: try skipping a small prefix.
    if data.len() > 10 {
        for skip_bytes in 1..=5 {
            if let Ok(result) = try_standard_zlib_decode(&data[skip_bytes..]) {
                return Ok(result);
            }
            if let Ok(result) = try_raw_deflate_decode(&data[skip_bytes..]) {
                return Ok(result);
            }
        }
    }
    // Garbage bytes after the stream: try trimming a small suffix.
    if data.len() > 20 {
        for truncate_bytes in 1..=10 {
            let truncated = &data[..data.len() - truncate_bytes];
            if let Ok(result) = try_standard_zlib_decode(truncated) {
                return Ok(result);
            }
            if let Ok(result) = try_raw_deflate_decode(truncated) {
                return Ok(result);
            }
        }
    }
    // Rare, but some streams turn out to be gzip-wrapped.
    if let Ok(result) = try_gzip_decode(data) {
        return Ok(result);
    }
    // Truncated stream: salvage whatever decoded before the cut.
    if let Ok(partial) = try_partial_flate_decode(data) {
        tracing::debug!(
            "Warning: Using partial FlateDecode recovery, {} bytes recovered",
            partial.len()
        );
        return Ok(partial);
    }
    // Last resort: guess a PNG predictor (10..=15) and retry.
    if data.len() > 20 {
        for predictor in [10, 11, 12, 13, 14, 15] {
            if let Ok(result) = try_flate_decode_with_predictor(data, predictor) {
                tracing::debug!(
                    "Warning: FlateDecode succeeded with predictor {}",
                    predictor
                );
                return Ok(result);
            }
        }
    }
    // Give up without erroring: empty output keeps parsing alive.
    tracing::debug!("Warning: All FlateDecode strategies failed, returning empty data");
    Ok(Vec::new())
}
/// Strict zlib-wrapped inflate of `data`, enforcing the global size cap and
/// the compression-ratio bomb check.
#[cfg(feature = "compression")]
fn try_standard_zlib_decode(data: &[u8]) -> Result<Vec<u8>, std::io::Error> {
    let mut decoder = ZlibDecoder::new(data);
    let decoded = read_to_end_limited(&mut decoder, MAX_DECOMPRESSED_SIZE)?;
    check_compression_ratio(data.len(), decoded.len())?;
    Ok(decoded)
}
/// Inflate `data` as a raw deflate stream (no zlib header/trailer), with the
/// same size-cap and ratio checks as the zlib path.
#[cfg(feature = "compression")]
fn try_raw_deflate_decode(data: &[u8]) -> Result<Vec<u8>, std::io::Error> {
    use flate2::read::DeflateDecoder;
    let mut decoder = DeflateDecoder::new(data);
    let decoded = read_to_end_limited(&mut decoder, MAX_DECOMPRESSED_SIZE)?;
    check_compression_ratio(data.len(), decoded.len())?;
    Ok(decoded)
}
/// Inflate `data` as a gzip stream, with the same size-cap and ratio checks
/// as the zlib path.
#[cfg(feature = "compression")]
fn try_gzip_decode(data: &[u8]) -> Result<Vec<u8>, std::io::Error> {
    use flate2::read::GzDecoder;
    let mut decoder = GzDecoder::new(data);
    let decoded = read_to_end_limited(&mut decoder, MAX_DECOMPRESSED_SIZE)?;
    check_compression_ratio(data.len(), decoded.len())?;
    Ok(decoded)
}
/// Salvage as many bytes as possible from a truncated zlib stream: read
/// until EOF or error, treating an `UnexpectedEof` after some output as
/// success (return the partial data) rather than failure.
#[cfg(feature = "compression")]
fn try_partial_flate_decode(data: &[u8]) -> Result<Vec<u8>, std::io::Error> {
    use flate2::read::ZlibDecoder;
    use std::io::ErrorKind;
    let mut decoder = ZlibDecoder::new(data);
    let mut result = Vec::new();
    let mut buffer = [0; 8192];
    loop {
        match decoder.read(&mut buffer) {
            // Clean end of stream.
            Ok(0) => break,
            Ok(n) => {
                // Enforce the same output cap as full decodes.
                if result.len() + n > MAX_DECOMPRESSED_SIZE {
                    return Err(std::io::Error::new(
                        ErrorKind::Other,
                        format!(
                            "Partial decompression exceeds {} MB limit",
                            MAX_DECOMPRESSED_SIZE / (1024 * 1024)
                        ),
                    ));
                }
                result.extend_from_slice(&buffer[..n]);
            }
            Err(e) if e.kind() == ErrorKind::UnexpectedEof => {
                // Truncated input: keep whatever decoded so far, if anything.
                if !result.is_empty() {
                    check_compression_ratio(data.len(), result.len())?;
                    return Ok(result);
                }
                return Err(e);
            }
            Err(e) => return Err(e),
        }
    }
    if result.is_empty() {
        // Stream ended cleanly but produced nothing useful.
        Err(std::io::Error::new(
            ErrorKind::InvalidData,
            "No data decoded",
        ))
    } else {
        check_compression_ratio(data.len(), result.len())?;
        Ok(result)
    }
}
/// Inflate `data`, then — for PNG predictors 10..=15 — run the heuristic PNG
/// row-unfiltering pass over the raw output. Other predictor values return
/// the inflated bytes untouched.
#[cfg(feature = "compression")]
fn try_flate_decode_with_predictor(data: &[u8], predictor: u8) -> Result<Vec<u8>, std::io::Error> {
    use flate2::read::ZlibDecoder;
    let mut decoder = ZlibDecoder::new(data);
    let inflated = read_to_end_limited(&mut decoder, MAX_DECOMPRESSED_SIZE)?;
    check_compression_ratio(data.len(), inflated.len())?;
    if (10..=15).contains(&predictor) {
        apply_png_predictor(&inflated, predictor)
    } else {
        Ok(inflated)
    }
}
/// Best-effort PNG predictor reversal when the true column count is unknown:
/// probe a list of common row widths and keep the first plausible result
/// (output between half and double the input size); otherwise return the
/// data unchanged.
#[cfg(feature = "compression")]
fn apply_png_predictor(data: &[u8], predictor: u8) -> Result<Vec<u8>, std::io::Error> {
    if data.is_empty() {
        return Ok(data.to_vec());
    }
    const CANDIDATE_WIDTHS: [usize; 12] = [1, 2, 3, 4, 8, 16, 24, 32, 48, 64, 96, 128];
    for width in CANDIDATE_WIDTHS {
        let Ok(decoded) = apply_png_predictor_with_width(data, predictor, width) else {
            continue;
        };
        if decoded.len() > data.len() / 2 && decoded.len() < data.len() * 2 {
            return Ok(decoded);
        }
    }
    Ok(data.to_vec())
}
#[cfg(feature = "compression")]
/// Reverse PNG row filtering assuming each row is `width` data bytes
/// preceded by one filter-type byte.
///
/// Only filter types 0 (None), 1 (Sub) and 2 (Up) are interpreted; any other
/// filter byte copies the row through unchanged — this is a lenient recovery
/// path for streams with missing/unknown DecodeParms, not a strict PNG
/// implementation.
fn apply_png_predictor_with_width(
    data: &[u8],
    _predictor: u8,
    width: usize,
) -> Result<Vec<u8>, std::io::Error> {
    use std::io::{Error, ErrorKind};
    if width == 0 || data.len() % (width + 1) != 0 {
        return Err(Error::new(ErrorKind::InvalidInput, "Invalid width"));
    }
    let row_len = width + 1;
    // One filter byte per row is dropped from the output.
    let mut result = Vec::with_capacity(data.len() - data.len() / row_len);
    for row_data in data.chunks_exact(row_len) {
        let filter_type = row_data[0];
        let row = &row_data[1..];
        match filter_type {
            0 => {
                // None: raw bytes.
                result.extend_from_slice(row);
            }
            1 => {
                // Sub — NOTE(review): kept byte-for-byte as before. The left
                // neighbour is only consulted when i >= width, which never
                // happens for rows of exactly `width` bytes, so rows pass
                // through unchanged (true PNG Sub would add recon[i - bpp]).
                // The in-file tests pin this behavior.
                result.push(row[0]);
                for i in 1..row.len() {
                    let prev = if i >= width {
                        result[result.len() - width]
                    } else {
                        0
                    };
                    result.push(row[i].wrapping_add(prev));
                }
            }
            2 => {
                // Up: add the byte from the previous row, same column.
                // `result` grows by one byte per push, so the byte directly
                // above the one being written is always at
                // `result.len() - width`. (The previous code added `+ i` to
                // the index, which walked diagonally across the prior row.)
                for &byte in row {
                    let up = if result.len() >= width {
                        result[result.len() - width]
                    } else {
                        0
                    };
                    result.push(byte.wrapping_add(up));
                }
            }
            _ => {
                // Unknown filter type: pass the row through unchanged.
                result.extend_from_slice(row);
            }
        }
    }
    Ok(result)
}
/// Fallback used when the `compression` feature is disabled: FlateDecode
/// cannot be performed, so decoding always fails with a descriptive error.
#[cfg(not(feature = "compression"))]
fn decode_flate(_data: &[u8]) -> ParseResult<Vec<u8>> {
    Err(ParseError::StreamDecodeError(
        "FlateDecode requires 'compression' feature".to_string(),
    ))
}
/// Decode an ASCIIHexDecode stream: pairs of hex digits, whitespace ignored,
/// terminated by `>`. A missing terminator is tolerated, and an odd final
/// digit is padded with a zero nibble per the PDF spec. Bytes after the `>`
/// terminator are ignored.
fn decode_ascii_hex(data: &[u8]) -> ParseResult<Vec<u8>> {
    let mut result = Vec::new();
    let mut chars = data.iter().filter(|&&b| !b.is_ascii_whitespace());
    loop {
        let high = match chars.next() {
            Some(&b'>') | None => break,
            Some(&ch) => ch,
        };
        // If the low nibble is missing (EOD marker or end of input) it is
        // treated as zero. Track termination separately: the previous code
        // compared `low == b'>'` AFTER substituting b'0', so the check never
        // fired and data following the `>` terminator kept being decoded.
        let (low, at_end) = match chars.next() {
            Some(&b'>') | None => (b'0', true),
            Some(&ch) => (ch, false),
        };
        let high_val = hex_digit_value(high).ok_or_else(|| {
            ParseError::StreamDecodeError(format!("Invalid hex digit: {}", high as char))
        })?;
        let low_val = hex_digit_value(low).ok_or_else(|| {
            ParseError::StreamDecodeError(format!("Invalid hex digit: {}", low as char))
        })?;
        result.push((high_val << 4) | low_val);
        if at_end {
            break;
        }
    }
    Ok(result)
}
/// Numeric value of a single ASCII hex digit (`0-9`, `A-F`, `a-f`), or
/// `None` for any other byte.
fn hex_digit_value(ch: u8) -> Option<u8> {
    if ch.is_ascii_digit() {
        Some(ch - b'0')
    } else if (b'A'..=b'F').contains(&ch) {
        Some(ch - b'A' + 10)
    } else if (b'a'..=b'f').contains(&ch) {
        Some(ch - b'a' + 10)
    } else {
        None
    }
}
/// Decode an ASCII85Decode stream: groups of five characters in `!`..=`u`
/// encode four bytes base-85; `z` is shorthand for four zero bytes; the
/// stream ends at `~>`. An optional leading `<~` header is skipped.
///
/// Group values are accumulated in `u64` and checked against `u32::MAX`:
/// the previous `u32` arithmetic overflowed on malformed groups such as
/// "uuuuu" (85^5 - 1 > u32::MAX), panicking in debug builds and silently
/// wrapping in release. Valid encoder output never exceeds `u32::MAX`, so
/// well-formed streams decode exactly as before.
fn decode_ascii85(data: &[u8]) -> ParseResult<Vec<u8>> {
    // Value of a (possibly 'u'-padded) 5-char group, rejecting overflow.
    fn group_value(group: &[u8]) -> ParseResult<u32> {
        let mut value: u64 = 0;
        for &ch in group {
            value = value * 85 + u64::from(ch - b'!');
        }
        u32::try_from(value)
            .map_err(|_| ParseError::StreamDecodeError("ASCII85 group value overflow".to_string()))
    }
    let mut result = Vec::new();
    let mut chars = data.iter().filter(|&&b| !b.is_ascii_whitespace());
    let mut group: Vec<u8> = Vec::with_capacity(5);
    // Skip an optional `<~` header; a lone `<` is passed on and rejected
    // below as an invalid character (as before).
    let mut ch = match chars.next() {
        Some(&b'<') => {
            if chars.next() == Some(&b'~') {
                chars.next()
            } else {
                Some(&b'<')
            }
        }
        other => other,
    };
    while let Some(&c) = ch {
        match c {
            b'~' => {
                if chars.next() == Some(&b'>') {
                    break;
                } else {
                    return Err(ParseError::StreamDecodeError(
                        "Invalid ASCII85 end marker".to_string(),
                    ));
                }
            }
            // 'z' abbreviates an all-zero group, only legal between groups.
            b'z' if group.is_empty() => {
                result.extend_from_slice(&[0, 0, 0, 0]);
            }
            b'!'..=b'u' => {
                group.push(c);
                if group.len() == 5 {
                    let value = group_value(&group)?;
                    result.extend_from_slice(&value.to_be_bytes());
                    group.clear();
                }
            }
            _ => {
                return Err(ParseError::StreamDecodeError(format!(
                    "Invalid ASCII85 character: {}",
                    c as char
                )));
            }
        }
        ch = chars.next();
    }
    if !group.is_empty() {
        // Partial final group: pad with 'u' (the max digit), decode, and
        // emit len - 1 bytes per the Adobe convention.
        let original_len = group.len();
        while group.len() < 5 {
            group.push(b'u');
        }
        let value = group_value(&group)?;
        let bytes = value.to_be_bytes();
        result.extend_from_slice(&bytes[..original_len - 1]);
    }
    Ok(result)
}
#[cfg(test)]
mod tests {
    //! Unit tests for the stream filters. Tests that need zlib encoding are
    //! gated on the `compression` feature so the module builds without it.
    use super::*;
    use crate::parser::objects::{PdfArray, PdfDictionary, PdfName, PdfObject};

    #[test]
    fn test_ascii_hex_decode() {
        let data = b"48656C6C6F>";
        let result = decode_ascii_hex(data).unwrap();
        assert_eq!(result, b"Hello");
        // Whitespace between digits is ignored.
        let data = b"48 65 6C 6C 6F>";
        let result = decode_ascii_hex(data).unwrap();
        assert_eq!(result, b"Hello");
        // An odd trailing digit is padded with zero: 0x6 -> 0x60 == '`'.
        let data = b"48656C6C6>";
        let result = decode_ascii_hex(data).unwrap();
        assert_eq!(result, b"Hell`");
    }

    #[test]
    fn test_ascii85_decode() {
        let data = b"87cURD]j7BEbo80~>";
        let result = decode_ascii85(data).unwrap();
        assert_eq!(result, b"Hello world!");
        // 'z' is shorthand for four zero bytes.
        let data = b"z~>";
        let result = decode_ascii85(data).unwrap();
        assert_eq!(result, &[0, 0, 0, 0]);
    }

    #[test]
    fn test_filter_from_name() {
        assert_eq!(
            Filter::from_name("ASCIIHexDecode"),
            Some(Filter::ASCIIHexDecode)
        );
        assert_eq!(
            Filter::from_name("ASCII85Decode"),
            Some(Filter::ASCII85Decode)
        );
        assert_eq!(Filter::from_name("LZWDecode"), Some(Filter::LZWDecode));
        assert_eq!(Filter::from_name("FlateDecode"), Some(Filter::FlateDecode));
        assert_eq!(
            Filter::from_name("RunLengthDecode"),
            Some(Filter::RunLengthDecode)
        );
        assert_eq!(
            Filter::from_name("CCITTFaxDecode"),
            Some(Filter::CCITTFaxDecode)
        );
        assert_eq!(Filter::from_name("JBIG2Decode"), Some(Filter::JBIG2Decode));
        assert_eq!(Filter::from_name("DCTDecode"), Some(Filter::DCTDecode));
        assert_eq!(Filter::from_name("JPXDecode"), Some(Filter::JPXDecode));
        assert_eq!(Filter::from_name("Crypt"), Some(Filter::Crypt));
        assert_eq!(Filter::from_name("UnknownFilter"), None);
    }

    #[test]
    fn test_filter_equality() {
        assert_eq!(Filter::ASCIIHexDecode, Filter::ASCIIHexDecode);
        assert_ne!(Filter::ASCIIHexDecode, Filter::ASCII85Decode);
        assert_ne!(Filter::FlateDecode, Filter::LZWDecode);
    }

    #[test]
    fn test_filter_clone() {
        let filter = Filter::FlateDecode;
        let cloned = filter.clone();
        assert_eq!(filter, cloned);
    }

    #[test]
    fn test_decode_stream_no_filter() {
        let data = b"Hello, world!";
        let dict = PdfDictionary::new();
        let result = decode_stream(data, &dict, &ParseOptions::default()).unwrap();
        assert_eq!(result, data);
    }

    #[test]
    fn test_decode_stream_single_filter() {
        let data = b"48656C6C6F>";
        let mut dict = PdfDictionary::new();
        dict.insert(
            "Filter".to_string(),
            PdfObject::Name(PdfName("ASCIIHexDecode".to_string())),
        );
        let result = decode_stream(data, &dict, &ParseOptions::default()).unwrap();
        assert_eq!(result, b"Hello");
    }

    #[test]
    fn test_decode_stream_invalid_filter() {
        let data = b"test data";
        let mut dict = PdfDictionary::new();
        dict.insert(
            "Filter".to_string(),
            PdfObject::Name(PdfName("UnknownFilter".to_string())),
        );
        let result = decode_stream(data, &dict, &ParseOptions::default());
        assert!(result.is_err());
    }

    #[test]
    fn test_decode_stream_filter_array() {
        let data = b"48656C6C6F>";
        let mut dict = PdfDictionary::new();
        let filters = vec![PdfObject::Name(PdfName("ASCIIHexDecode".to_string()))];
        dict.insert("Filter".to_string(), PdfObject::Array(PdfArray(filters)));
        let result = decode_stream(data, &dict, &ParseOptions::default()).unwrap();
        assert_eq!(result, b"Hello");
    }

    #[test]
    fn test_decode_stream_invalid_filter_type() {
        let data = b"test data";
        let mut dict = PdfDictionary::new();
        dict.insert("Filter".to_string(), PdfObject::Integer(42));
        let result = decode_stream(data, &dict, &ParseOptions::default());
        assert!(result.is_err());
    }

    #[test]
    fn test_ascii_hex_decode_empty() {
        let data = b">";
        let result = decode_ascii_hex(data).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn test_ascii_hex_decode_invalid() {
        let data = b"GG>";
        let result = decode_ascii_hex(data);
        assert!(result.is_err());
    }

    #[test]
    fn test_ascii_hex_decode_no_terminator() {
        // A missing '>' is tolerated; decoding stops at end of input.
        let data = b"48656C6C6F";
        let result = decode_ascii_hex(data).unwrap();
        assert_eq!(result, b"Hello");
    }

    #[test]
    fn test_ascii85_decode_empty() {
        let data = b"~>";
        let result = decode_ascii85(data).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn test_ascii85_decode_invalid() {
        let data = b"invalid~>";
        let result = decode_ascii85(data);
        assert!(result.is_err());
    }

    #[cfg(feature = "compression")]
    #[test]
    fn test_flate_decode() {
        use flate2::write::ZlibEncoder;
        use flate2::Compression;
        use std::io::Write;
        let original = b"Hello, compressed world!";
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(original).unwrap();
        let compressed = encoder.finish().unwrap();
        let result = decode_flate(&compressed).unwrap();
        assert_eq!(result, original);
    }

    #[cfg(not(feature = "compression"))]
    #[test]
    fn test_flate_decode_not_supported() {
        let data = b"compressed data";
        let result = decode_flate(data);
        assert!(result.is_err());
    }

    #[test]
    fn test_apply_filter() {
        let data = b"48656C6C6F>";
        let result = apply_filter(data, Filter::ASCIIHexDecode).unwrap();
        assert_eq!(result, b"Hello");
    }

    #[test]
    fn test_apply_filter_unsupported() {
        let data = b"test data";
        let unsupported_filters = vec![Filter::JPXDecode, Filter::Crypt];
        for filter in unsupported_filters {
            let result = apply_filter(data, filter);
            assert!(result.is_err());
        }
    }

    #[test]
    fn test_apply_filter_dct_decode() {
        let invalid_data = b"not jpeg data";
        let result = apply_filter(invalid_data, Filter::DCTDecode);
        assert!(result.is_err());
        // A minimal SOI/EOI marker pair decodes to itself.
        let valid_jpeg = vec![0xFF, 0xD8, 0xFF, 0xD9];
        let result = apply_filter(&valid_jpeg, Filter::DCTDecode);
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), valid_jpeg);
    }

    #[test]
    fn test_apply_filter_with_params_no_predictor() {
        let data = b"48656C6C6F>";
        let dict = PdfDictionary::new();
        let result = apply_filter_with_params(data, Filter::ASCIIHexDecode, Some(&dict)).unwrap();
        assert_eq!(result, b"Hello");
    }

    #[test]
    fn test_apply_predictor_none() {
        let data = vec![1, 2, 3, 4];
        let dict = PdfDictionary::new();
        let result = apply_predictor(&data, 1, &dict).unwrap();
        assert_eq!(result, data);
    }

    #[test]
    fn test_apply_predictor_unknown() {
        // Unknown predictor values pass the data through unchanged.
        let data = vec![1, 2, 3, 4];
        let dict = PdfDictionary::new();
        let result = apply_predictor(&data, 99, &dict).unwrap();
        assert_eq!(result, data);
    }

    #[test]
    fn test_png_predictor_sub_filter() {
        let data = vec![1, 5, 10];
        let result = apply_png_sub_filter(&data, 1);
        assert_eq!(result, vec![1, 6, 16]);
    }

    #[test]
    fn test_png_predictor_up_filter() {
        let data = vec![1, 2, 3];
        let prev_row = vec![5, 10, 15];
        let result = apply_png_up_filter(&data, Some(&prev_row));
        assert_eq!(result, vec![6, 12, 18]);
    }

    #[test]
    fn test_png_predictor_up_filter_no_prev() {
        // Without a previous row, Up behaves like None.
        let data = vec![1, 2, 3];
        let result = apply_png_up_filter(&data, None);
        assert_eq!(result, vec![1, 2, 3]);
    }

    #[test]
    fn test_png_predictor_average_filter() {
        let data = vec![2, 4];
        let prev_row = vec![6, 8];
        let result = apply_png_average_filter(&data, Some(&prev_row), 1);
        assert_eq!(result, vec![5, 10]);
    }

    #[test]
    fn test_png_predictor_paeth_filter() {
        let data = vec![1, 2];
        let prev_row = vec![3, 4];
        let result = apply_png_paeth_filter(&data, Some(&prev_row), 1);
        assert_eq!(result.len(), 2);
    }

    #[test]
    fn test_paeth_predictor_algorithm() {
        assert_eq!(paeth_predictor(1, 2, 0), 2);
        assert_eq!(paeth_predictor(5, 2, 3), 5);
        assert_eq!(paeth_predictor(5, 8, 3), 8);
    }

    #[test]
    fn test_apply_png_predictor_invalid_data() {
        // 6 bytes cannot be split into rows of width 3 + 1 filter byte.
        let data = vec![0, 1, 2, 3, 4, 5];
        let result = apply_png_predictor_with_width(&data, 10, 3);
        assert!(result.is_err());
    }

    #[test]
    fn test_apply_png_predictor_valid_simple() {
        // Two rows of width 2, both using PNG filter type 0 (None).
        let data = vec![0, 1, 2, 0, 3, 4];
        let result = apply_png_predictor_with_width(&data, 10, 2).unwrap();
        assert_eq!(result, vec![1, 2, 3, 4]);
    }

    #[test]
    fn test_apply_png_predictor_with_sub_filter() {
        // One row of width 3 using PNG filter type 1 (Sub).
        let data = vec![1, 1, 2, 3];
        let result = apply_png_predictor_with_width(&data, 10, 3).unwrap();
        assert_eq!(result, vec![1, 2, 3]);
    }

    #[test]
    fn test_apply_png_predictor_invalid_filter_type() {
        // Filter byte 5 is outside the PNG range; the implementation may
        // either reject it or pass the row through — accept both.
        let data = vec![5, 1, 2];
        let result = apply_png_predictor_with_width(&data, 10, 2);
        if result.is_err() {
            let error_msg = result.unwrap_err().to_string();
            assert!(
                error_msg.contains("filter")
                    || error_msg.contains("predictor")
                    || error_msg.contains("Invalid")
            );
        } else {
            let _decoded_data = result.unwrap();
        }
    }

    #[test]
    fn test_get_filter_params_dict() {
        let mut dict = PdfDictionary::new();
        dict.insert("Predictor".to_string(), PdfObject::Integer(12));
        let obj = PdfObject::Dictionary(dict);
        let result = get_filter_params(Some(&obj), 0);
        assert!(result.is_some());
        assert_eq!(
            result.unwrap().get("Predictor"),
            Some(&PdfObject::Integer(12))
        );
    }

    #[test]
    fn test_get_filter_params_array() {
        let mut inner_dict = PdfDictionary::new();
        inner_dict.insert("Predictor".to_string(), PdfObject::Integer(15));
        let array = vec![PdfObject::Dictionary(inner_dict)];
        let obj = PdfObject::Array(crate::parser::objects::PdfArray(array));
        let result = get_filter_params(Some(&obj), 0);
        assert!(result.is_some());
        assert_eq!(
            result.unwrap().get("Predictor"),
            Some(&PdfObject::Integer(15))
        );
    }

    #[test]
    fn test_get_filter_params_none() {
        let result = get_filter_params(None, 0);
        assert!(result.is_none());
    }

    /// End-to-end check of Flate + PNG predictor, as used by compressed
    /// xref streams. The flate2 imports live inside the feature-gated block
    /// so this test still compiles when the optional `compression` feature
    /// (and the flate2 dependency) is disabled — previously they sat at the
    /// top of the function and broke the no-feature build.
    #[test]
    fn test_compressed_xref_integration() {
        #[cfg(feature = "compression")]
        {
            use flate2::write::ZlibEncoder;
            use flate2::Compression;
            use std::io::Write;
            // Two rows of width 2, PNG filter type 0 on each row.
            let original_data = vec![0, 1, 2, 0, 3, 4];
            let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
            encoder.write_all(&original_data).unwrap();
            let compressed = encoder.finish().unwrap();
            let mut decode_params = PdfDictionary::new();
            decode_params.insert("Predictor".to_string(), PdfObject::Integer(12));
            decode_params.insert("Columns".to_string(), PdfObject::Integer(2));
            decode_params.insert("BitsPerComponent".to_string(), PdfObject::Integer(8));
            decode_params.insert("Colors".to_string(), PdfObject::Integer(1));
            let result =
                apply_filter_with_params(&compressed, Filter::FlateDecode, Some(&decode_params))
                    .unwrap();
            assert_eq!(result, vec![1, 2, 3, 4]);
        }
    }

    /// Pack LZW codes MSB-first, bumping the code width after the boundary
    /// codes 511/1023/2047 (early-change encoding) to match `decode_lzw`.
    fn encode_lzw_test_data(codes: &[u16]) -> Vec<u8> {
        let mut result = Vec::new();
        let mut bit_buffer = 0u32;
        let mut bits_in_buffer = 0;
        let mut code_size = 9;
        for &code in codes {
            bit_buffer = (bit_buffer << code_size) | (code as u32);
            bits_in_buffer += code_size;
            while bits_in_buffer >= 8 {
                let byte = ((bit_buffer >> (bits_in_buffer - 8)) & 0xFF) as u8;
                result.push(byte);
                bits_in_buffer -= 8;
            }
            if code == 511 && code_size == 9 {
                code_size = 10;
            } else if code == 1023 && code_size == 10 {
                code_size = 11;
            } else if code == 2047 && code_size == 11 {
                code_size = 12;
            }
        }
        if bits_in_buffer > 0 {
            let byte = ((bit_buffer << (8 - bits_in_buffer)) & 0xFF) as u8;
            result.push(byte);
        }
        result
    }

    #[test]
    fn test_lzw_decode_simple() {
        // 65..67 are literal bytes; 257 is EOD.
        let codes = vec![65, 66, 67, 257];
        let data = encode_lzw_test_data(&codes);
        let result = decode_lzw(&data, None).unwrap();
        assert_eq!(result, b"ABC");
    }

    #[test]
    fn test_lzw_decode_with_repetition() {
        // Code 258 references the "AA" table entry built by the first codes.
        let codes = vec![65, 65, 258, 257];
        let data = encode_lzw_test_data(&codes);
        let result = decode_lzw(&data, None).unwrap();
        assert_eq!(result, b"AAAA");
    }

    #[test]
    fn test_lzw_decode_clear_code() {
        // 256 resets the code table mid-stream.
        let codes = vec![65, 66, 256, 67, 68, 257];
        let data = encode_lzw_test_data(&codes);
        let result = decode_lzw(&data, None).unwrap();
        assert_eq!(result, b"ABCD");
    }

    #[test]
    fn test_lzw_decode_growing_codes() {
        let mut params = PdfDictionary::new();
        params.insert("EarlyChange".to_string(), PdfObject::Integer(1));
        let data = vec![0x08, 0x21, 0x08, 0x61, 0x08, 0x20, 0x80];
        let result = decode_lzw(&data, Some(&params));
        assert!(result.is_ok());
    }

    #[test]
    fn test_lzw_decode_early_change_false() {
        let mut params = PdfDictionary::new();
        params.insert("EarlyChange".to_string(), PdfObject::Integer(0));
        let codes = vec![65, 66, 67, 257];
        let data = encode_lzw_test_data(&codes);
        let result = decode_lzw(&data, Some(&params)).unwrap();
        assert_eq!(result, b"ABC");
    }

    #[test]
    fn test_lzw_decode_invalid_code() {
        let data = vec![0x08, 0x21, 0xFF, 0xFF, 0x00];
        let result = decode_lzw(&data, None);
        assert!(result.is_err());
    }

    #[test]
    fn test_lzw_decode_empty() {
        // An immediate EOD code produces no output.
        let codes = vec![257];
        let data = encode_lzw_test_data(&codes);
        let result = decode_lzw(&data, None).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn test_lzw_bit_reader() {
        let data = vec![0b10101010, 0b11001100, 0b11110000];
        let mut reader = LzwBitReader::new(&data);
        assert_eq!(reader.read_bits(4), Some(0b1010));
        assert_eq!(reader.read_bits(8), Some(0b10101100));
        assert_eq!(reader.read_bits(6), Some(0b110011));
        assert_eq!(reader.read_bits(6), Some(0b110000));
        assert_eq!(reader.read_bits(8), None);
    }

    #[test]
    fn test_lzw_bit_reader_edge_cases() {
        let data = vec![0xFF];
        let mut reader = LzwBitReader::new(&data);
        assert_eq!(reader.read_bits(0), None);
        assert_eq!(reader.read_bits(17), None);
        assert_eq!(reader.read_bits(8), Some(0xFF));
        assert_eq!(reader.read_bits(1), None);
    }

    #[test]
    fn test_apply_filter_lzw() {
        let codes = vec![65, 66, 67, 257];
        let data = encode_lzw_test_data(&codes);
        let result = apply_filter(&data, Filter::LZWDecode).unwrap();
        assert_eq!(result, b"ABC");
    }

    #[test]
    fn test_apply_filter_with_params_lzw() {
        let mut params = PdfDictionary::new();
        params.insert("EarlyChange".to_string(), PdfObject::Integer(0));
        let codes = vec![65, 66, 67, 257];
        let data = encode_lzw_test_data(&codes);
        let result = apply_filter_with_params(&data, Filter::LZWDecode, Some(&params)).unwrap();
        assert_eq!(result, b"ABC");
    }

    #[test]
    fn test_run_length_decode_literal() {
        // Length byte 2 means "copy the next 3 bytes literally".
        let data = vec![2, b'A', b'B', b'C'];
        let result = decode_run_length(&data).unwrap();
        assert_eq!(result, b"ABC");
    }

    #[test]
    fn test_run_length_decode_repeat() {
        // 253 means "repeat the next byte 257 - 253 = 4 times".
        let data = vec![253u8, b'X'];
        let result = decode_run_length(&data).unwrap();
        assert_eq!(result, b"XXXX");
    }

    #[test]
    fn test_run_length_decode_mixed() {
        let data = vec![1, b'A', b'B', 254u8, b'C', 1, b'D', b'E'];
        let result = decode_run_length(&data).unwrap();
        assert_eq!(result, b"ABCCCDE");
    }

    #[test]
    fn test_run_length_decode_eod() {
        // 128 is EOD: everything after it is ignored.
        let data = vec![0, b'A', 128u8, 1, b'B', b'C'];
        let result = decode_run_length(&data).unwrap();
        assert_eq!(result, b"A");
    }

    #[test]
    fn test_run_length_decode_empty() {
        let data = vec![];
        let result = decode_run_length(&data).unwrap();
        assert!(result.is_empty());
    }

    #[test]
    fn test_run_length_decode_single_literal() {
        let data = vec![0, b'Z'];
        let result = decode_run_length(&data).unwrap();
        assert_eq!(result, b"Z");
    }

    #[test]
    fn test_run_length_decode_single_repeat() {
        let data = vec![255u8, b'Y'];
        let result = decode_run_length(&data).unwrap();
        assert_eq!(result, b"YY");
    }

    #[test]
    fn test_run_length_decode_max_repeat() {
        // 129 is the longest run: 257 - 129 = 128 copies.
        let data = vec![129u8, b'M'];
        let result = decode_run_length(&data).unwrap();
        assert_eq!(result.len(), 128);
        assert!(result.iter().all(|&b| b == b'M'));
    }

    #[test]
    fn test_run_length_decode_max_literal() {
        let mut data = vec![127];
        data.extend((0..128).map(|i| i as u8));
        let result = decode_run_length(&data).unwrap();
        assert_eq!(result.len(), 128);
        assert_eq!(result, (0..128).map(|i| i as u8).collect::<Vec<u8>>());
    }

    #[test]
    fn test_run_length_decode_error_literal_overflow() {
        // Header promises 6 literal bytes but only 2 follow.
        let data = vec![5, b'A', b'B'];
        let result = decode_run_length(&data);
        assert!(result.is_err());
    }

    #[test]
    fn test_run_length_decode_error_missing_repeat_byte() {
        let data = vec![254u8];
        let result = decode_run_length(&data);
        assert!(result.is_err());
    }

    #[test]
    fn test_apply_filter_run_length() {
        let data = vec![2, b'X', b'Y', b'Z'];
        let result = apply_filter(&data, Filter::RunLengthDecode).unwrap();
        assert_eq!(result, b"XYZ");
    }

    #[test]
    fn test_apply_filter_with_params_run_length() {
        let data = vec![254u8, b'A', 1, b'B', b'C'];
        let result = apply_filter_with_params(&data, Filter::RunLengthDecode, None).unwrap();
        assert_eq!(result, b"AAABC");
    }

    #[test]
    fn test_read_to_end_limited_within_limit() {
        let data = vec![42u8; 1000];
        let mut cursor = std::io::Cursor::new(&data);
        let result = read_to_end_limited(&mut cursor, 2000).unwrap();
        assert_eq!(result.len(), 1000);
    }

    #[test]
    fn test_read_to_end_limited_at_exact_limit() {
        let data = vec![42u8; 1000];
        let mut cursor = std::io::Cursor::new(&data);
        let result = read_to_end_limited(&mut cursor, 1000).unwrap();
        assert_eq!(result.len(), 1000);
    }

    #[test]
    fn test_read_to_end_limited_exceeds_limit() {
        let data = vec![42u8; 2000];
        let mut cursor = std::io::Cursor::new(&data);
        let result = read_to_end_limited(&mut cursor, 1000);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(
            err.to_string().contains("exceeds limit"),
            "Expected decompression limit error, got: {}",
            err
        );
    }

    #[test]
    fn test_check_compression_ratio_normal() {
        assert!(check_compression_ratio(100, 1000).is_ok());
    }

    #[test]
    fn test_check_compression_ratio_high() {
        assert!(check_compression_ratio(1, 1001).is_err());
    }

    #[test]
    fn test_check_compression_ratio_zero_input() {
        assert!(check_compression_ratio(0, 1000).is_ok());
    }

    #[cfg(feature = "compression")]
    #[test]
    fn test_flate_normal_data_succeeds() {
        use flate2::write::ZlibEncoder;
        use flate2::Compression;
        use std::io::Write;
        // Varied bytes keep the compression ratio well under the limit.
        let mut original = Vec::with_capacity(100_000);
        for i in 0..100_000u32 {
            original.push((i % 256) as u8);
        }
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::default());
        encoder.write_all(&original).unwrap();
        let compressed = encoder.finish().unwrap();
        let result = try_standard_zlib_decode(&compressed);
        assert!(result.is_ok());
        assert_eq!(result.unwrap().len(), 100_000);
    }

    #[cfg(feature = "compression")]
    #[test]
    fn test_flate_high_ratio_rejected() {
        use flate2::write::ZlibEncoder;
        use flate2::Compression;
        use std::io::Write;
        // 2 MB of zeros compresses to a bomb-like expansion ratio.
        let original = vec![0u8; 2 * 1024 * 1024];
        let mut encoder = ZlibEncoder::new(Vec::new(), Compression::best());
        encoder.write_all(&original).unwrap();
        let compressed = encoder.finish().unwrap();
        let result = try_standard_zlib_decode(&compressed);
        assert!(result.is_err(), "High compression ratio should be rejected");
        let err = result.unwrap_err();
        assert!(
            err.to_string().contains("compression ratio")
                || err.to_string().contains("exceeds limit"),
            "Expected compression ratio error, got: {}",
            err
        );
    }

    #[cfg(feature = "compression")]
    #[test]
    fn test_flate_compression_ratio_check() {
        let result = check_compression_ratio(10, 10_010);
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("Suspicious compression ratio"));
    }

    #[test]
    fn test_read_to_end_limited_empty_input() {
        let data: Vec<u8> = Vec::new();
        let mut cursor = std::io::Cursor::new(&data);
        let result = read_to_end_limited(&mut cursor, 1000).unwrap();
        assert!(result.is_empty());
    }
}
/// Apply a single `filter` to `data`, honoring an optional `/DecodeParms`
/// dictionary (predictor settings, LZW `EarlyChange`, CCITT/JBIG2 params).
///
/// After the filter runs, a `/Predictor` entry (if present) triggers a
/// post-processing pass; predictor failures are deliberately swallowed and
/// the un-predicted bytes returned, keeping decoding best-effort.
pub(crate) fn apply_filter_with_params(
    data: &[u8],
    filter: Filter,
    params: Option<&PdfDictionary>,
) -> ParseResult<Vec<u8>> {
    let result = match filter {
        Filter::FlateDecode => {
            if let Some(decode_params) = params {
                if decode_params
                    .get("Predictor")
                    .and_then(|p| p.as_integer())
                    .is_some()
                {
                    // Predictor present: use only the strict zlib decode
                    // (skipping decode_flate's recovery cascade, which could
                    // mangle predictor-filtered rows); on failure fall
                    // through with the raw bytes so the predictor stage
                    // below still runs. NOTE(review): try_standard_zlib_decode
                    // is only compiled with the "compression" feature —
                    // confirm this path is gated or unreachable in
                    // no-compression builds.
                    match try_standard_zlib_decode(data) {
                        Ok(decoded) => decoded,
                        Err(_) => {
                            data.to_vec()
                        }
                    }
                } else {
                    decode_flate(data)?
                }
            } else {
                decode_flate(data)?
            }
        }
        Filter::ASCIIHexDecode => decode_ascii_hex(data)?,
        Filter::ASCII85Decode => decode_ascii85(data)?,
        Filter::LZWDecode => decode_lzw(data, params)?,
        Filter::RunLengthDecode => decode_run_length(data)?,
        Filter::CCITTFaxDecode => decode_ccitt(data, params)?,
        Filter::JBIG2Decode => decode_jbig2(data, params)?,
        Filter::DCTDecode => decode_dct(data)?,
        // JPXDecode and Crypt have no decoder yet.
        _ => {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: format!("Filter {filter:?} not yet implemented"),
            });
        }
    };
    if let Some(params_dict) = params {
        if let Some(predictor_obj) = params_dict.get("Predictor") {
            if let Some(predictor) = predictor_obj.as_integer() {
                match apply_predictor(&result, predictor as u32, params_dict) {
                    Ok(predictor_result) => return Ok(predictor_result),
                    Err(_) => {
                        // Best-effort: a failed predictor pass falls back to
                        // the already-decoded bytes instead of erroring.
                        return Ok(result);
                    }
                }
            }
        }
    }
    Ok(result)
}
/// Select the decode-parameter dictionary for the `index`-th filter in a
/// chain. `/DecodeParms` may be a single dictionary or an array parallel to
/// the `/Filter` array; non-dictionary entries (e.g. `null`) yield `None`.
fn get_filter_params(decode_params: Option<&PdfObject>, index: usize) -> Option<&PdfDictionary> {
    match decode_params {
        Some(PdfObject::Dictionary(dict)) => Some(dict),
        // Index into the array per filter position. Previously the index was
        // ignored and `.first()` was returned, so every filter in a
        // multi-filter chain received the params of filter 0.
        Some(PdfObject::Array(array)) => array.0.get(index).and_then(|obj| obj.as_dict()),
        _ => None,
    }
}
/// Dispatches on the /Predictor value: 1 means "no prediction" (identity),
/// 10-15 select the PNG filter family. Any other value (including TIFF
/// predictor 2, which is not implemented here) passes the data through
/// unchanged rather than failing the stream.
fn apply_predictor(data: &[u8], predictor: u32, params: &PdfDictionary) -> ParseResult<Vec<u8>> {
    if predictor == 1 {
        return Ok(data.to_vec());
    }
    if (10..=15).contains(&predictor) {
        return apply_png_predictor_advanced(data, predictor, params);
    }
    #[cfg(debug_assertions)]
    tracing::debug!("Warning: Unknown predictor {predictor}, returning data as-is");
    Ok(data.to_vec())
}
/// Reverses the per-row PNG filters (None/Sub/Up/Average/Paeth) that a PNG
/// predictor (10-15) applied before compression. Each encoded row is one
/// filter-type byte followed by the filtered row bytes.
///
/// # Errors
/// `ParseError::StreamDecodeError` if the data length is not a whole number
/// of rows, or a row carries an unknown filter-type byte.
fn apply_png_predictor_advanced(
    data: &[u8],
    _predictor: u32,
    params: &PdfDictionary,
) -> ParseResult<Vec<u8>> {
    let columns = params
        .get("Columns")
        .and_then(|obj| obj.as_integer())
        .unwrap_or(1) as usize;
    let bpc = params
        .get("BitsPerComponent")
        .and_then(|obj| obj.as_integer())
        .unwrap_or(8) as usize;
    let colors = params
        .get("Colors")
        .and_then(|obj| obj.as_integer())
        .unwrap_or(1) as usize;
    // Distance in whole bytes between corresponding bytes of adjacent
    // pixels; clamped to at least 1 so degenerate params (bpc = 0) cannot
    // panic inside the row filters.
    let bytes_per_pixel = (bpc * colors).div_ceil(8).max(1);
    // Row length in bytes: Columns samples, each with `colors` components of
    // `bpc` bits (PDF 32000-1, LZW/Flate predictor parameters). The previous
    // code used `columns` directly as the byte count, which is only correct
    // for the default 8-bit single-component case.
    let row_bytes = (columns * colors * bpc).div_ceil(8);
    let row_size = row_bytes + 1; // +1 for the leading filter-type byte
    if data.len() % row_size != 0 {
        return Err(ParseError::StreamDecodeError(
            "PNG predictor: data length not multiple of row size".to_string(),
        ));
    }
    let num_rows = data.len() / row_size;
    let mut result = Vec::with_capacity(row_bytes * num_rows);
    for row in 0..num_rows {
        let row_start = row * row_size;
        let predictor_byte = data[row_start];
        let row_data = &data[row_start + 1..row_start + row_size];
        // Previous decoded row, needed by Up/Average/Paeth; None on row 0.
        let prev_row = if row > 0 {
            Some(&result[(row - 1) * row_bytes..row * row_bytes])
        } else {
            None
        };
        let filtered_row = match predictor_byte {
            0 => row_data.to_vec(), // None: row stored verbatim
            1 => apply_png_sub_filter(row_data, bytes_per_pixel),
            2 => apply_png_up_filter(row_data, prev_row),
            3 => apply_png_average_filter(row_data, prev_row, bytes_per_pixel),
            4 => apply_png_paeth_filter(row_data, prev_row, bytes_per_pixel),
            _ => {
                return Err(ParseError::StreamDecodeError(format!(
                    "PNG predictor: unknown filter type {predictor_byte}"
                )));
            }
        };
        result.extend_from_slice(&filtered_row);
    }
    Ok(result)
}
/// Reverses the PNG "Sub" filter: each byte was stored as the difference
/// from the byte one pixel to its left; bytes within the first pixel have no
/// left neighbour and are stored raw.
fn apply_png_sub_filter(data: &[u8], bytes_per_pixel: usize) -> Vec<u8> {
    let mut decoded = Vec::with_capacity(data.len());
    for (idx, &raw) in data.iter().enumerate() {
        let left = if idx >= bytes_per_pixel {
            decoded[idx - bytes_per_pixel]
        } else {
            0
        };
        decoded.push(raw.wrapping_add(left));
    }
    decoded
}
/// Reverses the PNG "Up" filter: each byte was stored as the difference from
/// the byte directly above it. A missing previous row (first row, or a short
/// row) contributes 0.
fn apply_png_up_filter(data: &[u8], prev_row: Option<&[u8]>) -> Vec<u8> {
    data.iter()
        .enumerate()
        .map(|(idx, &raw)| {
            let above = prev_row.and_then(|row| row.get(idx)).copied().unwrap_or(0);
            raw.wrapping_add(above)
        })
        .collect()
}
/// Reverses the PNG "Average" filter: each byte was stored as the difference
/// from the floor-average of its left neighbour (one pixel back in this
/// decoded row) and the byte above it. Missing neighbours count as 0.
fn apply_png_average_filter(
    data: &[u8],
    prev_row: Option<&[u8]>,
    bytes_per_pixel: usize,
) -> Vec<u8> {
    let mut decoded = Vec::with_capacity(data.len());
    for (idx, &raw) in data.iter().enumerate() {
        let left = if idx >= bytes_per_pixel {
            decoded[idx - bytes_per_pixel]
        } else {
            0
        };
        let above = prev_row.and_then(|row| row.get(idx)).copied().unwrap_or(0);
        // Widen to u16 so the sum cannot overflow before halving.
        let predicted = ((left as u16 + above as u16) / 2) as u8;
        decoded.push(raw.wrapping_add(predicted));
    }
    decoded
}
/// Reverses the PNG "Paeth" filter: each byte was stored as the difference
/// from the Paeth predictor of its left, above, and above-left neighbours.
/// Missing neighbours (first pixel / first row) count as 0.
fn apply_png_paeth_filter(data: &[u8], prev_row: Option<&[u8]>, bytes_per_pixel: usize) -> Vec<u8> {
    let mut decoded = Vec::with_capacity(data.len());
    for (idx, &raw) in data.iter().enumerate() {
        let left = if idx >= bytes_per_pixel {
            decoded[idx - bytes_per_pixel]
        } else {
            0
        };
        let above = prev_row.and_then(|row| row.get(idx)).copied().unwrap_or(0);
        let above_left = if idx >= bytes_per_pixel {
            prev_row
                .and_then(|row| row.get(idx - bytes_per_pixel))
                .copied()
                .unwrap_or(0)
        } else {
            0
        };
        let predicted = paeth_predictor(left, above, above_left);
        decoded.push(raw.wrapping_add(predicted));
    }
    decoded
}
/// Paeth predictor (PNG spec, filter type 4): returns whichever of `left`,
/// `up`, `up_left` is closest to `left + up - up_left`, breaking ties in
/// favour of `left`, then `up`.
fn paeth_predictor(left: u8, up: u8, up_left: u8) -> u8 {
    // i16 arithmetic: the estimate can range over [-255, 510].
    let estimate = i16::from(left) + i16::from(up) - i16::from(up_left);
    let dist_left = (estimate - i16::from(left)).abs();
    let dist_up = (estimate - i16::from(up)).abs();
    let dist_up_left = (estimate - i16::from(up_left)).abs();
    if dist_left <= dist_up && dist_left <= dist_up_left {
        left
    } else if dist_up <= dist_up_left {
        up
    } else {
        up_left
    }
}
/// Decodes a PDF LZWDecode stream: variable-width codes (9-12 bits, packed
/// MSB-first), clear code 256, end-of-data code 257, new entries from 258.
///
/// `EarlyChange` (default 1 per the PDF spec) selects whether the code width
/// grows one code earlier than strictly necessary, matching the TIFF/PDF
/// convention.
///
/// # Errors
/// `ParseError::StreamDecodeError` on an out-of-range code or when the
/// output exceeds `MAX_DECOMPRESSED_SIZE`.
fn decode_lzw(data: &[u8], params: Option<&PdfDictionary>) -> ParseResult<Vec<u8>> {
    // Any non-zero /EarlyChange (or its absence) enables early change.
    let early_change = params
        .and_then(|p| p.get("EarlyChange"))
        .and_then(|v| v.as_integer())
        .map(|v| v != 0)
        .unwrap_or(true);
    const MIN_BITS: u32 = 9;
    const MAX_BITS: u32 = 12;
    const CLEAR_CODE: u16 = 256;
    const EOD_CODE: u16 = 257;
    #[allow(dead_code)]
    const FIRST_CODE: u16 = 258;
    // Entries 0-255 are the single-byte strings; 256/257 are placeholders
    // for the clear/EOD codes so table indices line up with code values.
    let mut dictionary: Vec<Vec<u8>> = Vec::with_capacity(4096);
    for i in 0..=255 {
        dictionary.push(vec![i]);
    }
    dictionary.push(vec![]); dictionary.push(vec![]); // slots for CLEAR_CODE / EOD_CODE
    let mut result = Vec::new();
    let mut bit_reader = LzwBitReader::new(data);
    let mut code_size = MIN_BITS;
    let mut prev_code: Option<u16> = None;
    while let Some(c) = bit_reader.read_bits(code_size) {
        let code = c as u16;
        if code == EOD_CODE {
            break;
        }
        if code == CLEAR_CODE {
            // Reset: drop all learned entries and return to 9-bit codes.
            dictionary.truncate(258);
            code_size = MIN_BITS;
            prev_code = None;
            continue;
        }
        if let Some(prev) = prev_code {
            let string = if (code as usize) < dictionary.len() {
                dictionary[code as usize].clone()
            } else if code as usize == dictionary.len() {
                // KwKwK special case: the code being decoded is the entry
                // about to be defined — previous string plus its first byte.
                let mut s = dictionary[prev as usize].clone();
                s.push(dictionary[prev as usize][0]);
                s
            } else {
                return Err(ParseError::StreamDecodeError(format!(
                    "LZW decode error: invalid code {code}"
                )));
            };
            result.extend_from_slice(&string);
            // Decompression-bomb guard.
            if result.len() > MAX_DECOMPRESSED_SIZE {
                return Err(ParseError::StreamDecodeError(format!(
                    "LZW decompressed size exceeds {} MB limit",
                    MAX_DECOMPRESSED_SIZE / (1024 * 1024)
                )));
            }
            if dictionary.len() < 4096 {
                // Learn: previous string + first byte of the current one.
                let mut new_entry = dictionary[prev as usize].clone();
                new_entry.push(string[0]);
                dictionary.push(new_entry);
                let dict_size = dictionary.len();
                // NOTE(review): code-width growth is keyed off the table size
                // *after* the insert; some decoders switch at
                // `(1 << code_size) - 1` for the early-change case — confirm
                // these thresholds against reference LZW streams.
                let threshold = if early_change {
                    1 << code_size
                } else {
                    (1 << code_size) + 1
                };
                if dict_size >= threshold as usize && code_size < MAX_BITS {
                    code_size += 1;
                }
            }
        } else {
            // First code after start/clear must be a literal byte entry.
            if (code as usize) < dictionary.len() {
                result.extend_from_slice(&dictionary[code as usize]);
            } else {
                return Err(ParseError::StreamDecodeError(format!(
                    "LZW decode error: invalid first code {code}"
                )));
            }
        }
        prev_code = Some(code);
    }
    Ok(result)
}
/// MSB-first bit reader over an LZW-encoded byte slice.
struct LzwBitReader<'a> {
    data: &'a [u8],
    byte_pos: usize,
    bit_pos: u8,
}
impl<'a> LzwBitReader<'a> {
    /// Creates a reader positioned at the first (most significant) bit.
    fn new(data: &'a [u8]) -> Self {
        LzwBitReader {
            data,
            byte_pos: 0,
            bit_pos: 0,
        }
    }
    /// Reads `n` bits (1-16) MSB-first into the low bits of the result.
    /// Returns `None` for an out-of-range `n` or when the input runs out
    /// mid-read.
    fn read_bits(&mut self, n: u32) -> Option<u32> {
        if !(1..=16).contains(&n) {
            return None;
        }
        let mut acc = 0u32;
        let mut remaining = n;
        while remaining > 0 {
            let byte = *self.data.get(self.byte_pos)?;
            let avail = 8 - u32::from(self.bit_pos);
            let take = remaining.min(avail);
            // Extract `take` bits starting at the current bit position.
            let chunk = (u32::from(byte) >> (avail - take)) & ((1 << take) - 1);
            acc = (acc << take) | chunk;
            remaining -= take;
            self.bit_pos += take as u8;
            if self.bit_pos == 8 {
                self.bit_pos = 0;
                self.byte_pos += 1;
            }
        }
        Some(acc)
    }
}
/// Decodes a RunLengthDecode stream (PDF / TIFF PackBits scheme).
///
/// Each run starts with a control byte `b` interpreted as i8:
/// * `-128` — end-of-data marker;
/// * `0..=127` — copy the next `b + 1` bytes literally;
/// * `-127..=-1` — repeat the next byte `-b + 1` times.
///
/// # Errors
/// `ParseError::StreamDecodeError` on truncated input or when the output
/// exceeds `MAX_DECOMPRESSED_SIZE`.
fn decode_run_length(data: &[u8]) -> ParseResult<Vec<u8>> {
    let mut output = Vec::new();
    let mut pos = 0;
    while pos < data.len() {
        let control = data[pos] as i8;
        pos += 1;
        match control {
            -128 => break, // EOD marker
            0..=127 => {
                let run = control as usize + 1;
                let end = pos + run;
                if end > data.len() {
                    return Err(ParseError::StreamDecodeError(
                        "RunLength decode error: insufficient data for literal copy".to_string(),
                    ));
                }
                output.extend_from_slice(&data[pos..end]);
                pos = end;
            }
            _ => {
                // Repeated run: the control byte is followed by one byte to
                // replicate. `-control` cannot overflow since -128 broke out.
                match data.get(pos) {
                    Some(&value) => {
                        let run = (-control) as usize + 1;
                        output.resize(output.len() + run, value);
                        pos += 1;
                    }
                    None => {
                        return Err(ParseError::StreamDecodeError(
                            "RunLength decode error: missing byte to repeat".to_string(),
                        ));
                    }
                }
            }
        }
        // Decompression-bomb guard, checked after every run.
        if output.len() > MAX_DECOMPRESSED_SIZE {
            return Err(ParseError::StreamDecodeError(format!(
                "RunLength decompressed size exceeds {} MB limit",
                MAX_DECOMPRESSED_SIZE / (1024 * 1024)
            )));
        }
    }
    Ok(output)
}