use super::header::PdfHeader;
use super::object_stream::ObjectStream;
use super::objects::{PdfDictionary, PdfObject};
use super::stack_safe::StackSafeContext;
use super::trailer::PdfTrailer;
use super::xref::XRefTable;
use super::{ParseError, ParseOptions, ParseResult};
use crate::memory::{LruCache, MemoryOptions, MemoryStats};
use crate::objects::ObjectId;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Read, Seek, SeekFrom};
use std::path::Path;
use std::sync::Arc;
pub struct OptimizedPdfReader<R: Read + Seek> {
reader: BufReader<R>,
header: PdfHeader,
xref: XRefTable,
trailer: PdfTrailer,
object_cache: LruCache<ObjectId, Arc<PdfObject>>,
object_stream_cache: HashMap<u32, ObjectStream>,
#[allow(dead_code)]
page_tree: Option<super::page_tree::PageTree>,
#[allow(dead_code)]
parse_context: StackSafeContext,
options: super::ParseOptions,
#[allow(dead_code)]
memory_options: MemoryOptions,
memory_stats: MemoryStats,
}
impl<R: Read + Seek> OptimizedPdfReader<R> {
pub fn options(&self) -> &super::ParseOptions {
&self.options
}
pub fn memory_stats(&self) -> &MemoryStats {
&self.memory_stats
}
pub fn clear_cache(&mut self) {
self.object_cache.clear();
self.object_stream_cache.clear();
self.memory_stats.cached_objects = 0;
}
}
impl OptimizedPdfReader<File> {
pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
let file = File::open(path)?;
let options = super::ParseOptions::lenient();
let memory_options = MemoryOptions::default();
Self::new_with_options(file, options, memory_options)
}
pub fn open_with_memory<P: AsRef<Path>>(
path: P,
memory_options: MemoryOptions,
) -> ParseResult<Self> {
let file = File::open(path)?;
let options = super::ParseOptions::lenient();
Self::new_with_options(file, options, memory_options)
}
pub fn open_strict<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
let file = File::open(path)?;
let options = super::ParseOptions::strict();
let memory_options = MemoryOptions::default();
Self::new_with_options(file, options, memory_options)
}
}
impl<R: Read + Seek> OptimizedPdfReader<R> {
pub fn new(reader: R) -> ParseResult<Self> {
Self::new_with_options(
reader,
super::ParseOptions::default(),
MemoryOptions::default(),
)
}
pub fn new_with_options(
reader: R,
options: super::ParseOptions,
memory_options: MemoryOptions,
) -> ParseResult<Self> {
let mut buf_reader = BufReader::new(reader);
let start_pos = buf_reader.stream_position()?;
buf_reader.seek(SeekFrom::End(0))?;
let file_size = buf_reader.stream_position()?;
buf_reader.seek(SeekFrom::Start(start_pos))?;
if file_size == 0 {
return Err(ParseError::EmptyFile);
}
let header = PdfHeader::parse(&mut buf_reader)?;
let xref = XRefTable::parse_with_options(&mut buf_reader, &options)?;
let trailer_dict = xref.trailer().ok_or(ParseError::InvalidTrailer)?.clone();
let xref_offset = xref.xref_offset();
let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;
trailer.validate()?;
let cache_size = memory_options.cache_size.max(1);
let object_cache = LruCache::new(cache_size);
Ok(Self {
reader: buf_reader,
header,
xref,
trailer,
object_cache,
object_stream_cache: HashMap::new(),
page_tree: None,
parse_context: StackSafeContext::new(),
options,
memory_options,
memory_stats: MemoryStats::default(),
})
}
pub fn version(&self) -> &super::header::PdfVersion {
&self.header.version
}
pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
let (obj_num, gen_num) = match self.trailer.root() {
Ok(root) => root,
Err(_) => {
#[cfg(debug_assertions)]
tracing::debug!("Warning: Trailer missing Root entry, attempting recovery");
if let Some(root) = self.trailer.find_root_fallback() {
root
} else {
if let Ok(catalog_ref) = self.find_catalog_object() {
catalog_ref
} else {
return Err(ParseError::MissingKey("Root".to_string()));
}
}
}
};
let catalog = self.get_object(obj_num, gen_num)?;
catalog.as_dict().ok_or_else(|| ParseError::SyntaxError {
position: 0,
message: "Catalog is not a dictionary".to_string(),
})
}
pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
match self.trailer.info() {
Some((obj_num, gen_num)) => {
let info = self.get_object(obj_num, gen_num)?;
Ok(info.as_dict())
}
None => Ok(None),
}
}
pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
let object_id = ObjectId::new(obj_num, gen_num);
if let Some(cached_obj) = self.object_cache.get(&object_id) {
self.memory_stats.cache_hits += 1;
let ptr = Arc::as_ptr(cached_obj);
return Ok(unsafe { &*ptr });
}
self.memory_stats.cache_misses += 1;
let obj = self.load_object_from_disk(obj_num, gen_num)?;
let arc_obj = Arc::new(obj);
self.object_cache.put(object_id, arc_obj);
self.memory_stats.cached_objects = self.object_cache.len();
self.object_cache
.get(&object_id)
.map(|arc| unsafe { &*Arc::as_ptr(arc) })
.ok_or(ParseError::SyntaxError {
position: 0,
message: "Object not in cache after insertion".to_string(),
})
}
fn load_object_from_disk(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<PdfObject> {
if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
return self.get_compressed_object_direct(
obj_num,
gen_num,
stream_obj_num,
index_in_stream,
);
}
}
let entry = self
.xref
.get_entry(obj_num)
.ok_or(ParseError::InvalidReference(obj_num, gen_num))?;
if !entry.in_use {
return Ok(PdfObject::Null);
}
if entry.generation != gen_num {
return Err(ParseError::InvalidReference(obj_num, gen_num));
}
self.reader.seek(std::io::SeekFrom::Start(entry.offset))?;
let mut lexer =
super::lexer::Lexer::new_with_options(&mut self.reader, self.options.clone());
let token = lexer.next_token()?;
let read_obj_num = match token {
super::lexer::Token::Integer(n) => n as u32,
_ => {
if self.options.lenient_syntax {
if self.options.collect_warnings {
tracing::debug!(
"Warning: Using expected object number {obj_num} instead of parsed token"
);
}
obj_num
} else {
return Err(ParseError::SyntaxError {
position: entry.offset as usize,
message: "Expected object number".to_string(),
});
}
}
};
if read_obj_num != obj_num && !self.options.lenient_syntax {
return Err(ParseError::SyntaxError {
position: entry.offset as usize,
message: format!(
"Object number mismatch: expected {obj_num}, found {read_obj_num}"
),
});
}
let token = lexer.next_token()?;
let read_gen_num = match token {
super::lexer::Token::Integer(n) => n as u16,
_ => {
if self.options.lenient_syntax {
if self.options.collect_warnings {
tracing::debug!(
"Warning: Using generation 0 instead of parsed token for object {obj_num}"
);
}
0
} else {
return Err(ParseError::SyntaxError {
position: entry.offset as usize,
message: "Expected generation number".to_string(),
});
}
}
};
if read_gen_num != gen_num && !self.options.lenient_syntax {
return Err(ParseError::SyntaxError {
position: entry.offset as usize,
message: format!(
"Generation number mismatch: expected {gen_num}, found {read_gen_num}"
),
});
}
let token = lexer.next_token()?;
match token {
super::lexer::Token::Obj => {}
_ => {
if self.options.lenient_syntax {
if self.options.collect_warnings {
tracing::debug!("Warning: Missing 'obj' keyword for object {obj_num}");
}
} else {
return Err(ParseError::SyntaxError {
position: entry.offset as usize,
message: "Expected 'obj' keyword".to_string(),
});
}
}
}
let object = PdfObject::parse(&mut lexer)?;
if let Ok(token) = lexer.peek_token() {
if let super::lexer::Token::EndObj = token {
let _ = lexer.next_token();
} else if !self.options.lenient_syntax && self.options.collect_warnings {
tracing::debug!("Warning: Missing 'endobj' for object {obj_num}");
}
}
Ok(object)
}
fn get_compressed_object_direct(
&mut self,
obj_num: u32,
_gen_num: u16,
stream_obj_num: u32,
_index_in_stream: u32,
) -> ParseResult<PdfObject> {
if !self.object_stream_cache.contains_key(&stream_obj_num) {
let stream_obj = self.load_object_from_disk(stream_obj_num, 0)?;
if let PdfObject::Stream(stream) = stream_obj {
let obj_stream = ObjectStream::parse(stream, &ParseOptions::default())?;
self.object_stream_cache.insert(stream_obj_num, obj_stream);
} else {
return Err(ParseError::SyntaxError {
position: 0,
message: "Object stream is not a stream object".to_string(),
});
}
}
let obj_stream = self
.object_stream_cache
.get(&stream_obj_num)
.ok_or_else(|| ParseError::SyntaxError {
position: 0,
message: "Object stream not found in cache".to_string(),
})?;
obj_stream
.get_object(obj_num)
.cloned()
.ok_or(ParseError::InvalidReference(obj_num, 0))
}
fn find_catalog_object(&mut self) -> ParseResult<(u32, u16)> {
for obj_num in 1..100 {
if let Ok(PdfObject::Dictionary(dict)) = self.get_object(obj_num, 0) {
if let Some(PdfObject::Name(type_name)) = dict.get("Type") {
if type_name.0.as_bytes() == b"Catalog" {
return Ok((obj_num, 0));
}
}
}
}
Err(ParseError::MissingKey("Catalog".to_string()))
}
pub fn reader(&mut self) -> &mut BufReader<R> {
&mut self.reader
}
}
pub fn estimate_object_size(obj: &PdfObject) -> usize {
match obj {
PdfObject::Null => 8,
PdfObject::Boolean(_) => 16,
PdfObject::Integer(_) => 16,
PdfObject::Real(_) => 16,
PdfObject::String(s) => 24 + s.as_bytes().len(),
PdfObject::Name(n) => 24 + n.0.len(),
PdfObject::Array(arr) => {
24 + arr.len() * 8 + arr.0.iter().map(estimate_object_size).sum::<usize>()
}
PdfObject::Dictionary(dict) => {
24 + dict.0.len() * 16
+ dict
.0
.iter()
.map(|(k, v)| k.0.len() + estimate_object_size(v))
.sum::<usize>()
}
PdfObject::Stream(s) => {
48 + s.data.len() + estimate_object_size(&PdfObject::Dictionary(s.dict.clone()))
}
PdfObject::Reference(_, _) => 16,
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::parser::objects::{PdfArray, PdfDictionary, PdfName, PdfStream, PdfString};
use std::io::Cursor;
fn create_minimal_pdf() -> Vec<u8> {
b"%PDF-1.4\n\
1 0 obj\n\
<< /Type /Catalog /Pages 2 0 R >>\n\
endobj\n\
2 0 obj\n\
<< /Type /Pages /Kids [3 0 R] /Count 1 >>\n\
endobj\n\
3 0 obj\n\
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\n\
endobj\n\
xref\n\
0 4\n\
0000000000 65535 f \n\
0000000009 00000 n \n\
0000000058 00000 n \n\
0000000115 00000 n \n\
trailer\n\
<< /Size 4 /Root 1 0 R >>\n\
startxref\n\
186\n\
%%EOF\n"
.to_vec()
}
fn create_empty_pdf() -> Vec<u8> {
Vec::new()
}
fn create_invalid_pdf() -> Vec<u8> {
b"Not a PDF file".to_vec()
}
#[test]
fn test_memory_options_integration() {
let options = MemoryOptions::default().with_cache_size(100);
assert_eq!(options.cache_size, 100);
let options = MemoryOptions::default().with_cache_size(0);
assert_eq!(options.cache_size, 0);
}
#[test]
fn test_object_size_estimation_basic_types() {
let obj = PdfObject::Null;
assert_eq!(estimate_object_size(&obj), 8);
let obj = PdfObject::Boolean(true);
assert_eq!(estimate_object_size(&obj), 16);
let obj = PdfObject::Boolean(false);
assert_eq!(estimate_object_size(&obj), 16);
let obj = PdfObject::Integer(42);
assert_eq!(estimate_object_size(&obj), 16);
let obj = PdfObject::Integer(-1000);
assert_eq!(estimate_object_size(&obj), 16);
let obj = PdfObject::Real(3.14159);
assert_eq!(estimate_object_size(&obj), 16);
let obj = PdfObject::Reference(5, 0);
assert_eq!(estimate_object_size(&obj), 16);
}
#[test]
fn test_object_size_estimation_string_types() {
let obj = PdfObject::String(PdfString::new(b"".to_vec()));
assert_eq!(estimate_object_size(&obj), 24);
let obj = PdfObject::String(PdfString::new(b"Hello".to_vec()));
assert_eq!(estimate_object_size(&obj), 24 + 5);
let long_text = "A".repeat(1000);
let obj = PdfObject::String(PdfString::new(long_text.as_bytes().to_vec()));
assert_eq!(estimate_object_size(&obj), 24 + 1000);
let obj = PdfObject::Name(PdfName::new("Type".to_string()));
assert_eq!(estimate_object_size(&obj), 24 + 4);
let obj = PdfObject::Name(PdfName::new("".to_string()));
assert_eq!(estimate_object_size(&obj), 24);
}
#[test]
fn test_object_size_estimation_array() {
let obj = PdfObject::Array(PdfArray(vec![]));
assert_eq!(estimate_object_size(&obj), 24);
let obj = PdfObject::Array(PdfArray(vec![
PdfObject::Integer(1),
PdfObject::Integer(2),
PdfObject::Integer(3),
]));
assert_eq!(estimate_object_size(&obj), 24 + 3 * 8 + 3 * 16);
let inner_array = PdfObject::Array(PdfArray(vec![
PdfObject::Integer(10),
PdfObject::Integer(20),
]));
let obj = PdfObject::Array(PdfArray(vec![PdfObject::Integer(1), inner_array]));
let expected = 24 + 2 * 8 + 16 + (24 + 2 * 8 + 2 * 16);
assert_eq!(estimate_object_size(&obj), expected);
}
#[test]
fn test_object_size_estimation_dictionary() {
let obj = PdfObject::Dictionary(PdfDictionary::new());
assert_eq!(estimate_object_size(&obj), 24);
let mut dict = PdfDictionary::new();
dict.insert(
"Type".to_string(),
PdfObject::Name(PdfName::new("Catalog".to_string())),
);
dict.insert("Count".to_string(), PdfObject::Integer(5));
let obj = PdfObject::Dictionary(dict);
let expected = 24 + 2 * 16 + (4 + 24 + 7) + (5 + 16);
assert_eq!(estimate_object_size(&obj), expected);
}
#[test]
fn test_object_size_estimation_stream() {
let mut dict = PdfDictionary::new();
dict.insert("Length".to_string(), PdfObject::Integer(10));
let stream = PdfObject::Stream(PdfStream {
dict: dict.clone(),
data: b"Hello Test".to_vec(),
});
let dict_size = estimate_object_size(&PdfObject::Dictionary(dict));
let expected = 48 + 10 + dict_size;
assert_eq!(estimate_object_size(&stream), expected);
}
#[test]
fn test_object_size_estimation_complex_structure() {
let mut inner_dict = PdfDictionary::new();
inner_dict.insert(
"Font".to_string(),
PdfObject::Name(PdfName::new("Helvetica".to_string())),
);
inner_dict.insert("Size".to_string(), PdfObject::Integer(12));
let array = PdfObject::Array(PdfArray(vec![
PdfObject::String(PdfString::new(b"Text content".to_vec())),
PdfObject::Dictionary(inner_dict),
PdfObject::Reference(10, 0),
]));
let mut main_dict = PdfDictionary::new();
main_dict.insert(
"Type".to_string(),
PdfObject::Name(PdfName::new("Page".to_string())),
);
main_dict.insert("Contents".to_string(), array);
let obj = PdfObject::Dictionary(main_dict);
let size = estimate_object_size(&obj);
assert!(size > 100);
assert!(size < 1000);
}
#[test]
fn test_optimized_reader_empty_file() {
let data = create_empty_pdf();
let cursor = Cursor::new(data);
let result = OptimizedPdfReader::new(cursor);
assert!(result.is_err());
if let Err(ParseError::EmptyFile) = result {
} else {
panic!("Expected EmptyFile error");
}
}
#[test]
fn test_optimized_reader_invalid_file() {
let data = create_invalid_pdf();
let cursor = Cursor::new(data);
let result = OptimizedPdfReader::new(cursor);
assert!(result.is_err());
}
#[test]
fn test_optimized_reader_creation_with_options() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let parse_options = ParseOptions {
lenient_syntax: true,
collect_warnings: false,
..Default::default()
};
let memory_options = MemoryOptions::default().with_cache_size(50);
let result = OptimizedPdfReader::new_with_options(cursor, parse_options, memory_options);
if result.is_err() {
return;
}
let reader = result.unwrap();
assert!(reader.options().lenient_syntax);
assert!(!reader.options().collect_warnings);
}
#[test]
fn test_optimized_reader_version_access() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let result = OptimizedPdfReader::new(cursor);
if result.is_err() {
return;
}
let reader = result.unwrap();
let version = reader.version();
assert_eq!(version.major, 1);
assert_eq!(version.minor, 4);
}
#[test]
fn test_memory_options_validation() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let memory_options = MemoryOptions::default().with_cache_size(0);
let parse_options = ParseOptions::default();
let result = OptimizedPdfReader::new_with_options(cursor, parse_options, memory_options);
if result.is_err() {
let memory_opts = MemoryOptions::default().with_cache_size(0);
let cache_size = memory_opts.cache_size.max(1);
assert_eq!(cache_size, 1);
}
}
#[test]
fn test_estimate_object_size_edge_cases() {
let large_array = PdfObject::Array(PdfArray((0..1000).map(PdfObject::Integer).collect()));
let size = estimate_object_size(&large_array);
assert!(size > 16000);
let mut large_dict = PdfDictionary::new();
for i in 0..100 {
large_dict.insert(
format!("Key{i}"),
PdfObject::String(PdfString::new(format!("Value{i}").as_bytes().to_vec())),
);
}
let obj = PdfObject::Dictionary(large_dict);
let size = estimate_object_size(&obj);
assert!(size > 1000);
}
#[test]
fn test_memory_options_default_values() {
let options = MemoryOptions::default();
assert!(options.cache_size > 0);
assert!(options.cache_size < 10000); }
#[test]
fn test_memory_options_builder_pattern() {
let options = MemoryOptions::default().with_cache_size(500);
assert_eq!(options.cache_size, 500);
}
#[test]
fn test_object_size_estimation_consistency() {
let obj1 = PdfObject::String(PdfString::new(b"Test".to_vec()));
let obj2 = PdfObject::String(PdfString::new(b"Test".to_vec()));
assert_eq!(estimate_object_size(&obj1), estimate_object_size(&obj2));
let obj3 = PdfObject::String(PdfString::new(b"Different".to_vec()));
assert_ne!(estimate_object_size(&obj1), estimate_object_size(&obj3));
}
#[test]
fn test_object_size_estimation_zero_values() {
let obj = PdfObject::Integer(0);
assert_eq!(estimate_object_size(&obj), 16);
let obj = PdfObject::Real(0.0);
assert_eq!(estimate_object_size(&obj), 16);
let obj = PdfObject::Reference(0, 0);
assert_eq!(estimate_object_size(&obj), 16);
}
#[test]
fn test_object_size_estimation_negative_values() {
let obj = PdfObject::Integer(-42);
assert_eq!(estimate_object_size(&obj), 16);
let obj = PdfObject::Real(-3.14159);
assert_eq!(estimate_object_size(&obj), 16);
}
#[test]
fn test_object_size_estimation_unicode_strings() {
let unicode_text = "Hello 世界 🌍";
let obj = PdfObject::String(PdfString::new(unicode_text.as_bytes().to_vec()));
let expected_size = 24 + unicode_text.len();
assert_eq!(estimate_object_size(&obj), expected_size);
}
#[test]
fn test_object_size_estimation_mixed_array() {
let obj = PdfObject::Array(PdfArray(vec![
PdfObject::Null,
PdfObject::Boolean(true),
PdfObject::Integer(42),
PdfObject::Real(3.14),
PdfObject::String(PdfString::new(b"test".to_vec())),
PdfObject::Name(PdfName::new("Name".to_string())),
PdfObject::Reference(1, 0),
]));
let expected = 24 + 7 * 8 + 8 + 16 + 16 + 16 + (24 + 4) + (24 + 4) + 16;
assert_eq!(estimate_object_size(&obj), expected);
}
#[test]
fn test_find_catalog_object_range() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
if let Ok(mut reader) = OptimizedPdfReader::new(cursor) {
let _result = reader.find_catalog_object();
}
}
#[test]
fn test_memory_stats_tracking() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
if let Ok(reader) = OptimizedPdfReader::new(cursor) {
assert_eq!(reader.memory_stats.cache_hits, 0);
assert_eq!(reader.memory_stats.cache_misses, 0);
assert_eq!(reader.memory_stats.cached_objects, 0);
}
}
mod rigorous {
use super::*;
#[test]
fn test_lru_cache_hit_tracking() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let mut reader =
OptimizedPdfReader::new(cursor).expect("Minimal PDF must parse successfully");
assert_eq!(
reader.memory_stats().cache_hits,
0,
"Cache hits must start at 0"
);
assert_eq!(
reader.memory_stats().cache_misses,
0,
"Cache misses must start at 0"
);
let _ = reader.get_object(1, 0);
assert_eq!(
reader.memory_stats().cache_misses,
1,
"First access must be cache miss"
);
assert_eq!(reader.memory_stats().cache_hits, 0, "No cache hits yet");
let _ = reader.get_object(1, 0);
assert_eq!(
reader.memory_stats().cache_hits,
1,
"Second access must be cache hit"
);
assert_eq!(
reader.memory_stats().cache_misses,
1,
"Cache misses unchanged"
);
let _ = reader.get_object(1, 0);
assert_eq!(
reader.memory_stats().cache_hits,
2,
"Third access must increment cache hits"
);
}
#[test]
fn test_lru_cache_capacity_enforcement() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let memory_options = MemoryOptions::default().with_cache_size(2);
let parse_options = ParseOptions::default();
let mut reader =
OptimizedPdfReader::new_with_options(cursor, parse_options, memory_options)
.expect("Minimal PDF must parse successfully");
let _ = reader.get_object(1, 0);
assert_eq!(
reader.memory_stats().cached_objects,
1,
"Cache should have 1 object"
);
let _ = reader.get_object(2, 0);
assert_eq!(
reader.memory_stats().cached_objects,
2,
"Cache should have 2 objects"
);
let _ = reader.get_object(3, 0);
assert_eq!(
reader.memory_stats().cached_objects,
2,
"Cache must not exceed capacity of 2"
);
}
#[test]
fn test_cache_clear_resets_stats() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let mut reader =
OptimizedPdfReader::new(cursor).expect("Minimal PDF must parse successfully");
let _ = reader.get_object(1, 0);
let _ = reader.get_object(1, 0);
assert!(reader.memory_stats().cache_hits > 0);
assert!(reader.memory_stats().cached_objects > 0);
reader.clear_cache();
assert_eq!(
reader.memory_stats().cached_objects,
0,
"Cache should be empty after clear"
);
let _ = reader.get_object(1, 0);
assert!(
reader.memory_stats().cache_misses >= 2,
"Access after clear must be cache miss"
);
}
#[test]
fn test_empty_file_error_handling() {
let data = create_empty_pdf();
let cursor = Cursor::new(data);
let result = OptimizedPdfReader::new(cursor);
assert!(result.is_err(), "Empty file must return error");
match result {
Err(ParseError::EmptyFile) => {
}
Err(other) => panic!("Expected EmptyFile error, got: {:?}", other),
Ok(_) => panic!("Should not succeed with empty file"),
}
}
#[test]
fn test_invalid_header_error_handling() {
let data = create_invalid_pdf();
let cursor = Cursor::new(data);
let result = OptimizedPdfReader::new(cursor);
assert!(result.is_err(), "Invalid PDF must return error");
}
#[test]
fn test_version_parsing_exact_values() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let reader =
OptimizedPdfReader::new(cursor).expect("Minimal PDF must parse successfully");
let version = reader.version();
assert_eq!(version.major, 1, "PDF major version must be 1");
assert_eq!(version.minor, 4, "PDF minor version must be 4");
}
#[test]
fn test_options_accessibility() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let parse_options = ParseOptions {
lenient_syntax: true,
collect_warnings: false,
..Default::default()
};
let memory_options = MemoryOptions::default().with_cache_size(100);
let reader =
OptimizedPdfReader::new_with_options(cursor, parse_options, memory_options)
.expect("Minimal PDF must parse successfully");
let opts = reader.options();
assert_eq!(
opts.lenient_syntax, true,
"Options must match provided values"
);
assert_eq!(
opts.collect_warnings, false,
"Options must match provided values"
);
}
#[test]
fn test_catalog_access_requires_valid_trailer() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let mut reader =
OptimizedPdfReader::new(cursor).expect("Minimal PDF must parse successfully");
let catalog_result = reader.catalog();
if catalog_result.is_ok() {
let catalog = catalog_result.unwrap();
assert_eq!(
catalog.get("Type"),
Some(&PdfObject::Name(PdfName("Catalog".to_string()))),
"Catalog must have /Type /Catalog"
);
} else {
assert!(matches!(
catalog_result.unwrap_err(),
ParseError::MissingKey(_) | ParseError::SyntaxError { .. }
));
}
}
#[test]
fn test_info_none_when_absent() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let mut reader =
OptimizedPdfReader::new(cursor).expect("Minimal PDF must parse successfully");
let info_result = reader.info();
if info_result.is_ok() {
let info = info_result.unwrap();
assert!(
info.is_none(),
"Info should be None when not present in trailer"
);
}
}
#[test]
fn test_get_object_wrong_generation() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let mut reader =
OptimizedPdfReader::new(cursor).expect("Minimal PDF must parse successfully");
let result = reader.get_object(1, 5);
if result.is_err() {
assert!(matches!(
result.unwrap_err(),
ParseError::InvalidReference(_, _)
));
}
}
#[test]
fn test_get_nonexistent_object() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let mut reader =
OptimizedPdfReader::new(cursor).expect("Minimal PDF must parse successfully");
let result = reader.get_object(9999, 0);
assert!(
result.is_err(),
"Accessing nonexistent object must return error"
);
assert!(matches!(
result.unwrap_err(),
ParseError::InvalidReference(_, _)
));
}
#[test]
fn test_memory_options_min_cache_size() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let memory_options = MemoryOptions::default().with_cache_size(0);
let parse_options = ParseOptions::default();
let mut reader =
OptimizedPdfReader::new_with_options(cursor, parse_options, memory_options)
.expect("Minimal PDF must parse successfully");
let _ = reader.get_object(1, 0);
assert_eq!(
reader.memory_stats().cached_objects,
1,
"Must cache at least 1 object even with cache_size=0"
);
}
#[test]
fn test_estimate_object_size_exact_values() {
assert_eq!(estimate_object_size(&PdfObject::Null), 8);
assert_eq!(estimate_object_size(&PdfObject::Boolean(true)), 16);
assert_eq!(estimate_object_size(&PdfObject::Boolean(false)), 16);
assert_eq!(estimate_object_size(&PdfObject::Integer(0)), 16);
assert_eq!(estimate_object_size(&PdfObject::Integer(42)), 16);
assert_eq!(estimate_object_size(&PdfObject::Integer(-1000)), 16);
assert_eq!(estimate_object_size(&PdfObject::Real(0.0)), 16);
assert_eq!(estimate_object_size(&PdfObject::Real(3.14159)), 16);
assert_eq!(estimate_object_size(&PdfObject::Reference(1, 0)), 16);
assert_eq!(estimate_object_size(&PdfObject::Reference(999, 5)), 16);
}
#[test]
fn test_estimate_string_size_formula() {
let empty = PdfObject::String(PdfString::new(vec![]));
assert_eq!(estimate_object_size(&empty), 24);
let ten_bytes = PdfObject::String(PdfString::new(b"0123456789".to_vec()));
assert_eq!(estimate_object_size(&ten_bytes), 24 + 10);
let hundred_bytes = PdfObject::String(PdfString::new(vec![b'X'; 100]));
assert_eq!(estimate_object_size(&hundred_bytes), 24 + 100);
}
#[test]
fn test_estimate_array_size_formula() {
let empty = PdfObject::Array(PdfArray(vec![]));
assert_eq!(estimate_object_size(&empty), 24);
let three_ints = PdfObject::Array(PdfArray(vec![
PdfObject::Integer(1),
PdfObject::Integer(2),
PdfObject::Integer(3),
]));
assert_eq!(estimate_object_size(&three_ints), 24 + 24 + 48);
}
#[test]
fn test_estimate_dictionary_size_formula() {
let empty = PdfObject::Dictionary(PdfDictionary::new());
assert_eq!(estimate_object_size(&empty), 24);
let mut dict = PdfDictionary::new();
dict.insert(
"Type".to_string(),
PdfObject::Name(PdfName::new("Page".to_string())),
);
let obj = PdfObject::Dictionary(dict);
let expected = 24 + 16 + 4 + (24 + 4); assert_eq!(estimate_object_size(&obj), expected);
}
#[test]
fn test_cache_isolation_between_instances() {
let data = create_minimal_pdf();
let cursor1 = Cursor::new(data.clone());
let cursor2 = Cursor::new(data);
let mut reader1 =
OptimizedPdfReader::new(cursor1).expect("Minimal PDF must parse successfully");
let mut reader2 =
OptimizedPdfReader::new(cursor2).expect("Minimal PDF must parse successfully");
let _ = reader1.get_object(1, 0);
assert_eq!(reader1.memory_stats().cached_objects, 1);
assert_eq!(
reader2.memory_stats().cached_objects,
0,
"Readers must have independent caches"
);
let _ = reader2.get_object(1, 0);
assert_eq!(
reader2.memory_stats().cached_objects,
1,
"reader2 cache should now have 1 object"
);
assert_eq!(
reader1.memory_stats().cached_objects,
1,
"reader1 cache unchanged"
);
}
#[test]
fn test_reader_with_strict_options() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let parse_options = ParseOptions::strict();
let memory_options = MemoryOptions::default();
let reader =
OptimizedPdfReader::new_with_options(cursor, parse_options, memory_options)
.expect("Minimal PDF must parse successfully");
let opts = reader.options();
assert_eq!(
opts.strict_mode, true,
"Strict options must have strict_mode=true"
);
}
#[test]
fn test_reader_with_lenient_options() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let parse_options = ParseOptions::lenient();
let memory_options = MemoryOptions::default();
let reader =
OptimizedPdfReader::new_with_options(cursor, parse_options, memory_options)
.expect("Minimal PDF must parse successfully");
let opts = reader.options();
assert_eq!(
opts.strict_mode, false,
"Lenient options must have strict_mode=false"
);
}
#[test]
fn test_open_from_file_path() {
use std::io::Write;
use tempfile::NamedTempFile;
let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
temp_file
.write_all(&create_minimal_pdf())
.expect("Failed to write PDF data");
let path = temp_file.path();
let result = OptimizedPdfReader::open(path);
assert!(result.is_ok(), "open() must succeed with valid PDF file");
let reader = result.unwrap();
assert_eq!(
reader.options().strict_mode,
false,
"open() must use lenient parsing"
);
assert_eq!(reader.version().major, 1);
assert_eq!(reader.version().minor, 4);
}
#[test]
fn test_open_with_memory_options() {
use std::io::Write;
use tempfile::NamedTempFile;
let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
temp_file
.write_all(&create_minimal_pdf())
.expect("Failed to write PDF data");
let path = temp_file.path();
let memory_options = MemoryOptions::default().with_cache_size(10);
let result = OptimizedPdfReader::open_with_memory(path, memory_options);
assert!(result.is_ok(), "open_with_memory() must succeed");
let mut reader = result.unwrap();
assert_eq!(reader.options().strict_mode, false);
let _ = reader.get_object(1, 0);
assert_eq!(
reader.memory_stats().cached_objects,
1,
"Cache should respect custom memory options"
);
}
#[test]
fn test_open_strict_mode() {
use std::io::Write;
use tempfile::NamedTempFile;
let mut temp_file = NamedTempFile::new().expect("Failed to create temp file");
temp_file
.write_all(&create_minimal_pdf())
.expect("Failed to write PDF data");
let path = temp_file.path();
let result = OptimizedPdfReader::open_strict(path);
assert!(result.is_ok(), "open_strict() must succeed with valid PDF");
let reader = result.unwrap();
assert_eq!(
reader.options().strict_mode,
true,
"open_strict() must use strict parsing"
);
assert_eq!(reader.version().major, 1);
assert_eq!(reader.version().minor, 4);
}
#[test]
fn test_open_nonexistent_file() {
use std::path::PathBuf;
let path = PathBuf::from("/tmp/this_file_does_not_exist_xyz_123.pdf");
let result = OptimizedPdfReader::open(&path);
assert!(result.is_err(), "open() must fail with nonexistent file");
match result {
Err(ParseError::Io(_)) => {
}
Err(other) => panic!("Expected IO error, got: {:?}", other),
Ok(_) => panic!("Should not succeed with nonexistent file"),
}
}
#[test]
fn test_load_object_from_disk_free_object() {
let pdf_with_free = b"%PDF-1.4\n\
1 0 obj\n\
<< /Type /Catalog /Pages 2 0 R >>\n\
endobj\n\
2 0 obj\n\
<< /Type /Pages /Kids [3 0 R] /Count 1 >>\n\
endobj\n\
3 0 obj\n\
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\n\
endobj\n\
xref\n\
0 4\n\
0000000000 65535 f \n\
0000000009 00000 n \n\
0000000058 00000 n \n\
0000000115 00000 n \n\
trailer\n\
<< /Size 4 /Root 1 0 R >>\n\
startxref\n\
186\n\
%%EOF\n"
.to_vec();
let cursor = Cursor::new(pdf_with_free);
let mut reader =
OptimizedPdfReader::new(cursor).expect("PDF with free object must parse");
let result = reader.get_object(0, 65535);
if let Ok(obj) = result {
assert!(
matches!(obj, PdfObject::Null),
"Free object should return Null"
);
}
}
#[test]
fn test_find_catalog_when_trailer_missing_root() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let mut reader = OptimizedPdfReader::new(cursor).expect("Minimal PDF must parse");
let result = reader.catalog();
if let Ok(catalog) = result {
assert_eq!(
catalog.get("Type"),
Some(&PdfObject::Name(PdfName("Catalog".to_string()))),
"Catalog must have /Type /Catalog"
);
}
}
#[test]
fn test_load_object_generation_mismatch_strict() {
let data = create_minimal_pdf();
let cursor = Cursor::new(data);
let parse_options = ParseOptions::strict();
let memory_options = MemoryOptions::default();
let mut reader =
OptimizedPdfReader::new_with_options(cursor, parse_options, memory_options)
.expect("Minimal PDF must parse in strict mode");
let result = reader.get_object(1, 5);
assert!(
result.is_err(),
"Strict mode must reject generation number mismatch"
);
if let Err(e) = result {
assert!(
matches!(e, ParseError::InvalidReference(_, _)),
"Expected InvalidReference error, got: {:?}",
e
);
}
}
}
}