#[cfg(test)]
use super::objects::{PdfArray, PdfName};
use super::objects::{PdfDictionary, PdfObject};
use super::page_tree::{PageTree, ParsedPage};
use super::reader::PdfReader;
use super::{ParseError, ParseOptions, ParseResult};
use std::cell::RefCell;
use std::collections::HashMap;
use std::fs::File;
use std::io::{Read, Seek};
use std::path::Path;
use std::rc::Rc;
/// Cache of parsed PDF objects keyed by `(object number, generation number)`.
///
/// The map sits behind a `RefCell`, so entries can be inserted or cleared
/// through a shared `&self` reference.
pub struct ResourceManager {
// Cached objects; `PdfDocument::get_object` stores entries under `(obj_num, gen_num)`.
object_cache: RefCell<HashMap<(u32, u16), PdfObject>>,
}
impl Default for ResourceManager {
fn default() -> Self {
Self::new()
}
}
impl ResourceManager {
    /// Creates a manager whose object cache is empty.
    pub fn new() -> Self {
        let object_cache = RefCell::new(HashMap::new());
        Self { object_cache }
    }

    /// Returns a clone of the object cached under `obj_ref`, or `None` when
    /// nothing is stored for that key.
    pub fn get_cached(&self, obj_ref: (u32, u16)) -> Option<PdfObject> {
        let cache = self.object_cache.borrow();
        let hit = cache.get(&obj_ref).cloned();
        hit
    }

    /// Stores `obj` under `obj_ref`, replacing any entry already present.
    pub fn cache_object(&self, obj_ref: (u32, u16), obj: PdfObject) {
        let mut cache = self.object_cache.borrow_mut();
        cache.insert(obj_ref, obj);
    }

    /// Removes every cached object.
    pub fn clear_cache(&self) {
        let mut cache = self.object_cache.borrow_mut();
        cache.clear();
    }
}
/// High-level handle on a parsed PDF, layered over a [`PdfReader`].
///
/// The reader and the caches sit behind `RefCell`s, so the methods take
/// `&self` while still being able to read from the reader and fill caches.
pub struct PdfDocument<R: Read + Seek> {
// Underlying reader; borrowed mutably whenever objects are loaded from it.
reader: RefCell<PdfReader<R>>,
// Page tree built on first use by `ensure_page_tree`; `None` until then.
page_tree: RefCell<Option<PageTree>>,
// Object cache consulted by `get_object` before going to the reader.
resources: Rc<ResourceManager>,
// Document metadata, stored after the first successful `metadata()` call.
metadata_cache: RefCell<Option<super::reader::DocumentMetadata>>,
}
impl<R: Read + Seek> PdfDocument<R> {
/// Wraps `reader` in a document with an empty object cache, no page tree
/// and no cached metadata.
pub fn new(reader: PdfReader<R>) -> Self {
    let resources = Rc::new(ResourceManager::new());
    let reader = RefCell::new(reader);
    Self {
        reader,
        page_tree: RefCell::new(None),
        resources,
        metadata_cache: RefCell::new(None),
    }
}
/// Returns the reader's PDF version rendered as a string (e.g. `"1.4"`).
///
/// The result is always `Ok`.
pub fn version(&self) -> ParseResult<String> {
    let reader = self.reader.borrow();
    let version = reader.version().to_string();
    Ok(version)
}
/// Returns a copy of the parse options held by the underlying reader.
pub fn options(&self) -> ParseOptions {
    let reader = self.reader.borrow();
    let options: ParseOptions = reader.options().clone();
    options
}
/// Returns the number of pages in the document.
///
/// Builds the page tree on first use and asks it for the count; if no tree
/// is available the reader is queried instead.
///
/// # Errors
/// Propagates any error from building the page tree or from the reader.
pub fn page_count(&self) -> ParseResult<u32> {
    self.ensure_page_tree()?;
    match self.page_tree.borrow().as_ref() {
        Some(tree) => Ok(tree.page_count()),
        None => self.reader.borrow_mut().page_count(),
    }
}
/// Returns the document metadata, reading it from the reader on the first
/// call and serving a cached clone afterwards.
///
/// # Errors
/// Propagates the reader's error when the metadata cannot be loaded; nothing
/// is cached in that case.
pub fn metadata(&self) -> ParseResult<super::reader::DocumentMetadata> {
    let cached = self.metadata_cache.borrow().as_ref().cloned();
    if let Some(metadata) = cached {
        return Ok(metadata);
    }

    let fresh = self.reader.borrow_mut().metadata()?;
    *self.metadata_cache.borrow_mut() = Some(fresh.clone());
    Ok(fresh)
}
/// Builds the page tree (with its flat page index) if it has not been built yet.
///
/// # Errors
/// Propagates failures from loading the `/Pages` dictionary or flattening
/// the tree; the tree stays unset in that case.
fn ensure_page_tree(&self) -> ParseResult<()> {
    // Already built: nothing to do.
    if self.page_tree.borrow().is_some() {
        return Ok(());
    }

    let pages_dict = self.load_pages_dict()?;
    // The reader borrow ends with this statement, before the tree is stored.
    let page_refs = PageTree::flatten_page_tree(&mut *self.reader.borrow_mut(), &pages_dict)?;
    let tree = PageTree::new_with_flat_index(pages_dict, page_refs);
    *self.page_tree.borrow_mut() = Some(tree);
    Ok(())
}
/// Returns an owned copy of the dictionary that the reader reports via `pages()`.
fn load_pages_dict(&self) -> ParseResult<PdfDictionary> {
    let pages_dict = self.reader.borrow_mut().pages()?.clone();
    Ok(pages_dict)
}
/// Returns the page at zero-based `index`.
///
/// Lookup order:
/// 1. the page tree's cache of already parsed pages;
/// 2. the flat index, loading the page object by reference;
/// 3. a tree walk via `load_page_at_index`, used only when the tree has no
///    flat index entries at all.
///
/// A successfully loaded page is stored in the page tree's cache.
///
/// # Errors
/// Returns `ParseError::SyntaxError` when the flat index exists but has no
/// entry for `index`, and propagates any error from loading the page.
pub fn get_page(&self, index: u32) -> ParseResult<ParsedPage> {
    self.ensure_page_tree()?;

    // Fast path: clone out of the cache and release the borrow immediately.
    let cached = self
        .page_tree
        .borrow()
        .as_ref()
        .and_then(|tree| tree.get_cached_page(index).map(|page| page.clone()));
    if let Some(page) = cached {
        return Ok(page);
    }

    let (page_ref, has_flat_index) = {
        let guard = self.page_tree.borrow();
        let tree = guard.as_ref();
        let page_ref = tree.and_then(|t| t.get_page_ref(index));
        let has_flat_index = match tree {
            Some(t) => t.page_count() > 0 || page_ref.is_some(),
            None => false,
        };
        (page_ref, has_flat_index)
    };

    let page = match page_ref {
        Some(page_ref) => self.load_page_by_ref(page_ref)?,
        None if has_flat_index => {
            let total = self
                .page_tree
                .borrow()
                .as_ref()
                .map_or(0, |tree| tree.page_count());
            return Err(ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Page index {} out of range (document has {} pages)",
                    index, total
                ),
            });
        }
        None => self.load_page_at_index(index)?,
    };

    if let Some(tree) = self.page_tree.borrow_mut().as_mut() {
        tree.cache_page(index, page.clone());
    }
    Ok(page)
}
/// Finds page `index` by walking the page tree from the `/Pages` root,
/// starting the page counter at 0 with no inherited attributes.
fn load_page_at_index(&self, index: u32) -> ParseResult<ParsedPage> {
    let root = self.load_pages_dict()?;
    self.find_page_in_tree(&root, index, 0, None)
}
/// Loads the page object `page_ref` and builds a `ParsedPage` from it,
/// including attributes inherited from its `/Parent` chain.
///
/// # Errors
/// Returns `ParseError::SyntaxError` if the object is not a dictionary, and
/// propagates errors from loading the object or parsing the page.
fn load_page_by_ref(&self, page_ref: (u32, u16)) -> ParseResult<ParsedPage> {
    let (obj_num, gen_num) = page_ref;
    let obj = self.get_object(obj_num, gen_num)?;
    let dict = match obj.as_dict() {
        Some(dict) => dict,
        None => {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: format!(
                    "Page object {} {} R is not a dictionary",
                    obj_num, gen_num
                ),
            })
        }
    };
    let inherited = self.collect_inherited_attributes(dict);
    self.create_parsed_page(page_ref, dict, Some(&inherited))
}
/// Collects `Resources`, `MediaBox`, `CropBox` and `Rotate` values that the
/// page does not define itself by walking up its `/Parent` chain.
///
/// The nearest ancestor that defines a key wins. The walk is best-effort: it
/// stops silently at the first parent that cannot be loaded, is not a
/// dictionary, or was already visited (which guards against reference cycles).
fn collect_inherited_attributes(&self, page_dict: &PdfDictionary) -> PdfDictionary {
    const INHERITABLE_KEYS: [&str; 4] = ["Resources", "MediaBox", "CropBox", "Rotate"];

    let mut inherited = PdfDictionary::new();
    let mut seen: std::collections::HashSet<(u32, u16)> = std::collections::HashSet::new();
    let mut next_parent = page_dict.get("Parent").and_then(|p| p.as_reference());

    while let Some(parent_ref) = next_parent {
        // `insert` returns false when the reference was seen before.
        if !seen.insert(parent_ref) {
            break;
        }
        let parent_obj = match self.get_object(parent_ref.0, parent_ref.1) {
            Ok(obj) => obj,
            Err(_) => break,
        };
        let parent_dict = match parent_obj.as_dict() {
            Some(dict) => dict,
            None => break,
        };

        for key in INHERITABLE_KEYS {
            if page_dict.contains_key(key) || inherited.contains_key(key) {
                continue;
            }
            if let Some(value) = parent_dict.get(key) {
                inherited.insert(key.to_string(), value.clone());
            }
        }

        next_parent = parent_dict.get("Parent").and_then(|p| p.as_reference());
    }

    inherited
}
fn find_page_in_tree(
&self,
root_node: &PdfDictionary,
target_index: u32,
initial_current_index: u32,
initial_inherited: Option<&PdfDictionary>,
) -> ParseResult<ParsedPage> {
#[derive(Debug)]
struct WorkItem {
node_dict: PdfDictionary,
node_ref: Option<(u32, u16)>,
current_index: u32,
inherited: Option<PdfDictionary>,
}
let mut work_queue = Vec::new();
work_queue.push(WorkItem {
node_dict: root_node.clone(),
node_ref: None,
current_index: initial_current_index,
inherited: initial_inherited.cloned(),
});
while let Some(work_item) = work_queue.pop() {
let WorkItem {
node_dict,
node_ref,
current_index,
inherited,
} = work_item;
let node_type = node_dict
.get_type()
.or_else(|| {
if node_dict.contains_key("Kids") && node_dict.contains_key("Count") {
Some("Pages")
} else if node_dict.contains_key("Contents")
|| node_dict.contains_key("MediaBox")
{
Some("Page")
} else {
None
}
})
.or_else(|| {
if node_dict.contains_key("Kids") {
Some("Pages")
} else if node_dict.contains_key("Contents")
|| (node_dict.contains_key("MediaBox") && !node_dict.contains_key("Kids"))
{
Some("Page")
} else {
None
}
})
.ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
match node_type {
"Pages" => {
let kids = node_dict
.get("Kids")
.and_then(|obj| obj.as_array())
.or_else(|| {
tracing::debug!(
"Warning: Missing Kids array in Pages node, using empty array"
);
Some(&super::objects::EMPTY_PDF_ARRAY)
})
.ok_or_else(|| ParseError::MissingKey("Kids".to_string()))?;
let mut merged_inherited = inherited.unwrap_or_else(PdfDictionary::new);
for key in ["Resources", "MediaBox", "CropBox", "Rotate"] {
if let Some(value) = node_dict.get(key) {
if !merged_inherited.contains_key(key) {
merged_inherited.insert(key.to_string(), value.clone());
}
}
}
let mut current_idx = current_index;
let mut pending_kids = Vec::new();
for kid_ref in &kids.0 {
let kid_ref =
kid_ref
.as_reference()
.ok_or_else(|| ParseError::SyntaxError {
position: 0,
message: "Kids array must contain references".to_string(),
})?;
let kid_obj = self.get_object(kid_ref.0, kid_ref.1)?;
let kid_dict = match kid_obj.as_dict() {
Some(dict) => dict,
None => {
tracing::debug!(
"Warning: Page tree node {} {} R is not a dictionary, skipping",
kid_ref.0,
kid_ref.1
);
current_idx += 1; continue;
}
};
let kid_type = kid_dict
.get_type()
.or_else(|| {
if kid_dict.contains_key("Kids") && kid_dict.contains_key("Count") {
Some("Pages")
} else if kid_dict.contains_key("Contents")
|| kid_dict.contains_key("MediaBox")
{
Some("Page")
} else {
None
}
})
.ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
let count = if kid_type == "Pages" {
kid_dict
.get("Count")
.and_then(|obj| obj.as_integer())
.unwrap_or(1) as u32
} else {
1
};
if target_index < current_idx + count {
if kid_type == "Page" {
return self.create_parsed_page(
kid_ref,
kid_dict,
Some(&merged_inherited),
);
} else {
pending_kids.push(WorkItem {
node_dict: kid_dict.clone(),
node_ref: Some(kid_ref),
current_index: current_idx,
inherited: Some(merged_inherited.clone()),
});
break; }
}
current_idx += count;
}
work_queue.extend(pending_kids.into_iter().rev());
}
"Page" => {
if target_index != current_index {
return Err(ParseError::SyntaxError {
position: 0,
message: "Page index mismatch".to_string(),
});
}
if let Some(page_ref) = node_ref {
return self.create_parsed_page(page_ref, &node_dict, inherited.as_ref());
} else {
return Err(ParseError::SyntaxError {
position: 0,
message: "Direct page object without reference".to_string(),
});
}
}
_ => {
return Err(ParseError::SyntaxError {
position: 0,
message: format!("Invalid page tree node type: {node_type}"),
});
}
}
}
tracing::debug!(
"Warning: Page {} not found in tree, attempting direct lookup",
target_index
);
for obj_num in 1..500 {
if let Ok(obj) = self.reader.borrow_mut().get_object(obj_num, 0) {
if let Some(dict) = obj.as_dict() {
if let Some(obj_type) = dict.get("Type").and_then(|t| t.as_name()) {
if obj_type.0 == "Page" {
return self.create_parsed_page((obj_num, 0), dict, None);
}
}
}
}
}
Err(ParseError::SyntaxError {
position: 0,
message: format!("Page {} not found in tree or document", target_index),
})
}
/// Builds a `ParsedPage` for `obj_ref` from its dictionary plus any
/// attributes inherited from ancestor nodes.
///
/// * `MediaBox` falls back to US Letter (`[0, 0, 612, 792]`) when neither
///   the page nor `inherited` provides one.
/// * `CropBox` is optional; `Rotate` defaults to 0.
/// * `inherited_resources` is taken from `inherited["Resources"]` only when
///   that value is a dictionary stored directly.
/// * `annotations` is the page's `/Annots` value when it is an array stored
///   directly in the page dictionary.
///
/// # Errors
/// Propagates errors from reading the rectangle or integer entries.
fn create_parsed_page(
    &self,
    obj_ref: (u32, u16),
    page_dict: &PdfDictionary,
    inherited: Option<&PdfDictionary>,
) -> ParseResult<ParsedPage> {
    let media_box = self
        .get_rectangle(page_dict, inherited, "MediaBox")?
        .unwrap_or_else(|| {
            #[cfg(debug_assertions)]
            tracing::debug!(
                "Warning: Page {} {} R missing MediaBox, using default Letter size",
                obj_ref.0,
                obj_ref.1
            );
            [0.0, 0.0, 612.0, 792.0]
        });
    let crop_box = self.get_rectangle(page_dict, inherited, "CropBox")?;
    let rotation = match self.get_integer(page_dict, inherited, "Rotate")? {
        Some(degrees) => degrees as i32,
        None => 0,
    };

    let inherited_resources = inherited
        .and_then(|dict| dict.get("Resources"))
        .and_then(|resources| resources.as_dict())
        .cloned();

    let annots_entry = page_dict.get("Annots");
    let annotations = annots_entry.and_then(|obj| obj.as_array()).cloned();

    let dict = page_dict.clone();
    Ok(ParsedPage {
        obj_ref,
        dict,
        inherited_resources,
        media_box,
        crop_box,
        rotation,
        annotations,
    })
}
/// Reads `key` as a four-number rectangle, looking at `node` first and at
/// `inherited` only when `node` has no such key.
///
/// Returns `Ok(None)` when the key is absent or its value is not an array.
/// Array elements that are not numeric are read as `0.0`.
///
/// # Errors
/// Returns `ParseError::SyntaxError` when the array does not have exactly
/// four elements.
fn get_rectangle(
    &self,
    node: &PdfDictionary,
    inherited: Option<&PdfDictionary>,
    key: &str,
) -> ParseResult<Option<[f64; 4]>> {
    let value = match node.get(key) {
        Some(own) => Some(own),
        None => inherited.and_then(|dict| dict.get(key)),
    };
    let array = match value.and_then(|obj| obj.as_array()) {
        Some(array) => array,
        None => return Ok(None),
    };
    if array.len() != 4 {
        return Err(ParseError::SyntaxError {
            position: 0,
            message: format!("{key} must have 4 elements"),
        });
    }

    let component = |i: usize| array.0[i].as_real().unwrap_or(0.0);
    Ok(Some([component(0), component(1), component(2), component(3)]))
}
/// Reads `key` as an integer, looking at `node` first and at `inherited`
/// only when `node` has no such key.
///
/// Returns `Ok(None)` when the key is absent or the value is not an integer;
/// this function never returns `Err`.
fn get_integer(
    &self,
    node: &PdfDictionary,
    inherited: Option<&PdfDictionary>,
    key: &str,
) -> ParseResult<Option<i64>> {
    let value = match node.get(key) {
        Some(own) => Some(own),
        None => inherited.and_then(|dict| dict.get(key)),
    };
    let number = value.and_then(|obj| obj.as_integer());
    Ok(number)
}
/// Returns the object `obj_num gen_num R`.
///
/// The document's object cache is consulted first; on a miss the object is
/// loaded through the reader, cloned into the cache and returned.
///
/// # Errors
/// Propagates the reader's error when the object cannot be loaded; nothing
/// is cached in that case.
pub fn get_object(&self, obj_num: u32, gen_num: u16) -> ParseResult<PdfObject> {
    let key = (obj_num, gen_num);
    match self.resources.get_cached(key) {
        Some(hit) => Ok(hit),
        None => {
            // The reader borrow ends with this statement.
            let loaded = self.reader.borrow_mut().get_object(obj_num, gen_num)?.clone();
            self.resources.cache_object(key, loaded.clone());
            Ok(loaded)
        }
    }
}
/// Resolves one level of indirection.
///
/// A `PdfObject::Reference` is looked up with [`Self::get_object`]; any
/// other object is returned as a clone of itself. The looked-up object is
/// returned as-is, even if it is itself a reference.
pub fn resolve(&self, obj: &PdfObject) -> ParseResult<PdfObject> {
    if let PdfObject::Reference(obj_num, gen_num) = obj {
        return self.get_object(*obj_num, *gen_num);
    }
    Ok(obj.clone())
}
/// Returns whatever `page.get_resources()` reports for `page`.
///
/// This function itself never fails; the result is always `Ok`.
pub fn get_page_resources<'a>(
&self,
page: &'a ParsedPage,
) -> ParseResult<Option<&'a PdfDictionary>> {
Ok(page.get_resources())
}
/// Returns the decoded content streams of `page`, in document order.
///
/// `/Contents` may be a single stream or an array; both may be given
/// indirectly. Array items that do not resolve to a stream are skipped.
/// A page without `/Contents` yields an empty vector.
///
/// # Errors
/// Returns `ParseError::SyntaxError` when `/Contents` resolves to something
/// that is neither a stream nor an array, and propagates errors from
/// resolving references or decoding streams.
pub fn get_page_content_streams(&self, page: &ParsedPage) -> ParseResult<Vec<Vec<u8>>> {
    let options = self.options();
    let mut streams = Vec::new();

    let contents = match page.dict.get("Contents") {
        Some(contents) => contents,
        None => return Ok(streams),
    };

    match self.resolve(contents)? {
        PdfObject::Stream(stream) => streams.push(stream.decode(&options)?),
        PdfObject::Array(array) => {
            for item in &array.0 {
                if let PdfObject::Stream(stream) = self.resolve(item)? {
                    streams.push(stream.decode(&options)?);
                }
            }
        }
        _ => {
            return Err(ParseError::SyntaxError {
                position: 0,
                message: "Contents must be a stream or array of streams".to_string(),
            })
        }
    }

    Ok(streams)
}
/// Extracts text from the whole document with a `TextExtractor` created by
/// `TextExtractor::new()`.
pub fn extract_text(&self) -> ParseResult<Vec<crate::text::ExtractedText>> {
    crate::text::TextExtractor::new().extract_from_document(self)
}
/// Extracts text from the page at `page_index` with a `TextExtractor`
/// created by `TextExtractor::new()`.
pub fn extract_text_from_page(
    &self,
    page_index: u32,
) -> ParseResult<crate::text::ExtractedText> {
    crate::text::TextExtractor::new().extract_from_page(self, page_index)
}
/// Extracts text from the page at `page_index` with a `TextExtractor`
/// configured by `options`.
pub fn extract_text_from_page_with_options(
    &self,
    page_index: u32,
    options: crate::text::ExtractionOptions,
) -> ParseResult<crate::text::ExtractedText> {
    crate::text::TextExtractor::with_options(options).extract_from_page(self, page_index)
}
/// Extracts text from the whole document with a `TextExtractor` configured
/// by `options`.
pub fn extract_text_with_options(
    &self,
    options: crate::text::ExtractionOptions,
) -> ParseResult<Vec<crate::text::ExtractedText>> {
    crate::text::TextExtractor::with_options(options).extract_from_document(self)
}
/// Returns the annotation dictionaries of the page at `page_index`.
///
/// Each entry of the page's annotation array is handled as follows:
/// * an indirect reference is loaded through [`Self::get_object`] (so the
///   document's object cache is used and the reader is only borrowed for
///   the duration of each lookup);
/// * a dictionary stored directly in the array is taken as-is.
///
/// Entries that cannot be loaded or that are not dictionaries are skipped.
/// A page without an annotation array yields an empty vector.
///
/// # Errors
/// Propagates the error from [`Self::get_page`].
pub fn get_page_annotations(&self, page_index: u32) -> ParseResult<Vec<PdfDictionary>> {
    let page = self.get_page(page_index)?;
    let annots_array = match page.get_annotations() {
        Some(array) => array,
        None => return Ok(Vec::new()),
    };

    let mut annotations = Vec::new();
    for entry in &annots_array.0 {
        if let Some((obj_num, gen_num)) = entry.as_reference() {
            // Best effort: a dangling reference must not fail the whole page.
            match self.get_object(obj_num, gen_num) {
                Ok(obj) => {
                    if let Some(dict) = obj.as_dict() {
                        annotations.push(dict.clone());
                    }
                }
                Err(_) => continue,
            }
        } else if let Some(dict) = entry.as_dict() {
            annotations.push(dict.clone());
        }
    }
    Ok(annotations)
}
/// Collects the annotations of every page, as `(page index, annotations)`
/// pairs in page order. Pages without annotations are omitted.
///
/// # Errors
/// Propagates the first error from counting pages or reading a page's
/// annotations.
pub fn get_all_annotations(&self) -> ParseResult<Vec<(u32, Vec<PdfDictionary>)>> {
    let total_pages = self.page_count()?;
    let mut per_page = Vec::new();
    for page_index in 0..total_pages {
        let annotations = self.get_page_annotations(page_index)?;
        if annotations.is_empty() {
            continue;
        }
        per_page.push((page_index, annotations));
    }
    Ok(per_page)
}
/// Exports the document to Markdown by delegating to
/// `crate::ai::export_to_markdown`.
// NOTE(review): the `allow(deprecated)` suggests the callee is deprecated — confirm.
#[allow(deprecated)]
pub fn to_markdown(&self) -> crate::error::Result<String> {
crate::ai::export_to_markdown(self)
}
/// Partitions the document with the default configuration and renders the
/// resulting elements with a default `ElementMarkdownExporter`.
///
/// # Errors
/// Propagates the error from [`Self::partition`].
pub fn to_element_markdown(&self) -> ParseResult<String> {
    self.partition().map(|elements| {
        crate::pipeline::export::ElementMarkdownExporter::default().export(&elements)
    })
}
/// Exports the document by delegating to `crate::ai::export_to_contextual`.
// NOTE(review): the `allow(deprecated)` suggests the callee is deprecated — confirm.
#[allow(deprecated)]
pub fn to_contextual(&self) -> crate::error::Result<String> {
crate::ai::export_to_contextual(self)
}
/// Exports the document as a JSON string by delegating to
/// `crate::ai::export_to_json`. Only compiled with the `semantic` feature.
// NOTE(review): the `allow(deprecated)` suggests the callee is deprecated — confirm.
#[cfg(feature = "semantic")]
#[allow(deprecated)]
pub fn to_json(&self) -> crate::error::Result<String> {
crate::ai::export_to_json(self)
}
/// Produces RAG chunks using `HybridChunkConfig::default()`.
///
/// Equivalent to calling `rag_chunks_with` with the default configuration.
pub fn rag_chunks(&self) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
self.rag_chunks_with(crate::pipeline::HybridChunkConfig::default())
}
/// Partitions the document with the default partition settings, chunks the
/// elements with a `HybridChunker` built from `config`, and converts each
/// hybrid chunk into a `RagChunk` numbered by its position.
///
/// # Errors
/// Propagates the error from [`Self::partition`].
pub fn rag_chunks_with(
    &self,
    config: crate::pipeline::HybridChunkConfig,
) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
    let elements = self.partition()?;
    let chunker = crate::pipeline::HybridChunker::new(config);
    let hybrid_chunks = chunker.chunk(&elements);

    let mut rag_chunks = Vec::new();
    for (position, hybrid) in hybrid_chunks.iter().enumerate() {
        rag_chunks.push(crate::pipeline::RagChunk::from_hybrid_chunk(position, hybrid));
    }
    Ok(rag_chunks)
}
/// Partitions the document according to `profile`, chunks the elements with
/// a default `HybridChunker`, and converts each hybrid chunk into a
/// `RagChunk` numbered by its position.
///
/// # Errors
/// Propagates the error from [`Self::partition_with_profile`].
pub fn rag_chunks_with_profile(
    &self,
    profile: crate::pipeline::ExtractionProfile,
) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
    let elements = self.partition_with_profile(profile)?;
    let chunker = crate::pipeline::HybridChunker::default();
    let hybrid_chunks = chunker.chunk(&elements);

    let mut rag_chunks = Vec::new();
    for (position, hybrid) in hybrid_chunks.iter().enumerate() {
        rag_chunks.push(crate::pipeline::RagChunk::from_hybrid_chunk(position, hybrid));
    }
    Ok(rag_chunks)
}
/// Partitions the document according to `profile`, chunks the elements with
/// a `HybridChunker` built from `config`, and converts each hybrid chunk
/// into a `RagChunk` numbered by its position.
///
/// # Errors
/// Propagates the error from [`Self::partition_with_profile`].
pub fn rag_chunks_with_profile_config(
    &self,
    profile: crate::pipeline::ExtractionProfile,
    config: crate::pipeline::HybridChunkConfig,
) -> ParseResult<Vec<crate::pipeline::RagChunk>> {
    let elements = self.partition_with_profile(profile)?;
    let chunker = crate::pipeline::HybridChunker::new(config);
    let hybrid_chunks = chunker.chunk(&elements);

    let mut rag_chunks = Vec::new();
    for (position, hybrid) in hybrid_chunks.iter().enumerate() {
        rag_chunks.push(crate::pipeline::RagChunk::from_hybrid_chunk(position, hybrid));
    }
    Ok(rag_chunks)
}
/// Serializes the result of [`Self::rag_chunks`] to a JSON string.
/// Only compiled with the `semantic` feature.
///
/// # Errors
/// Propagates chunking errors; a serialization failure is reported as
/// `ParseError::SerializationError` carrying the serializer's message.
#[cfg(feature = "semantic")]
pub fn rag_chunks_json(&self) -> ParseResult<String> {
    let chunks = self.rag_chunks()?;
    match serde_json::to_string(&chunks) {
        Ok(json) => Ok(json),
        Err(e) => Err(ParseError::SerializationError(e.to_string())),
    }
}
/// Splits the document text into chunks of roughly `target_tokens` tokens,
/// using an overlap of one tenth of `target_tokens` (integer division).
#[deprecated(
    since = "2.2.0",
    note = "Use rag_chunks() for structure-aware RAG chunking"
)]
#[allow(deprecated)]
pub fn chunk(
    &self,
    target_tokens: usize,
) -> crate::error::Result<Vec<crate::ai::DocumentChunk>> {
    self.chunk_with(target_tokens, target_tokens / 10)
}
#[deprecated(
since = "2.2.0",
note = "Use rag_chunks_with() for structure-aware RAG chunking"
)]
pub fn chunk_with(
&self,
target_tokens: usize,
overlap: usize,
) -> crate::error::Result<Vec<crate::ai::DocumentChunk>> {
let chunker = crate::ai::DocumentChunker::new(target_tokens, overlap);
let extracted = self.extract_text()?;
let page_texts: Vec<(usize, String)> = extracted
.iter()
.enumerate()
.map(|(i, t)| (i + 1, t.text.clone()))
.collect();
chunker
.chunk_text_with_pages(&page_texts)
.map_err(|e| crate::error::PdfError::InvalidStructure(e.to_string()))
}
/// Partitions the document into elements using `PartitionConfig::default()`.
///
/// Equivalent to calling `partition_with` with the default configuration.
pub fn partition(&self) -> ParseResult<Vec<crate::pipeline::Element>> {
self.partition_with(crate::pipeline::PartitionConfig::default())
}
/// Partitions the document into elements using `config`.
///
/// Text is extracted with default extraction options except that
/// `preserve_layout` is switched on.
pub fn partition_with(
    &self,
    config: crate::pipeline::PartitionConfig,
) -> ParseResult<Vec<crate::pipeline::Element>> {
    let mut options = crate::text::ExtractionOptions::default();
    options.preserve_layout = true;
    self.do_partition_pages(options, config)
}
/// Partitions the document into elements using the settings of `profile`.
///
/// Extraction uses default options with `preserve_layout` switched on and
/// `space_threshold` / `detect_columns` taken from the profile; the
/// profile's partition configuration drives the partitioner.
pub fn partition_with_profile(
    &self,
    profile: crate::pipeline::ExtractionProfile,
) -> ParseResult<Vec<crate::pipeline::Element>> {
    let profile_cfg = profile.config();

    let mut options = crate::text::ExtractionOptions::default();
    options.preserve_layout = true;
    options.space_threshold = profile_cfg.extraction.space_threshold;
    options.detect_columns = profile_cfg.extraction.detect_columns;

    self.do_partition_pages(options, profile_cfg.partition)
}
/// Shared implementation of the `partition*` methods.
///
/// Extracts text from every page with `options`, then runs a `Partitioner`
/// built from `config` over each page's fragments and concatenates the
/// resulting elements in page order.
///
/// The page height passed to the partitioner comes from `get_page`; when
/// the page cannot be loaded, 842.0 is used instead.
///
/// # Errors
/// Propagates text-extraction errors and returns `ParseError::SyntaxError`
/// if a page index does not fit in `u32`.
fn do_partition_pages(
    &self,
    options: crate::text::ExtractionOptions,
    config: crate::pipeline::PartitionConfig,
) -> ParseResult<Vec<crate::pipeline::Element>> {
    let pages = self.extract_text_with_options(options)?;
    let partitioner = crate::pipeline::Partitioner::new(config);

    let mut all_elements = Vec::new();
    for (page_idx, page_text) in pages.iter().enumerate() {
        let page_idx_u32 = match u32::try_from(page_idx) {
            Ok(idx) => idx,
            Err(_) => {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!("Page index {} exceeds u32 range", page_idx),
                })
            }
        };
        let page_height = match self.get_page(page_idx_u32) {
            Ok(page) => page.height(),
            Err(_) => 842.0,
        };
        all_elements.extend(partitioner.partition_fragments(
            &page_text.fragments,
            page_idx_u32,
            page_height,
        ));
    }
    Ok(all_elements)
}
/// Partitions the document with `config` and builds an `ElementGraph` over
/// the resulting elements, returning both.
///
/// # Errors
/// Propagates the error from [`Self::partition_with`].
pub fn partition_graph(
    &self,
    config: crate::pipeline::PartitionConfig,
) -> ParseResult<(Vec<crate::pipeline::Element>, crate::pipeline::ElementGraph)> {
    self.partition_with(config).map(|elements| {
        let graph = crate::pipeline::ElementGraph::build(&elements);
        (elements, graph)
    })
}
}
impl PdfDocument<File> {
/// Opens the PDF file at `path` by delegating to `PdfReader::open_document`.
pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
PdfReader::open_document(path)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::parser::objects::{PdfObject, PdfString};
use std::io::Cursor;
/// Builds a minimal PDF 1.4 file in memory: a catalog, a page tree with one
/// kid, and a single Letter-sized page with an empty resource dictionary.
///
/// The xref offsets (9, 58, 115) are hard-coded and must be kept in sync
/// with the object bodies above them.
fn create_minimal_pdf() -> Vec<u8> {
    const OBJECTS: [&[u8]; 3] = [
        b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
        b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
        b"3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> >>\nendobj\n",
    ];

    let mut pdf = b"%PDF-1.4\n".to_vec();
    for object in OBJECTS {
        pdf.extend_from_slice(object);
    }

    // The cross-reference table starts right after the last object.
    let xref_pos = pdf.len();
    pdf.extend_from_slice(b"xref\n0 4\n");
    pdf.extend_from_slice(b"0000000000 65535 f \n");
    pdf.extend_from_slice(b"0000000009 00000 n \n");
    pdf.extend_from_slice(b"0000000058 00000 n \n");
    pdf.extend_from_slice(b"0000000115 00000 n \n");
    pdf.extend_from_slice(b"trailer\n<< /Size 4 /Root 1 0 R >>\nstartxref\n");
    pdf.extend_from_slice(format!("{xref_pos}\n%%EOF\n").as_bytes());
    pdf
}
/// Builds a PDF 1.5 file in memory with a catalog, an empty page tree and an
/// `/Info` dictionary (title, author, subject) referenced from the trailer.
///
/// Object offsets for the xref table are measured while the file is built.
fn create_pdf_with_metadata() -> Vec<u8> {
    let objects: [&[u8]; 3] = [
        b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
        b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n",
        b"3 0 obj\n<< /Title (Test Document) /Author (Test Author) /Subject (Test Subject) >>\nendobj\n",
    ];

    let mut pdf = b"%PDF-1.5\n".to_vec();
    let mut offsets = Vec::new();
    for object in objects {
        offsets.push(pdf.len());
        pdf.extend_from_slice(object);
    }

    let xref_pos = pdf.len();
    pdf.extend_from_slice(b"xref\n0 4\n0000000000 65535 f \n");
    for offset in offsets {
        pdf.extend_from_slice(format!("{offset:010} 00000 n \n").as_bytes());
    }
    pdf.extend_from_slice(b"trailer\n<< /Size 4 /Root 1 0 R /Info 3 0 R >>\nstartxref\n");
    pdf.extend_from_slice(format!("{xref_pos}\n%%EOF\n").as_bytes());
    pdf
}
/// A freshly constructed document has neither a page tree nor cached metadata.
#[test]
fn test_pdf_document_new() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let page_tree_built = document.page_tree.borrow().is_some();
    let metadata_cached = document.metadata_cache.borrow().is_some();
    assert!(!page_tree_built);
    assert!(!metadata_cached);
}
/// The version string comes from the `%PDF-1.4` header of the fixture.
#[test]
fn test_version() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    assert_eq!(document.version().unwrap(), "1.4");
}
/// The minimal fixture contains exactly one page.
#[test]
fn test_page_count() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    assert_eq!(document.page_count().unwrap(), 1);
}
/// Title, author and subject are read from the `/Info` dictionary, and a
/// second call returns the same title.
#[test]
fn test_metadata() {
    let reader = PdfReader::new(Cursor::new(create_pdf_with_metadata())).unwrap();
    let document = PdfDocument::new(reader);

    let first = document.metadata().unwrap();
    assert_eq!(first.title, Some(String::from("Test Document")));
    assert_eq!(first.author, Some(String::from("Test Author")));
    assert_eq!(first.subject, Some(String::from("Test Subject")));

    let second = document.metadata().unwrap();
    assert_eq!(first.title, second.title);
}
/// Page 0 has the fixture's Letter-sized media box, also on a repeated call.
#[test]
fn test_get_page() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let first = document.get_page(0).unwrap();
    assert_eq!(first.media_box, [0.0, 0.0, 612.0, 792.0]);

    let again = document.get_page(0).unwrap();
    assert_eq!(first.media_box, again.media_box);
}
/// Requesting a page far past the end must not panic; if it fails, the
/// error message mentions "Page". A successful result is tolerated.
#[test]
fn test_get_page_out_of_bounds() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    match document.get_page(10) {
        Err(err) => assert!(err.to_string().contains("Page")),
        Ok(_page) => {}
    }
}
/// An object is absent before caching, present after, and gone after a clear.
#[test]
fn test_resource_manager_caching() {
    let manager = ResourceManager::new();
    let key = (1, 0);
    let value = PdfObject::String(PdfString("Test".as_bytes().to_vec()));

    assert!(manager.get_cached(key).is_none());

    manager.cache_object(key, value.clone());
    let fetched = manager.get_cached(key).unwrap();
    assert_eq!(fetched, value);

    manager.clear_cache();
    assert!(manager.get_cached(key).is_none());
}
/// Object `1 0` of the fixture is the catalog dictionary.
#[test]
fn test_get_object() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let dict = match document.get_object(1, 0).unwrap() {
        PdfObject::Dictionary(dict) => dict,
        _ => panic!("Expected dictionary object"),
    };
    match dict.get("Type") {
        Some(PdfObject::Name(name)) => assert_eq!(name.0, "Catalog"),
        _ => panic!("Expected /Type name"),
    }
}
/// Resolving the reference `1 0 R` yields the catalog dictionary.
#[test]
fn test_resolve_reference() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let reference = PdfObject::Reference(1, 0);
    let dict = match document.resolve(&reference).unwrap() {
        PdfObject::Dictionary(dict) => dict,
        _ => panic!("Expected dictionary object"),
    };
    match dict.get("Type") {
        Some(PdfObject::Name(name)) => assert_eq!(name.0, "Catalog"),
        _ => panic!("Expected /Type name"),
    }
}
/// Resolving an object that is not a reference returns an equal object.
#[test]
fn test_resolve_non_reference() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let direct = PdfObject::String(PdfString("Test".as_bytes().to_vec()));
    assert_eq!(document.resolve(&direct).unwrap(), direct);
}
/// Bytes that are not a PDF are rejected when the reader is constructed.
#[test]
fn test_invalid_pdf_data() {
    let not_a_pdf = b"This is not a PDF".to_vec();
    assert!(PdfReader::new(Cursor::new(not_a_pdf)).is_err());
}
/// A document whose page tree has no kids reports zero pages and fails to
/// return page 0.
#[test]
fn test_empty_page_tree() {
    let reader = PdfReader::new(Cursor::new(create_pdf_with_metadata())).unwrap();
    let document = PdfDocument::new(reader);

    assert_eq!(document.page_count().unwrap(), 0);
    assert!(document.get_page(0).is_err());
}
/// Extracting text from a document without pages yields no entries.
#[test]
fn test_extract_text_empty_document() {
    let reader = PdfReader::new(Cursor::new(create_pdf_with_metadata())).unwrap();
    let document = PdfDocument::new(reader);

    let pages = document.extract_text().unwrap();
    assert!(pages.is_empty());
}
/// Several accessors can be used back to back on the same document.
/// Despite the name, the calls run sequentially on a single thread.
#[test]
fn test_concurrent_access() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let version = document.version().unwrap();
    let page_total = document.page_count().unwrap();
    let first_page = document.get_page(0).unwrap();

    assert_eq!(version, "1.4");
    assert_eq!(page_total, 1);
    assert_eq!(first_page.media_box[2], 612.0);
}
mod comprehensive_tests {
use super::*;
/// A default-constructed manager starts with nothing cached.
#[test]
fn test_resource_manager_default() {
    let manager = ResourceManager::default();
    let lookup = manager.get_cached((1, 0));
    assert!(lookup.is_none());
}
/// Several objects can be cached side by side and are all dropped by a clear.
#[test]
fn test_resource_manager_multiple_objects() {
    let manager = ResourceManager::new();
    manager.cache_object((1, 0), PdfObject::Integer(42));
    manager.cache_object((2, 0), PdfObject::Boolean(true));
    manager.cache_object(
        (3, 0),
        PdfObject::String(PdfString("test".as_bytes().to_vec())),
    );

    for key in [(1, 0), (2, 0), (3, 0)] {
        assert!(manager.get_cached(key).is_some());
    }

    manager.clear_cache();

    for key in [(1, 0), (2, 0), (3, 0)] {
        assert!(manager.get_cached(key).is_none());
    }
}
/// Caching under an existing key replaces the previous object.
#[test]
fn test_resource_manager_object_overwrite() {
    let manager = ResourceManager::new();
    let key = (1, 0);

    manager.cache_object(key, PdfObject::Integer(42));
    assert_eq!(manager.get_cached(key), Some(PdfObject::Integer(42)));

    manager.cache_object(key, PdfObject::Boolean(true));
    assert_eq!(manager.get_cached(key), Some(PdfObject::Boolean(true)));
}
/// Two lookups of the same object agree, and the object ends up in the cache.
#[test]
fn test_get_object_caching() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let first = document.get_object(1, 0).unwrap();
    let second = document.get_object(1, 0).unwrap();
    assert_eq!(first, second);

    let cached = document.resources.get_cached((1, 0));
    assert!(cached.is_some());
}
/// Asking for generation 1 of an object that only exists as generation 0
/// fails and leaves the generation-0 cache entry in place.
#[test]
fn test_get_object_different_generations() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let _gen_zero = document.get_object(1, 0).unwrap();
    assert!(document.get_object(1, 1).is_err());

    let cached = document.resources.get_cached((1, 0));
    assert!(cached.is_some());
}
/// Looking up an object number that the file does not define is an error.
#[test]
fn test_get_object_nonexistent() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    assert!(document.get_object(999, 0).is_err());
}
/// Resolving `2 0 R` yields the `/Pages` dictionary of the fixture.
///
/// The test fails loudly when the resolved object is not a dictionary or has
/// no `/Type` name, instead of silently passing without asserting anything.
#[test]
fn test_resolve_nested_references() {
    let pdf_data = create_minimal_pdf();
    let cursor = Cursor::new(pdf_data);
    let reader = PdfReader::new(cursor).unwrap();
    let document = PdfDocument::new(reader);
    let ref_obj = PdfObject::Reference(2, 0);
    let resolved = document.resolve(&ref_obj).unwrap();
    if let PdfObject::Dictionary(dict) = resolved {
        if let Some(PdfObject::Name(name)) = dict.get("Type") {
            assert_eq!(name.0, "Pages");
        } else {
            panic!("Expected /Type name");
        }
    } else {
        panic!("Expected dictionary object");
    }
}
/// Direct objects of several kinds come back unchanged from `resolve`.
#[test]
fn test_resolve_various_object_types() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let samples = [
        PdfObject::Integer(42),
        PdfObject::Boolean(true),
        PdfObject::String(PdfString("test".as_bytes().to_vec())),
        PdfObject::Real(3.14),
        PdfObject::Null,
    ];
    for sample in samples {
        let resolved = document.resolve(&sample).unwrap();
        assert_eq!(resolved, sample);
    }
}
/// A repeated `get_page` call returns a page with the same media box,
/// rotation and object reference.
#[test]
fn test_get_page_cached() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let first = document.get_page(0).unwrap();
    let second = document.get_page(0).unwrap();

    assert_eq!(first.media_box, second.media_box);
    assert_eq!(first.rotation, second.rotation);
    assert_eq!(first.obj_ref, second.obj_ref);
}
/// Two `metadata()` calls agree on title, author, subject and version.
#[test]
fn test_metadata_caching() {
    let reader = PdfReader::new(Cursor::new(create_pdf_with_metadata())).unwrap();
    let document = PdfDocument::new(reader);

    let first = document.metadata().unwrap();
    let second = document.metadata().unwrap();

    assert_eq!(first.title, second.title);
    assert_eq!(first.author, second.author);
    assert_eq!(first.subject, second.subject);
    assert_eq!(first.version, second.version);
}
/// The page tree is unset right after construction; counting pages and
/// fetching page 0 afterwards both succeed.
#[test]
fn test_page_tree_initialization() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let built_up_front = document.page_tree.borrow().is_some();
    assert!(!built_up_front);

    let _count = document.page_count().unwrap();
    let _page = document.get_page(0).unwrap();
}
/// The fixture page declares `/Resources`, so a dictionary is reported.
#[test]
fn test_get_page_resources() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let page = document.get_page(0).unwrap();
    let resources = document.get_page_resources(&page).unwrap();
    assert!(resources.is_some());
}
/// The fixture page has no `/Contents`, so no streams are returned.
#[test]
fn test_get_page_content_streams_empty() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let page = document.get_page(0).unwrap();
    let streams = document.get_page_content_streams(&page).unwrap();
    assert!(streams.is_empty());
}
/// Text extraction succeeds on the (content-free) first page.
#[test]
fn test_extract_text_from_page() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let outcome = document.extract_text_from_page(0);
    assert!(outcome.is_ok());
}
/// Requests text from page index 999 of the minimal fixture.
///
/// Both outcomes are tolerated: the call may succeed, or it may fail. When
/// it fails, the error's display text must mention "Page".
#[test]
fn test_extract_text_from_page_out_of_bounds() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    // Pattern-match on the error instead of `is_err()` + `unwrap_err()`;
    // a successful result needs no further checks.
    if let Err(err) = document.extract_text_from_page(999) {
        assert!(err.to_string().contains("Page"));
    }
}
/// Whole-document text extraction with layout preservation enabled and
/// custom spacing thresholds must succeed on the minimal fixture.
#[test]
fn test_extract_text_with_options() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    // Override three fields; every other option keeps its default value.
    let layout_options = crate::text::ExtractionOptions {
        preserve_layout: true,
        space_threshold: 0.5,
        newline_threshold: 15.0,
        ..Default::default()
    };

    assert!(document.extract_text_with_options(layout_options).is_ok());
}
/// Builds a zero-page PDF for each header version from 1.3 through 1.7 and
/// checks that `PdfDocument::version` reports the version written in the
/// header line.
#[test]
fn test_version_different_pdf_versions() {
    for version in ["1.3", "1.4", "1.5", "1.6", "1.7"] {
        // Header line carrying the version under test.
        let mut pdf_data = format!("%PDF-{version}\n").into_bytes();

        // Object 1 (catalog); its byte offset is needed for the xref table.
        let catalog_offset = pdf_data.len();
        pdf_data.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");

        // Object 2 (page tree root with no kids).
        let pages_offset = pdf_data.len();
        pdf_data.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");

        // Cross-reference table, trailer and end-of-file marker. Offsets
        // in the xref entries are zero-padded to ten digits.
        let xref_offset = pdf_data.len();
        let tail = format!(
            "xref\n\
             0 3\n\
             0000000000 65535 f \n\
             {catalog_offset:010} 00000 n \n\
             {pages_offset:010} 00000 n \n\
             trailer\n\
             << /Size 3 /Root 1 0 R >>\n\
             startxref\n\
             {xref_offset}\n\
             %%EOF\n"
        );
        pdf_data.extend_from_slice(tail.as_bytes());

        let reader = PdfReader::new(Cursor::new(pdf_data)).unwrap();
        let document = PdfDocument::new(reader);

        assert_eq!(document.version().unwrap(), version);
    }
}
/// The metadata fixture is expected to contain no pages, so `page_count`
/// must report zero.
#[test]
fn test_page_count_zero() {
    let reader = PdfReader::new(Cursor::new(create_pdf_with_metadata())).unwrap();
    let document = PdfDocument::new(reader);

    let total_pages = document.page_count().unwrap();
    assert_eq!(total_pages, 0);
}
/// Fetches objects 1, 2 and 3 (generation 0) from the minimal fixture and
/// checks that the three objects are pairwise different.
#[test]
fn test_multiple_object_access() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let object_one = document.get_object(1, 0).unwrap();
    let object_two = document.get_object(2, 0).unwrap();
    let object_three = document.get_object(3, 0).unwrap();

    // No two of the fetched objects may compare equal.
    assert_ne!(object_one, object_two);
    assert_ne!(object_two, object_three);
    assert_ne!(object_one, object_three);
}
/// Resolving a reference to object 999 (generation 0) against the minimal
/// fixture must return an error.
#[test]
fn test_error_handling_invalid_object_reference() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let dangling = PdfObject::Reference(999, 0);
    assert!(document.resolve(&dangling).is_err());
}
/// Calls `metadata`, `version` and `page_count` back to back on the same
/// document and checks each result against the metadata fixture.
///
/// Despite the name, the calls are made sequentially on one thread.
#[test]
fn test_concurrent_metadata_access() {
    let reader = PdfReader::new(Cursor::new(create_pdf_with_metadata())).unwrap();
    let document = PdfDocument::new(reader);

    // Gather all three values first, then verify them.
    let info = document.metadata().unwrap();
    let header_version = document.version().unwrap();
    let total_pages = document.page_count().unwrap();

    assert_eq!(info.title, Some("Test Document".to_string()));
    assert_eq!(header_version, "1.5");
    assert_eq!(total_pages, 0);
}
/// Checks every property this test knows about on the first page of the
/// minimal fixture: media box, crop box, rotation, object reference and the
/// derived width and height.
#[test]
fn test_page_properties_comprehensive() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    let first_page = document.get_page(0).unwrap();

    // Fields read directly from the parsed page.
    assert_eq!(first_page.media_box, [0.0, 0.0, 612.0, 792.0]);
    assert_eq!(first_page.crop_box, None);
    assert_eq!(first_page.rotation, 0);
    assert_eq!(first_page.obj_ref, (3, 0));

    // Dimensions reported by the accessor methods.
    assert_eq!(first_page.width(), 612.0);
    assert_eq!(first_page.height(), 792.0);
}
/// Fetches page 0 ten times in a row and then checks that the document still
/// reports exactly one page.
///
/// No memory measurement is performed; only repeated access is exercised.
#[test]
fn test_memory_usage_efficiency() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    for _attempt in 0..10 {
        // Each fetch must succeed; the page itself is discarded.
        let _fetched = document.get_page(0).unwrap();
    }

    assert_eq!(document.page_count().unwrap(), 1);
}
/// Calls `version`, `page_count` and `metadata` one after another on the same
/// document; each call borrows the shared reader and must not panic.
/// The results are checked against the minimal fixture.
#[test]
fn test_reader_borrow_safety() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    // Gather all three values first, then verify them.
    let header_version = document.version().unwrap();
    let total_pages = document.page_count().unwrap();
    let info = document.metadata().unwrap();

    assert_eq!(header_version, "1.4");
    assert_eq!(total_pages, 1);
    assert!(info.title.is_none());
}
/// Checks that an object fetched through the document matches its cached
/// copy, and that fetching it again after the cache has been cleared yields
/// an equal object.
#[test]
fn test_cache_consistency() {
    let reader = PdfReader::new(Cursor::new(create_minimal_pdf())).unwrap();
    let document = PdfDocument::new(reader);

    // Fetching object (1, 0) must leave an equal copy in the resource cache.
    let first_fetch = document.get_object(1, 0).unwrap();
    let cached_copy = document.resources.get_cached((1, 0)).unwrap();
    assert_eq!(first_fetch, cached_copy);

    // Verify the clear actually removed the entry; otherwise the re-fetch
    // below could pass by reading the stale cached copy.
    document.resources.clear_cache();
    assert!(document.resources.get_cached((1, 0)).is_none());

    let second_fetch = document.get_object(1, 0).unwrap();
    assert_eq!(first_fetch, second_fetch);
}
}
/// A newly created `ResourceManager` has nothing cached, so looking up any
/// key returns `None`.
#[test]
fn test_resource_manager_new() {
    let manager = ResourceManager::new();
    let lookup = manager.get_cached((1, 0));
    assert!(lookup.is_none());
}
/// An object stored under a key is returned by `get_cached` for that key,
/// while a key that was never stored yields `None`.
#[test]
fn test_resource_manager_cache_and_get() {
    let manager = ResourceManager::new();
    let stored = PdfObject::Integer(42);

    manager.cache_object((10, 0), stored.clone());

    // Hit: the cached value equals what was stored.
    assert_eq!(manager.get_cached((10, 0)), Some(stored));
    // Miss: a neighbouring object number was never cached.
    assert!(manager.get_cached((11, 0)).is_none());
}
/// After `clear_cache`, none of the previously cached entries can be looked
/// up any more.
#[test]
fn test_resource_manager_clear_cache() {
    let manager = ResourceManager::new();

    // (key, integer payload) pairs stored before the cache is cleared.
    let entries = [((1, 0), 1), ((2, 0), 2), ((3, 0), 3)];

    for (key, value) in entries {
        manager.cache_object(key, PdfObject::Integer(value));
    }
    for (key, _) in entries {
        assert!(manager.get_cached(key).is_some());
    }

    manager.clear_cache();

    for (key, _) in entries {
        assert!(manager.get_cached(key).is_none());
    }
}
/// Caching a second object under an existing key replaces the first one.
#[test]
fn test_resource_manager_overwrite_cached() {
    let manager = ResourceManager::new();
    let key = (1, 0);

    manager.cache_object(key, PdfObject::Integer(42));
    let before = manager.get_cached(key).unwrap();
    assert_eq!(before, PdfObject::Integer(42));

    // Same key, new value: the lookup must now return the new value.
    manager.cache_object(key, PdfObject::Integer(100));
    let after = manager.get_cached(key).unwrap();
    assert_eq!(after, PdfObject::Integer(100));
}
/// Entries that share an object number but differ in generation number are
/// cached independently of each other.
#[test]
fn test_resource_manager_multiple_generations() {
    let manager = ResourceManager::new();

    // (generation, integer payload) pairs, all stored under object number 1.
    let generations = [(0, 10), (1, 11), (2, 12)];

    for (generation, value) in generations {
        manager.cache_object((1, generation), PdfObject::Integer(value));
    }

    // Every generation must still map to its own payload.
    for (generation, value) in generations {
        assert_eq!(
            manager.get_cached((1, generation)).unwrap(),
            PdfObject::Integer(value)
        );
    }
}
/// Stores one object of each of six `PdfObject` variants (boolean, real,
/// string, name, dictionary, array) and reads every one back.
///
/// Scalar, string and name values are compared for equality; the dictionary
/// and the array are only checked for their variant.
#[test]
fn test_resource_manager_cache_complex_objects() {
    let manager = ResourceManager::new();

    // Composite values are built up front so the caching calls stay short.
    let mut dictionary = PdfDictionary::new();
    dictionary.insert(
        "Key".to_string(),
        PdfObject::String(PdfString::new(b"Value".to_vec())),
    );
    let elements = vec![PdfObject::Integer(1), PdfObject::Integer(2)];

    manager.cache_object((1, 0), PdfObject::Boolean(true));
    manager.cache_object((2, 0), PdfObject::Real(3.14159));
    manager.cache_object(
        (3, 0),
        PdfObject::String(PdfString::new(b"Hello PDF".to_vec())),
    );
    manager.cache_object((4, 0), PdfObject::Name(PdfName::new("Type".to_string())));
    manager.cache_object((5, 0), PdfObject::Dictionary(dictionary));
    manager.cache_object((6, 0), PdfObject::Array(PdfArray(elements)));

    // Values with a checked payload.
    let boolean = manager.get_cached((1, 0)).unwrap();
    assert_eq!(boolean, PdfObject::Boolean(true));

    let real = manager.get_cached((2, 0)).unwrap();
    assert_eq!(real, PdfObject::Real(3.14159));

    let string = manager.get_cached((3, 0)).unwrap();
    assert_eq!(
        string,
        PdfObject::String(PdfString::new(b"Hello PDF".to_vec()))
    );

    let name = manager.get_cached((4, 0)).unwrap();
    assert_eq!(name, PdfObject::Name(PdfName::new("Type".to_string())));

    // Containers: only the variant is verified.
    let cached_dictionary = manager.get_cached((5, 0)).unwrap();
    assert!(matches!(cached_dictionary, PdfObject::Dictionary(_)));

    let cached_array = manager.get_cached((6, 0)).unwrap();
    assert!(matches!(cached_array, PdfObject::Array(_)));
}
}