1use super::{ParseError, ParseResult};
7use super::reader::PdfReader;
8use super::page_tree::{PageTree, ParsedPage};
9use super::objects::{PdfObject, PdfDictionary};
10use std::io::{Read, Seek};
11use std::cell::RefCell;
12use std::rc::Rc;
13use std::collections::HashMap;
14
15pub struct ResourceManager {
17 object_cache: RefCell<HashMap<(u32, u16), PdfObject>>,
19}
20
21impl ResourceManager {
22 pub fn new() -> Self {
24 Self {
25 object_cache: RefCell::new(HashMap::new()),
26 }
27 }
28
29 pub fn get_cached(&self, obj_ref: (u32, u16)) -> Option<PdfObject> {
31 self.object_cache.borrow().get(&obj_ref).cloned()
32 }
33
34 pub fn cache_object(&self, obj_ref: (u32, u16), obj: PdfObject) {
36 self.object_cache.borrow_mut().insert(obj_ref, obj);
37 }
38
39 pub fn clear_cache(&self) {
41 self.object_cache.borrow_mut().clear();
42 }
43}
44
45pub struct PdfDocument<R: Read + Seek> {
47 reader: RefCell<PdfReader<R>>,
49 page_tree: RefCell<Option<PageTree>>,
51 resources: Rc<ResourceManager>,
53 metadata_cache: RefCell<Option<super::reader::DocumentMetadata>>,
55}
56
57impl<R: Read + Seek> PdfDocument<R> {
58 pub fn new(reader: PdfReader<R>) -> Self {
60 Self {
61 reader: RefCell::new(reader),
62 page_tree: RefCell::new(None),
63 resources: Rc::new(ResourceManager::new()),
64 metadata_cache: RefCell::new(None),
65 }
66 }
67
68 pub fn version(&self) -> ParseResult<String> {
70 Ok(self.reader.borrow().version().to_string())
71 }
72
73 pub fn page_count(&self) -> ParseResult<u32> {
75 self.reader.borrow_mut().page_count()
76 }
77
78 pub fn metadata(&self) -> ParseResult<super::reader::DocumentMetadata> {
80 if let Some(metadata) = self.metadata_cache.borrow().as_ref() {
82 return Ok(metadata.clone());
83 }
84
85 let metadata = self.reader.borrow_mut().metadata()?;
87 self.metadata_cache.borrow_mut().replace(metadata.clone());
88 Ok(metadata)
89 }
90
91 fn ensure_page_tree(&self) -> ParseResult<()> {
93 if self.page_tree.borrow().is_none() {
94 let page_count = self.page_count()?;
95 let pages_dict = self.load_pages_dict()?;
96 let page_tree = PageTree::new_with_pages_dict(page_count, pages_dict);
97 self.page_tree.borrow_mut().replace(page_tree);
98 }
99 Ok(())
100 }
101
102 fn load_pages_dict(&self) -> ParseResult<PdfDictionary> {
104 let mut reader = self.reader.borrow_mut();
105 let pages = reader.pages()?;
106 Ok(pages.clone())
107 }
108
109 pub fn get_page(&self, index: u32) -> ParseResult<ParsedPage> {
111 self.ensure_page_tree()?;
112
113 if let Some(page_tree) = self.page_tree.borrow().as_ref() {
115 if let Some(page) = page_tree.get_cached_page(index) {
116 return Ok(page.clone());
117 }
118 }
119
120 let page = self.load_page_at_index(index)?;
122
123 if let Some(page_tree) = self.page_tree.borrow_mut().as_mut() {
125 page_tree.cache_page(index, page.clone());
126 }
127
128 Ok(page)
129 }
130
131 fn load_page_at_index(&self, index: u32) -> ParseResult<ParsedPage> {
133 let pages_dict = self.load_pages_dict()?;
135
136 let page_info = self.find_page_in_tree(&pages_dict, index, 0, None)?;
138
139 Ok(page_info)
140 }
141
142 fn find_page_in_tree(
144 &self,
145 node: &PdfDictionary,
146 target_index: u32,
147 current_index: u32,
148 inherited: Option<&PdfDictionary>,
149 ) -> ParseResult<ParsedPage> {
150 let node_type = node.get_type()
151 .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
152
153 match node_type {
154 "Pages" => {
155 let kids = node.get("Kids")
157 .and_then(|obj| obj.as_array())
158 .ok_or_else(|| ParseError::MissingKey("Kids".to_string()))?;
159
160 let mut merged_inherited = inherited.cloned().unwrap_or_else(PdfDictionary::new);
162
163 for key in ["Resources", "MediaBox", "CropBox", "Rotate"] {
165 if let Some(value) = node.get(key) {
166 if !merged_inherited.contains_key(key) {
167 merged_inherited.insert(key.to_string(), value.clone());
168 }
169 }
170 }
171
172 let mut current_idx = current_index;
174 for kid_ref in &kids.0 {
175 let kid_ref = kid_ref.as_reference()
176 .ok_or_else(|| ParseError::SyntaxError {
177 position: 0,
178 message: "Kids array must contain references".to_string(),
179 })?;
180
181 let kid_obj = self.get_object(kid_ref.0, kid_ref.1)?;
183 let kid_dict = kid_obj.as_dict()
184 .ok_or_else(|| ParseError::SyntaxError {
185 position: 0,
186 message: "Page tree node must be a dictionary".to_string(),
187 })?;
188
189 let kid_type = kid_dict.get_type()
190 .ok_or_else(|| ParseError::MissingKey("Type".to_string()))?;
191
192 let count = if kid_type == "Pages" {
193 kid_dict.get("Count")
194 .and_then(|obj| obj.as_integer())
195 .ok_or_else(|| ParseError::MissingKey("Count".to_string()))? as u32
196 } else {
197 1
198 };
199
200 if target_index < current_idx + count {
201 if kid_type == "Page" {
203 return self.create_parsed_page(kid_ref, kid_dict, Some(&merged_inherited));
205 } else {
206 return self.find_page_in_tree(
208 kid_dict,
209 target_index,
210 current_idx,
211 Some(&merged_inherited),
212 );
213 }
214 }
215
216 current_idx += count;
217 }
218
219 Err(ParseError::SyntaxError {
220 position: 0,
221 message: "Page not found in tree".to_string(),
222 })
223 }
224 "Page" => {
225 if target_index != current_index {
227 return Err(ParseError::SyntaxError {
228 position: 0,
229 message: "Page index mismatch".to_string(),
230 });
231 }
232
233 Err(ParseError::SyntaxError {
236 position: 0,
237 message: "Direct page object without reference".to_string(),
238 })
239 }
240 _ => Err(ParseError::SyntaxError {
241 position: 0,
242 message: format!("Invalid page tree node type: {}", node_type),
243 }),
244 }
245 }
246
247 fn create_parsed_page(
249 &self,
250 obj_ref: (u32, u16),
251 page_dict: &PdfDictionary,
252 inherited: Option<&PdfDictionary>,
253 ) -> ParseResult<ParsedPage> {
254 let media_box = self.get_rectangle(page_dict, inherited, "MediaBox")?
256 .ok_or_else(|| ParseError::MissingKey("MediaBox".to_string()))?;
257
258 let crop_box = self.get_rectangle(page_dict, inherited, "CropBox")?;
259
260 let rotation = self.get_integer(page_dict, inherited, "Rotate")?
261 .unwrap_or(0) as i32;
262
263 let inherited_resources = if let Some(inherited) = inherited {
265 inherited.get("Resources").and_then(|r| r.as_dict()).cloned()
266 } else {
267 None
268 };
269
270 Ok(ParsedPage {
271 obj_ref,
272 dict: page_dict.clone(),
273 inherited_resources,
274 media_box,
275 crop_box,
276 rotation,
277 })
278 }
279
280 fn get_rectangle(
282 &self,
283 node: &PdfDictionary,
284 inherited: Option<&PdfDictionary>,
285 key: &str,
286 ) -> ParseResult<Option<[f64; 4]>> {
287 let array = node.get(key)
288 .or_else(|| inherited.and_then(|i| i.get(key)));
289
290 if let Some(array) = array.and_then(|obj| obj.as_array()) {
291 if array.len() != 4 {
292 return Err(ParseError::SyntaxError {
293 position: 0,
294 message: format!("{} must have 4 elements", key),
295 });
296 }
297
298 let rect = [
299 array.get(0).unwrap().as_real().unwrap_or(0.0),
300 array.get(1).unwrap().as_real().unwrap_or(0.0),
301 array.get(2).unwrap().as_real().unwrap_or(0.0),
302 array.get(3).unwrap().as_real().unwrap_or(0.0),
303 ];
304
305 Ok(Some(rect))
306 } else {
307 Ok(None)
308 }
309 }
310
311 fn get_integer(
313 &self,
314 node: &PdfDictionary,
315 inherited: Option<&PdfDictionary>,
316 key: &str,
317 ) -> ParseResult<Option<i64>> {
318 let value = node.get(key)
319 .or_else(|| inherited.and_then(|i| i.get(key)));
320
321 Ok(value.and_then(|obj| obj.as_integer()))
322 }
323
324 pub fn get_object(&self, obj_num: u32, gen_num: u16) -> ParseResult<PdfObject> {
326 if let Some(obj) = self.resources.get_cached((obj_num, gen_num)) {
328 return Ok(obj);
329 }
330
331 let obj = {
333 let mut reader = self.reader.borrow_mut();
334 reader.get_object(obj_num, gen_num)?.clone()
335 };
336
337 self.resources.cache_object((obj_num, gen_num), obj.clone());
339
340 Ok(obj)
341 }
342
343 pub fn resolve(&self, obj: &PdfObject) -> ParseResult<PdfObject> {
345 match obj {
346 PdfObject::Reference(obj_num, gen_num) => {
347 self.get_object(*obj_num, *gen_num)
348 }
349 _ => Ok(obj.clone()),
350 }
351 }
352
353 pub fn get_page_content_streams(&self, page: &ParsedPage) -> ParseResult<Vec<Vec<u8>>> {
355 let mut streams = Vec::new();
356
357 if let Some(contents) = page.dict.get("Contents") {
358 let resolved_contents = self.resolve(contents)?;
359
360 match &resolved_contents {
361 PdfObject::Stream(stream) => {
362 streams.push(stream.decode()?);
363 }
364 PdfObject::Array(array) => {
365 for item in &array.0 {
366 let resolved = self.resolve(item)?;
367 if let PdfObject::Stream(stream) = resolved {
368 streams.push(stream.decode()?);
369 }
370 }
371 }
372 _ => return Err(ParseError::SyntaxError {
373 position: 0,
374 message: "Contents must be a stream or array of streams".to_string(),
375 }),
376 }
377 }
378
379 Ok(streams)
380 }
381
382 pub fn extract_text(&self) -> ParseResult<Vec<crate::text::ExtractedText>> {
384 let extractor = crate::text::TextExtractor::new();
385 extractor.extract_from_document(self)
386 }
387
388 pub fn extract_text_from_page(&self, page_index: u32) -> ParseResult<crate::text::ExtractedText> {
390 let extractor = crate::text::TextExtractor::new();
391 extractor.extract_from_page(self, page_index)
392 }
393
394 pub fn extract_text_with_options(&self, options: crate::text::ExtractionOptions) -> ParseResult<Vec<crate::text::ExtractedText>> {
396 let extractor = crate::text::TextExtractor::with_options(options);
397 extractor.extract_from_document(self)
398 }
399}