use super::{ParseError, ParseResult};
use super::header::PdfHeader;
use super::xref::XRefTable;
use super::trailer::PdfTrailer;
use super::objects::{PdfObject, PdfDictionary};
use super::object_stream::ObjectStream;
use std::io::{Read, Seek, BufReader};
use std::fs::File;
use std::path::Path;
use std::collections::HashMap;

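/// Reader for a PDF document backed by any `Read + Seek` source.
///
/// The header, cross-reference table, and trailer are parsed up front;
/// indirect objects are resolved lazily and cached, including objects
/// stored inside object streams.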
pub struct PdfReader<R: Read + Seek> {
    reader: BufReader<R>,
    header: PdfHeader,
    xref: XRefTable,
    trailer: PdfTrailer,
    object_cache: HashMap<(u32, u16), PdfObject>,
    object_stream_cache: HashMap<u32, ObjectStream>,
    page_tree: Option<super::page_tree::PageTree>,
}

impl PdfReader<File> {
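    /// Opens a PDF file at the given path and parses its structure.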
    pub fn open<P: AsRef<Path>>(path: P) -> ParseResult<Self> {
        let file = File::open(path)?;
        Self::new(file)
    }

    pub fn open_document<P: AsRef<Path>>(path: P) -> ParseResult<super::document::PdfDocument<File>> {
        let reader = Self::open(path)?;
        Ok(reader.into_document())
    }
}

impl<R: Read + Seek> PdfReader<R> {
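    /// Builds a reader over any `Read + Seek` source, eagerly parsing the
    /// header, cross-reference table, and trailer.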
    pub fn new(reader: R) -> ParseResult<Self> {
        let mut buf_reader = BufReader::new(reader);

        let header = PdfHeader::parse(&mut buf_reader)?;
        let xref = XRefTable::parse(&mut buf_reader)?;

        let trailer_dict = xref.trailer()
            .ok_or(ParseError::InvalidTrailer)?
            .clone();

        // The actual startxref byte offset is not tracked here; 0 serves as a placeholder.
        let xref_offset = 0;
        let trailer = PdfTrailer::from_dict(trailer_dict, xref_offset)?;

        trailer.validate()?;

        Ok(Self {
            reader: buf_reader,
            header,
            xref,
            trailer,
            object_cache: HashMap::new(),
            object_stream_cache: HashMap::new(),
            page_tree: None,
        })
    }

    pub fn version(&self) -> &super::header::PdfVersion {
        &self.header.version
    }

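    /// Returns the document catalog dictionary referenced by the trailer's `Root` key.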
    pub fn catalog(&mut self) -> ParseResult<&PdfDictionary> {
        let (obj_num, gen_num) = self.trailer.root()?;
        let catalog = self.get_object(obj_num, gen_num)?;

        catalog.as_dict()
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: "Catalog is not a dictionary".to_string(),
            })
    }

    pub fn info(&mut self) -> ParseResult<Option<&PdfDictionary>> {
        match self.trailer.info() {
            Some((obj_num, gen_num)) => {
                let info = self.get_object(obj_num, gen_num)?;
                Ok(info.as_dict())
            }
            None => Ok(None),
        }
    }

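    /// Resolves an indirect object by object and generation number.
    ///
    /// Free entries yield `PdfObject::Null`, compressed entries are loaded
    /// from their containing object stream, and results are cached.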
    pub fn get_object(&mut self, obj_num: u32, gen_num: u16) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        if self.object_cache.contains_key(&key) {
            return Ok(&self.object_cache[&key]);
        }

        if let Some(ext_entry) = self.xref.get_extended_entry(obj_num) {
            if let Some((stream_obj_num, index_in_stream)) = ext_entry.compressed_info {
                return self.get_compressed_object(obj_num, gen_num, stream_obj_num, index_in_stream);
            }
        }

        let entry = self.xref.get_entry(obj_num)
            .ok_or_else(|| ParseError::InvalidReference(obj_num, gen_num))?;

        if !entry.in_use {
            self.object_cache.insert(key, PdfObject::Null);
            return Ok(&self.object_cache[&key]);
        }

        if entry.generation != gen_num {
            return Err(ParseError::InvalidReference(obj_num, gen_num));
        }

        self.reader.seek(std::io::SeekFrom::Start(entry.offset))?;

        let mut lexer = super::lexer::Lexer::new(&mut self.reader);

        let token = lexer.next_token()?;
        let read_obj_num = match token {
            super::lexer::Token::Integer(n) => n as u32,
            _ => return Err(ParseError::SyntaxError {
                position: entry.offset as usize,
                message: "Expected object number".to_string(),
            }),
        };

        if read_obj_num != obj_num {
            return Err(ParseError::SyntaxError {
                position: entry.offset as usize,
                message: format!("Object number mismatch: expected {}, found {}", obj_num, read_obj_num),
            });
        }

        let token = lexer.next_token()?;
        let read_gen_num = match token {
            super::lexer::Token::Integer(n) => n as u16,
            _ => return Err(ParseError::SyntaxError {
                position: entry.offset as usize,
                message: "Expected generation number".to_string(),
            }),
        };

        if read_gen_num != gen_num {
            return Err(ParseError::SyntaxError {
                position: entry.offset as usize,
                message: format!("Generation number mismatch: expected {}, found {}", gen_num, read_gen_num),
            });
        }

        let token = lexer.next_token()?;
        match token {
            super::lexer::Token::Obj => {},
            _ => return Err(ParseError::SyntaxError {
                position: entry.offset as usize,
                message: "Expected 'obj' keyword".to_string(),
            }),
        };

        let obj = PdfObject::parse(&mut lexer)?;

        let token = lexer.next_token()?;
        match token {
            super::lexer::Token::EndObj => {},
            _ => return Err(ParseError::SyntaxError {
                position: entry.offset as usize,
                message: "Expected 'endobj' keyword".to_string(),
            }),
        };

        self.object_cache.insert(key, obj);
        Ok(&self.object_cache[&key])
    }

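    /// Follows a `Reference` to its target object; any other object is returned as-is.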
    pub fn resolve<'a>(&'a mut self, obj: &'a PdfObject) -> ParseResult<&'a PdfObject> {
        match obj {
            PdfObject::Reference(obj_num, gen_num) => {
                self.get_object(*obj_num, *gen_num)
            }
            _ => Ok(obj),
        }
    }

    // Note: lookup inside the object stream is keyed by object number, so the
    // index within the stream is currently unused.
    fn get_compressed_object(&mut self, obj_num: u32, gen_num: u16, stream_obj_num: u32, _index_in_stream: u32) -> ParseResult<&PdfObject> {
        let key = (obj_num, gen_num);

        if !self.object_stream_cache.contains_key(&stream_obj_num) {
            let stream_obj = self.get_object(stream_obj_num, 0)?;

            if let Some(stream) = stream_obj.as_stream() {
                let obj_stream = ObjectStream::parse(stream.clone())?;
                self.object_stream_cache.insert(stream_obj_num, obj_stream);
            } else {
                return Err(ParseError::SyntaxError {
                    position: 0,
                    message: format!("Object {} is not a stream", stream_obj_num),
                });
            }
        }

        let obj_stream = &self.object_stream_cache[&stream_obj_num];
        let obj = obj_stream.get_object(obj_num)
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: format!("Object {} not found in object stream {}", obj_num, stream_obj_num),
            })?;

        self.object_cache.insert(key, obj.clone());
        Ok(&self.object_cache[&key])
    }

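    /// Returns the root `Pages` node of the page tree from the catalog.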
    pub fn pages(&mut self) -> ParseResult<&PdfDictionary> {
        let (pages_obj_num, pages_gen_num) = {
            let catalog = self.catalog()?;
            let pages_ref = catalog.get("Pages")
                .ok_or_else(|| ParseError::MissingKey("Pages".to_string()))?;

            match pages_ref {
                PdfObject::Reference(obj_num, gen_num) => (*obj_num, *gen_num),
                _ => return Err(ParseError::SyntaxError {
                    position: 0,
                    message: "Pages must be a reference".to_string(),
                }),
            }
        };

        let pages_obj = self.get_object(pages_obj_num, pages_gen_num)?;
        pages_obj.as_dict()
            .ok_or_else(|| ParseError::SyntaxError {
                position: 0,
                message: "Pages is not a dictionary".to_string(),
            })
    }

    pub fn page_count(&mut self) -> ParseResult<u32> {
        let pages = self.pages()?;
        pages.get("Count")
            .and_then(|obj| obj.as_integer())
            .map(|count| count as u32)
            .ok_or_else(|| ParseError::MissingKey("Count".to_string()))
    }

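    /// Gathers document metadata from the Info dictionary along with the
    /// PDF version and page count.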
    pub fn metadata(&mut self) -> ParseResult<DocumentMetadata> {
        let mut metadata = DocumentMetadata::default();

        if let Some(info_dict) = self.info()? {
            if let Some(title) = info_dict.get("Title").and_then(|o| o.as_string()) {
                metadata.title = title.as_str().ok().map(|s| s.to_string());
            }
            if let Some(author) = info_dict.get("Author").and_then(|o| o.as_string()) {
                metadata.author = author.as_str().ok().map(|s| s.to_string());
            }
            if let Some(subject) = info_dict.get("Subject").and_then(|o| o.as_string()) {
                metadata.subject = subject.as_str().ok().map(|s| s.to_string());
            }
            if let Some(keywords) = info_dict.get("Keywords").and_then(|o| o.as_string()) {
                metadata.keywords = keywords.as_str().ok().map(|s| s.to_string());
            }
            if let Some(creator) = info_dict.get("Creator").and_then(|o| o.as_string()) {
                metadata.creator = creator.as_str().ok().map(|s| s.to_string());
            }
            if let Some(producer) = info_dict.get("Producer").and_then(|o| o.as_string()) {
                metadata.producer = producer.as_str().ok().map(|s| s.to_string());
            }
        }

        metadata.version = self.version().to_string();
        metadata.page_count = self.page_count().ok();

        Ok(metadata)
    }

    fn ensure_page_tree(&mut self) -> ParseResult<()> {
        if self.page_tree.is_none() {
            let page_count = self.page_count()?;
            self.page_tree = Some(super::page_tree::PageTree::new(page_count));
        }
        Ok(())
    }

    pub fn get_page(&mut self, _index: u32) -> ParseResult<&super::page_tree::ParsedPage> {
        self.ensure_page_tree()?;

        Err(ParseError::SyntaxError {
            position: 0,
            message: "get_page not implemented due to borrow checker constraints".to_string(),
        })
    }

    pub fn get_all_pages(&mut self) -> ParseResult<Vec<super::page_tree::ParsedPage>> {
        let page_count = self.page_count()?;
        let mut pages = Vec::with_capacity(page_count as usize);

        for i in 0..page_count {
            let page = self.get_page(i)?.clone();
            pages.push(page);
        }

        Ok(pages)
    }

    pub fn into_document(self) -> super::document::PdfDocument<R> {
        super::document::PdfDocument::new(self)
    }
}

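/// Document-level metadata collected from the Info dictionary and trailer.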
#[derive(Debug, Default, Clone)]
pub struct DocumentMetadata {
    pub title: Option<String>,
    pub author: Option<String>,
    pub subject: Option<String>,
    pub keywords: Option<String>,
    pub creator: Option<String>,
    pub producer: Option<String>,
    pub creation_date: Option<String>,
    pub modification_date: Option<String>,
    pub version: String,
    pub page_count: Option<u32>,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_reader_construction() {
        // Minimal fixture with a catalog and an empty page tree. The xref
        // offsets and startxref value are illustrative and may not match the
        // exact byte positions in this literal.
        let pdf_data = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [] /Count 0 >>
endobj
xref
0 3
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
trailer
<< /Size 3 /Root 1 0 R >>
startxref
116
%%EOF";

        // Basic sanity check on the fixture until reader construction is
        // exercised end to end.
        assert!(pdf_data.starts_with(b"%PDF-"));
    }
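
    // Sketch of a construction test over an in-memory source. `std::io::Cursor`
    // satisfies `Read + Seek`, so `PdfReader::new` can be exercised without a
    // file on disk. The truncated fixture is assumed to fail xref parsing with
    // an error rather than a panic; the test is ignored until that behaviour is
    // pinned down against real fixtures.
    #[test]
    #[ignore]
    fn test_reader_rejects_truncated_input() {
        let data = b"%PDF-1.4\n%%EOF".to_vec();
        let cursor = std::io::Cursor::new(data);
        assert!(PdfReader::new(cursor).is_err());
    }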
}