1use crate::catalog::{create_page_tree_arena, PageTreeArean};
2use crate::constants::pdf_key::{START_XREF, XREF};
3use crate::constants::{PREV, ROOT};
4use crate::error::PDFError::{InvalidPDFDocument, PDFParseError, XrefTableNotFound};
5use crate::error::Result;
6use crate::objects::{PDFNumber, PDFObject, XEntry};
7use crate::parser::{parse, parse_text_xref};
8use crate::sequence::{FileSequence, Sequence};
9use crate::tokenizer::Tokenizer;
10use crate::utils::{count_leading_line_endings, line_ending, literal_to_u64};
11use std::path::PathBuf;
12use crate::vpdf::PDFVersion;
13
/// An opened PDF document, as assembled by [`PDFDocument::new`].
pub struct PDFDocument {
    /// Cross-reference entries merged from the document's xref table chain.
    xrefs: Vec<XEntry>,
    /// Version parsed from the file header.
    version: PDFVersion,
    /// Tokenizer over the underlying sequence; used to read objects on demand.
    tokenizer: Tokenizer,
    /// Page tree built from the catalog reference; answers page-count queries.
    page_tree_arena: PageTreeArean
}
25
26impl PDFDocument {
27 pub fn open(path: PathBuf) -> Result<PDFDocument> {
29 let file = std::fs::File::open(path)?;
30 let sequence = FileSequence::new(file);
31 Self::new(sequence)
32 }
33
34 pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
36 let version = parse_version(&mut sequence)?;
37 let offset = cal_xref_table_offset(&mut sequence)?;
38 let mut tokenizer = Tokenizer::new(sequence);
39 tokenizer.seek(offset)?;
40 let (xrefs, catalog) = merge_xref_table(&mut tokenizer)?;
42 let page_tree_arena = create_page_tree_arena(&mut tokenizer,catalog,&xrefs)?;
43 let document = PDFDocument {
44 xrefs,
45 version,
46 tokenizer,
47 page_tree_arena
48 };
49 Ok(document)
50 }
51 pub fn get_xref_slice(&self) -> &[XEntry] {
53 &self.xrefs
54 }
55 pub fn find_xref_index<F>(&self, visit: F) -> Option<usize>
57 where
58 F: Fn(&XEntry) -> bool,
59 {
60 self.xrefs.iter().position(visit)
61 }
62 pub fn get_version(&self) -> &PDFVersion {
64 &self.version
65 }
66 pub fn read_object(&mut self, index: usize) -> Result<Option<PDFObject>> {
68 if index >= self.xrefs.len() {
69 return Ok(None);
70 }
71 let entry = &self.xrefs[index];
72 if entry.is_freed() {
73 return Ok(None);
74 }
75 self.tokenizer.seek(entry.get_value())?;
76 let object = parse(&mut self.tokenizer)?;
77 Ok(Some(object))
78 }
79
80 pub fn get_page_num(&self) -> usize {
82 self.page_tree_arena.get_page_num()
83 }
84}
85
86fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
87 let mut buf = [0u8; 1024];
88 let n = sequence.read(&mut buf)?;
89 if n < 8 {
90 return Err(InvalidPDFDocument);
91 }
92 if buf.len() < 8
93 || buf[0] != b'%'
94 || buf[1] != b'P'
95 || buf[2] != b'D'
96 || buf[3] != b'F'
97 || buf[4] != b'-'
98 {
99 return Err(InvalidPDFDocument);
100 }
101 let version = String::from_utf8(buf[5..8].to_vec())?;
102 Ok(version.try_into()?)
103}
104
105fn merge_xref_table(mut tokenizer: &mut Tokenizer) -> Result<(Vec<XEntry>, (u64, u64))> {
106 let mut xrefs = Vec::<XEntry>::new();
107 let mut root = None;
108 loop {
109 let is_xref = tokenizer.check_next_token0(false, |token| token.key_was(XREF))?;
110 if !is_xref {
111 return Err(XrefTableNotFound);
112 }
113 let entries = parse_text_xref(tokenizer)?;
114 if xrefs.is_empty() {
115 xrefs.extend_from_slice(&entries);
116 } else {
117 for entry in entries {
118 if let None = xrefs.iter().find(|it| it.obj_num == entry.obj_num) {
119 xrefs.push(entry);
120 }
121 }
122 }
123 if let PDFObject::Dict(mut dictionary) = parse(&mut tokenizer)? {
124 if let Some(obj) = dictionary.remove(ROOT) {
125 root = Some(obj);
126 }
127 if let Some(PDFObject::Number(PDFNumber::Unsigned(prev))) = dictionary.get(PREV) {
129 tokenizer.seek(*prev)?;
130 continue;
131 }
132 if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = root {
133 return Ok((xrefs, (obj_num, gen_num)));
134 }
135 }
136 return Err(PDFParseError("Xref table broken."));
137 }
138}
139fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
140 let size = sequence.size()?;
141 let pos = if size > 1024 { size - 1024 } else { 0 };
142 let mut buf = [0u8; 1024];
143 sequence.seek(pos)?;
144 let n = sequence.read(&mut buf)?;
145 let chars = START_XREF.as_bytes();
146 let mut tx = chars.len();
147 let mut index = n;
148 for i in (0..n).rev() {
149 let b = buf[i];
150 if chars[tx - 1] == b {
151 tx -= 1;
152 if tx == 0 {
153 index = i;
154 break;
155 }
156 }
157 }
158 if index == n {
160 return Err(InvalidPDFDocument);
161 }
162 index = index + chars.len();
163 let crlf_num = count_leading_line_endings(&buf[index..n]);
164 let start = index + (crlf_num as usize);
165 let mut end = 0usize;
166 for i in start..n {
167 if line_ending(buf[i]) {
168 end = i;
169 break;
170 }
171 }
172 if end == 0 || start == end {
173 return Err(InvalidPDFDocument);
174 }
175 let offset = literal_to_u64(&buf[start..end]);
176 Ok(offset)
177}