1use crate::catalog::{NodeId, OutlineTreeArean, PageTreeArean, decode_catalog_data, PageNode};
2use crate::constants::pdf_key::{START_XREF, XREF};
3use crate::constants::{
4 AUTHOR, CREATION_DATE, CREATOR, INFO, MOD_DATE, PREV, PRODUCER, ROOT, TITLE,
5};
6use crate::convert_glyph_from_dict;
7use crate::date::Date;
8use crate::encoding::PreDefinedEncoding;
9use crate::error::PDFError::{
10 InvalidPDFDocument, ObjectAttrMiss, PDFParseError, XrefTableNotFound,
11};
12use crate::error::Result;
13use crate::objects::{Dictionary, ObjRefTuple, PDFNumber, PDFObject, XEntry};
14use crate::parser::{parse, parse_text_xref, parse_with_offset};
15use crate::pstr::convert_glyph_text;
16use crate::sequence::{FileSequence, Sequence};
17use crate::tokenizer::Tokenizer;
18use crate::utils::{count_leading_line_endings, line_ending, literal_to_u64, xrefs_search};
19use crate::vpdf::PDFVersion;
20use std::path::PathBuf;
21use std::str::FromStr;
22
/// Descriptive metadata read from a document information dictionary.
///
/// Every field is filled by [`PDFDescribe::new`] from the dictionary key of
/// the same name and stays `None` when no value could be obtained for it.
pub struct PDFDescribe {
    /// Text looked up under the `PRODUCER` key.
    producer: Option<String>,
    /// Text looked up under the `CREATOR` key.
    creator: Option<String>,
    /// Date parsed from the text under the `CREATION_DATE` key; `None` when
    /// the text is missing or does not parse as a [`Date`].
    creation_date: Option<Date>,
    /// Text looked up under the `AUTHOR` key.
    author: Option<String>,
    /// Text looked up under the `TITLE` key.
    title: Option<String>,
    /// Date parsed from the text under the `MOD_DATE` key; `None` when the
    /// text is missing or does not parse as a [`Date`].
    mod_date: Option<Date>,
}
43
/// An opened PDF document together with the state needed to read its objects.
pub struct PDFDocument {
    /// Cross-reference entries merged from every xref section of the file.
    xrefs: Vec<XEntry>,
    /// Version parsed from the `%PDF-` header.
    version: PDFVersion,
    /// Tokenizer over the underlying byte sequence; re-positioned whenever an
    /// object is read.
    tokenizer: Tokenizer,
    /// Page tree produced by `decode_catalog_data`.
    page_tree_arena: PageTreeArean,
    /// Outline tree produced by `decode_catalog_data`, when it returns one.
    outline_tree_arean: Option<OutlineTreeArean>,
    /// Metadata built from the trailer's `INFO` object, when the trailer
    /// references one and it resolves to a dictionary.
    describe: Option<PDFDescribe>,
}
62
63impl PDFDocument {
64 pub fn open(path: PathBuf) -> Result<PDFDocument> {
77 let file = std::fs::File::open(path)?;
78 let sequence = FileSequence::new(file);
79 Self::new(sequence)
80 }
81
82 pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
95 let version = parse_version(&mut sequence)?;
96 let offset = cal_xref_table_offset(&mut sequence)?;
97 let mut tokenizer = Tokenizer::new(sequence);
98 tokenizer.seek(offset)?;
99 let (xrefs, catalog, info) = merge_xref_table(&mut tokenizer)?;
101 let (page_tree_arena, outline_tree_arean) = match catalog {
102 Some(catalog) => decode_catalog_data(&mut tokenizer, catalog, &xrefs)?,
103 None => return Err(ObjectAttrMiss("Trailer can't found catalog attr.")),
104 };
105 let mut describe = None;
106 if let Some(obj) = info {
108 let entry = xrefs_search(&xrefs, obj)?;
109 if let PDFObject::IndirectObject(_, _, value) =
110 parse_with_offset(&mut tokenizer, entry.value)?
111 {
112 if let PDFObject::Dict(dict) = *value {
113 describe = Some(PDFDescribe::new(dict));
114 }
115 }
116 }
117 let document = PDFDocument {
118 xrefs,
119 version,
120 tokenizer,
121 page_tree_arena,
122 outline_tree_arean,
123 describe,
124 };
125 Ok(document)
126 }
127
128 pub fn get_xref_slice(&self) -> &[XEntry] {
134 &self.xrefs
135 }
136
137 pub fn find_xref_index<F>(&self, visit: F) -> Option<usize>
147 where
148 F: Fn(&XEntry) -> bool,
149 {
150 self.xrefs.iter().position(visit)
151 }
152
153 pub fn get_version(&self) -> &PDFVersion {
159 &self.version
160 }
161
162 pub fn read_object(&mut self, index: usize) -> Result<Option<PDFObject>> {
173 if index >= self.xrefs.len() {
174 return Ok(None);
175 }
176 let entry = &self.xrefs[index];
177 if entry.is_freed() {
178 return Ok(None);
179 }
180 self.tokenizer.seek(entry.get_value())?;
181 let object = parse(&mut self.tokenizer)?;
182 Ok(Some(object))
183 }
184
185 pub fn read_object_with_ref(&mut self, tuple: ObjRefTuple) -> Result<Option<PDFObject>> {
186 self.xrefs
187 .iter()
188 .position(|entry| entry.obj_num == tuple.0 && entry.gen_num == tuple.1)
189 .map(|index| self.read_object(index))
190 .unwrap_or(Ok(None))
191 }
192
193 pub fn get_page_num(&self) -> usize {
194 self.page_tree_arena.get_page_num()
195 }
196
197 pub fn get_page_ids(&self) -> Vec<NodeId> {
198 self.page_tree_arena.get_leaf_page_ids()
199 }
200
201 pub fn get_page(&self, node_id: NodeId) -> Option<&PageNode> {
202 self.page_tree_arena.get_page_node(node_id)
203 }
204}
205
206fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
220 let mut buf = [0u8; 1024];
221 let n = sequence.read(&mut buf)?;
222 if n < 8 {
223 return Err(InvalidPDFDocument);
224 }
225 if buf.len() < 8 || !buf.starts_with(b"%PDF-") {
226 return Err(InvalidPDFDocument);
227 }
228 let version = String::from_utf8(buf[5..8].to_vec())?;
229 Ok(version.try_into()?)
230}
231
232fn merge_xref_table(
248 mut tokenizer: &mut Tokenizer,
249) -> Result<(Vec<XEntry>, Option<(u32, u16)>, Option<(u32, u16)>)> {
250 let mut xrefs = Vec::<XEntry>::new();
251 let mut info = None;
252 let mut catalog = None;
253 loop {
254 let is_xref = tokenizer.check_next_token0(false, |token| token.key_was(XREF))?;
255 if !is_xref {
256 return Err(XrefTableNotFound);
257 }
258 let entries = parse_text_xref(tokenizer)?;
259 if xrefs.is_empty() {
260 xrefs.extend_from_slice(&entries);
261 } else {
262 for entry in entries {
263 if let None = xrefs.iter().find(|it| it.obj_num == entry.obj_num) {
264 xrefs.push(entry);
265 }
266 }
267 }
268 if let PDFObject::Dict(dictionary) = parse(&mut tokenizer)? {
269 if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = dictionary.get(ROOT) {
270 catalog = Some((*obj_num, *gen_num));
271 if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = dictionary.get(INFO) {
272 info = Some((*obj_num, *gen_num));
273 }
274 }
275 if let Some(PDFObject::Number(PDFNumber::Unsigned(prev))) = dictionary.get(PREV) {
277 tokenizer.seek(*prev)?;
278 continue;
279 }
280 return Ok((xrefs, catalog, info));
281 }
282 return Err(PDFParseError("Xref table broken."));
283 }
284}
285
286fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
300 let size = sequence.size()?;
301 let pos = if size > 1024 { size - 1024 } else { 0 };
302 let mut buf = [0u8; 1024];
303 sequence.seek(pos)?;
304 let n = sequence.read(&mut buf)?;
305 let chars = START_XREF.as_bytes();
306 let mut tx = chars.len();
307 let mut index = n;
308 for i in (0..n).rev() {
309 let b = buf[i];
310 if chars[tx - 1] == b {
311 tx -= 1;
312 if tx == 0 {
313 index = i;
314 break;
315 }
316 }
317 }
318 if index == n {
320 return Err(InvalidPDFDocument);
321 }
322 index = index + chars.len();
323 let crlf_num = count_leading_line_endings(&buf[index..n]);
324 let start = index + (crlf_num as usize);
325 let mut end = 0usize;
326 for i in start..n {
327 if line_ending(buf[i]) {
328 end = i;
329 break;
330 }
331 }
332 if end == 0 || start == end {
333 return Err(InvalidPDFDocument);
334 }
335 let offset = literal_to_u64(&buf[start..end]);
336 Ok(offset)
337}
338
339impl PDFDescribe {
340 pub(crate) fn new(dictionary: Dictionary) -> PDFDescribe {
341 let encoding = PreDefinedEncoding::PDFDoc;
342 let producer = convert_glyph_from_dict!(dictionary, PRODUCER, &encoding);
343 let creator = convert_glyph_from_dict!(dictionary, CREATOR, &encoding);
344 let creation_date =
345 convert_glyph_from_dict!(dictionary, CREATION_DATE, &encoding).map_or(None, |text| {
346 match Date::from_str(text.as_str()) {
347 Ok(date) => Some(date),
348 Err(_) => None,
349 }
350 });
351 let mod_date =
352 convert_glyph_from_dict!(dictionary, MOD_DATE, &encoding).map_or(None, |text| {
353 match Date::from_str(text.as_str()) {
354 Ok(date) => Some(date),
355 Err(_) => None,
356 }
357 });
358 let author = convert_glyph_from_dict!(dictionary, AUTHOR, &encoding);
359 let title = convert_glyph_from_dict!(dictionary, TITLE, &encoding);
360 PDFDescribe {
361 producer,
362 creator,
363 creation_date,
364 author,
365 title,
366 mod_date,
367 }
368 }
369}