1use crate::constants::pdf_key::{START_XREF, TRAILER, XREF};
2use crate::error::error_kind::INVALID_PDF_FILE;
3use crate::error::Result;
4use crate::objects::{PDFObject, XEntry};
5use crate::parser::parse;
6use crate::sequence::{FileSequence, Sequence};
7use crate::tokenizer::Tokenizer;
8use crate::vpdf::PDFVersion;
9use std::path::PathBuf;
10use log::debug;
11use crate::bytes::{count_leading_line_endings, line_ending, literal_to_u64};
12
13pub struct PDFDocument {
15 xrefs: Vec<XEntry>,
17 version: PDFVersion,
19 tokenizer: Tokenizer
21}
22
23impl PDFDocument {
24
25 pub fn open(path: PathBuf) -> Result<PDFDocument> {
27 let file = std::fs::File::open(path)?;
28 let sequence = FileSequence::new(file);
29 Self::new(sequence)
30 }
31
32 pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
34 let version = parse_version(&mut sequence)?;
35 let offset = cal_xref_table_offset(&mut sequence)?;
36 let mut tokenizer = Tokenizer::new(sequence);
37 tokenizer.seek(offset)?;
38 let xrefs = parse_xref(&mut tokenizer)?;
39 let document = PDFDocument {
40 xrefs,
41 version,
42 tokenizer,
43 };
44 Ok(document)
45 }
46 pub fn get_xref(&self) -> &Vec<XEntry> {
47 &self.xrefs
48 }
49 pub fn get_version(&self) -> &PDFVersion {
50 &self.version
51 }
52}
53
54fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
55 let mut buf = [0u8; 1024];
56 let n = sequence.read(&mut buf)?;
57 if n < 8 {
58 return Err(INVALID_PDF_FILE.into());
59 }
60 if buf.len() < 8
61 || buf[0] != 37
62 || buf[1] != 80
63 || buf[2] != 68
64 || buf[3] != 70
65 || buf[4] != 45
66 {
67 return Err(INVALID_PDF_FILE.into());
68 }
69 let version = String::from_utf8(buf[5..8].to_vec())?;
70 Ok(version.try_into()?)
71}
72
73fn parse_xref(mut tokenizer: &mut Tokenizer) -> Result<Vec<XEntry>> {
74 if let Some(PDFObject::Xref(entries)) = parse(&mut tokenizer, |token| token.key_was(XREF))? {
75 if let Some(PDFObject::Dict(dict)) = parse(&mut tokenizer, |token| token.key_was(TRAILER))? {
76 return Ok(entries)
77 }
78 }
79 Ok(vec![])
80}
81
82fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
83 let size = sequence.size()?;
84 let pos = if size > 1024 { size - 1024 } else { 0 };
85 let mut buf = [0u8; 1024];
86 sequence.seek(pos)?;
87 let n = sequence.read(&mut buf)?;
88 let chars = START_XREF.as_bytes();
89 let mut tx = chars.len();
90 let mut index = n;
91 for i in (0..n).rev() {
92 let b = buf[i];
93 if chars[tx - 1] == b {
94 tx -= 1;
95 if tx == 0 {
96 index = i;
97 break
98 }
99 }
100 }
101 if index == n {
103 return Err(INVALID_PDF_FILE.into())
104 }
105 index = index + chars.len();
106 let crlf_num = count_leading_line_endings(&buf[index..n]);
107 let start = index + (crlf_num as usize);
108 let mut end = 0usize;
109 for i in start..n {
110 if line_ending(buf[i]) {
111 end = i;
112 break;
113 }
114 }
115 if end == 0 || start == end {
116 debug!("Start-Xref offset not normal end");
117 return Err(INVALID_PDF_FILE.into())
118 }
119 let offset = literal_to_u64(&buf[start..end]);
120 Ok(offset)
121}