1use crate::utils::{count_leading_line_endings, line_ending, literal_to_u64};
2use crate::constants::pdf_key::{START_XREF, TRAILER, XREF};
3use crate::constants::{PREV, ROOT};
4use crate::error::error_kind::{
5 CANT_FIND_ROOT, EXCEPT_TRAILER, INVALID_PDF_FILE, NO_XREF_TABLE_FOUND,
6};
7use crate::error::{Result};
8use crate::objects::{PDFNumber, PDFObject, XEntry};
9use crate::parser::{parse, parse_text_xref};
10use crate::sequence::{FileSequence, Sequence};
11use crate::tokenizer::Tokenizer;
12use crate::vpdf::PDFVersion;
13use log::debug;
14use std::path::PathBuf;
15use crate::catalog::{create_page_tree_arena, PageTreeArean};
16
17pub struct PDFDocument {
19 xrefs: Vec<XEntry>,
21 version: PDFVersion,
23 tokenizer: Tokenizer,
25 page_tree_arena: PageTreeArean
27}
28
29impl PDFDocument {
30 pub fn open(path: PathBuf) -> Result<PDFDocument> {
32 let file = std::fs::File::open(path)?;
33 let sequence = FileSequence::new(file);
34 Self::new(sequence)
35 }
36
37 pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
39 let version = parse_version(&mut sequence)?;
40 let offset = cal_xref_table_offset(&mut sequence)?;
41 let mut tokenizer = Tokenizer::new(sequence);
42 tokenizer.seek(offset)?;
43 let (xrefs, catalog) = merge_xref_table(&mut tokenizer)?;
45 let page_tree_arena = create_page_tree_arena(&mut tokenizer,catalog,&xrefs)?;
46 let document = PDFDocument {
47 xrefs,
48 version,
49 tokenizer,
50 page_tree_arena
51 };
52 Ok(document)
53 }
54 pub fn get_xref_slice(&self) -> &[XEntry] {
56 &self.xrefs
57 }
58 pub fn find_xref_index<F>(&self, visit: F) -> Option<usize>
60 where
61 F: Fn(&XEntry) -> bool,
62 {
63 self.xrefs.iter().position(visit)
64 }
65 pub fn get_version(&self) -> &PDFVersion {
67 &self.version
68 }
69 pub fn read_object(&mut self, index: usize) -> Result<Option<PDFObject>> {
71 if index >= self.xrefs.len() {
72 return Ok(None);
73 }
74 let entry = &self.xrefs[index];
75 if entry.is_freed() {
76 return Ok(None);
77 }
78 self.tokenizer.seek(entry.get_value())?;
79 let object = parse(&mut self.tokenizer)?;
80 Ok(Some(object))
81 }
82
83 pub fn get_page_num(&self) -> usize {
85 self.page_tree_arena.get_page_num()
86 }
87}
88
89fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
90 let mut buf = [0u8; 1024];
91 let n = sequence.read(&mut buf)?;
92 if n < 8 {
93 return Err(INVALID_PDF_FILE.into());
94 }
95 if buf.len() < 8
96 || buf[0] != b'%'
97 || buf[1] != b'P'
98 || buf[2] != b'D'
99 || buf[3] != b'F'
100 || buf[4] != b'-'
101 {
102 return Err(INVALID_PDF_FILE.into());
103 }
104 let version = String::from_utf8(buf[5..8].to_vec())?;
105 Ok(version.try_into()?)
106}
107
108fn merge_xref_table(mut tokenizer: &mut Tokenizer) -> Result<(Vec<XEntry>, (u64, u64))> {
109 let mut xrefs = Vec::<XEntry>::new();
110 let mut root = None;
111 loop {
112 let is_xref = tokenizer.check_next_token0(false, |token| token.key_was(XREF))?;
113 if !is_xref {
114 return Err(NO_XREF_TABLE_FOUND.into());
115 }
116 let entries = parse_text_xref(tokenizer)?;
117 if xrefs.is_empty() {
118 xrefs.extend_from_slice(&entries);
119 } else {
120 for entry in entries {
121 if let None = xrefs.iter().find(|it| it.obj_num == entry.obj_num) {
122 xrefs.push(entry);
123 }
124 }
125 }
126 if let PDFObject::Dict(mut dictionary) = parse(&mut tokenizer)? {
127 if let Some(obj) = dictionary.remove(ROOT) {
128 root = Some(obj);
129 }
130 if let Some(PDFObject::Number(PDFNumber::Unsigned(prev))) = dictionary.get(PREV) {
131 tokenizer.seek(*prev)?;
132 continue;
133 }
134 if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = root {
135 return Ok((xrefs, (obj_num, gen_num)));
136 }
137 return Err(CANT_FIND_ROOT.into());
138 }
139 return Err(EXCEPT_TRAILER.into());
140 }
141}
142fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
143 let size = sequence.size()?;
144 let pos = if size > 1024 { size - 1024 } else { 0 };
145 let mut buf = [0u8; 1024];
146 sequence.seek(pos)?;
147 let n = sequence.read(&mut buf)?;
148 let chars = START_XREF.as_bytes();
149 let mut tx = chars.len();
150 let mut index = n;
151 for i in (0..n).rev() {
152 let b = buf[i];
153 if chars[tx - 1] == b {
154 tx -= 1;
155 if tx == 0 {
156 index = i;
157 break;
158 }
159 }
160 }
161 if index == n {
163 return Err(INVALID_PDF_FILE.into());
164 }
165 index = index + chars.len();
166 let crlf_num = count_leading_line_endings(&buf[index..n]);
167 let start = index + (crlf_num as usize);
168 let mut end = 0usize;
169 for i in start..n {
170 if line_ending(buf[i]) {
171 end = i;
172 break;
173 }
174 }
175 if end == 0 || start == end {
176 debug!("Start-Xref offset not normal end");
177 return Err(INVALID_PDF_FILE.into());
178 }
179 let offset = literal_to_u64(&buf[start..end]);
180 Ok(offset)
181}