1use crate::bytes::{count_leading_line_endings, line_ending, literal_to_u64};
2use crate::constants::pdf_key::{START_XREF, TRAILER, XREF};
3use crate::error::Result;
4use crate::error::error_kind::{INVALID_PDF_FILE, NO_XREF_TABLE_FOUND};
5use crate::objects::{PDFNumber, PDFObject, XEntry};
6use crate::parser::{parse, parse_text_xref};
7use crate::sequence::{FileSequence, Sequence};
8use crate::tokenizer::Tokenizer;
9use crate::vpdf::PDFVersion;
10use log::debug;
11use std::path::PathBuf;
12use crate::constants::PREV;
13
14pub struct PDFDocument {
16 xrefs: Vec<XEntry>,
18 version: PDFVersion,
20 tokenizer: Tokenizer,
22}
23
24impl PDFDocument {
25 pub fn open(path: PathBuf) -> Result<PDFDocument> {
27 let file = std::fs::File::open(path)?;
28 let sequence = FileSequence::new(file);
29 Self::new(sequence)
30 }
31
32 pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
34 let version = parse_version(&mut sequence)?;
35 let offset = cal_xref_table_offset(&mut sequence)?;
36 let mut tokenizer = Tokenizer::new(sequence);
37 tokenizer.seek(offset)?;
38 let mut xrefs = Vec::<XEntry>::new();
39 merge_xref_table(&mut tokenizer,&mut xrefs)?;
41 let document = PDFDocument {
42 xrefs,
43 version,
44 tokenizer,
45 };
46 Ok(document)
47 }
48 pub fn get_xref_slice(&self) -> &[XEntry] {
50 &self.xrefs
51 }
52 pub fn find_xref_index<F>(&self, visit: F) -> Option<usize>
54 where
55 F: Fn(&XEntry) -> bool,
56 {
57 self.xrefs.iter().position(visit)
58 }
59 pub fn get_version(&self) -> &PDFVersion {
61 &self.version
62 }
63 pub fn read_object(&mut self,index: usize) -> Result<Option<PDFObject>> {
65 if index >= self.xrefs.len() {
66 return Ok(None);
67 }
68 let entry = &self.xrefs[index];
69 if entry.is_freed() {
70 return Ok(None);
71 }
72 self.tokenizer.seek(entry.get_value())?;
73 let object = parse(&mut self.tokenizer)?;
74 Ok(Some(object))
75 }
76}
77
78fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
79 let mut buf = [0u8; 1024];
80 let n = sequence.read(&mut buf)?;
81 if n < 8 {
82 return Err(INVALID_PDF_FILE.into());
83 }
84 if buf.len() < 8
85 || buf[0] != b'%'
86 || buf[1] != b'P'
87 || buf[2] != b'D'
88 || buf[3] != b'F'
89 || buf[4] != b'-'
90 {
91 return Err(INVALID_PDF_FILE.into());
92 }
93 let version = String::from_utf8(buf[5..8].to_vec())?;
94 Ok(version.try_into()?)
95}
96
97fn merge_xref_table(mut tokenizer: &mut Tokenizer,mut xrefs: &mut Vec<XEntry>) -> Result<()> {
98 let is_xref = tokenizer.check_next_token0(false, |token| token.key_was(XREF))?;
99 if !is_xref {
100 return Err(NO_XREF_TABLE_FOUND.into());
101 }
102 let entries = parse_text_xref(tokenizer)?;
103 if xrefs.is_empty() {
104 xrefs.extend_from_slice(&entries);
105 } else {
106 for entry in entries {
107 if let None = xrefs.iter().find(|it| it.obj_num == entry.obj_num) {
108 xrefs.push(entry);
109 }
110 }
111 }
112 let is_trailer = tokenizer.check_next_token0(false, |token| token.key_was(TRAILER))?;
113 if is_trailer {
114 match parse(&mut tokenizer)?.as_dict() {
115 Some(dict) => {
116 match dict.get(PREV) {
117 Some(PDFObject::Number(PDFNumber::Unsigned(offset)))=>{
118 tokenizer.seek(*offset)?;
119 merge_xref_table(tokenizer,xrefs)?;
120 }
121 _ => {}
122 }
123 }
124 None => {}
125 }
126 }
127 Ok(())
128}
129fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
130 let size = sequence.size()?;
131 let pos = if size > 1024 { size - 1024 } else { 0 };
132 let mut buf = [0u8; 1024];
133 sequence.seek(pos)?;
134 let n = sequence.read(&mut buf)?;
135 let chars = START_XREF.as_bytes();
136 let mut tx = chars.len();
137 let mut index = n;
138 for i in (0..n).rev() {
139 let b = buf[i];
140 if chars[tx - 1] == b {
141 tx -= 1;
142 if tx == 0 {
143 index = i;
144 break;
145 }
146 }
147 }
148 if index == n {
150 return Err(INVALID_PDF_FILE.into());
151 }
152 index = index + chars.len();
153 let crlf_num = count_leading_line_endings(&buf[index..n]);
154 let start = index + (crlf_num as usize);
155 let mut end = 0usize;
156 for i in start..n {
157 if line_ending(buf[i]) {
158 end = i;
159 break;
160 }
161 }
162 if end == 0 || start == end {
163 debug!("Start-Xref offset not normal end");
164 return Err(INVALID_PDF_FILE.into());
165 }
166 let offset = literal_to_u64(&buf[start..end]);
167 Ok(offset)
168}