1use crate::bytes::{count_leading_line_endings, line_ending, literal_to_u64};
2use crate::constants::pdf_key::{START_XREF, TRAILER, XREF};
3use crate::error::Result;
4use crate::error::error_kind::{INVALID_PDF_FILE, NO_XREF_TABLE_FOUND};
5use crate::objects::{PDFNumber, PDFObject, XEntry};
6use crate::parser::{parse, parse_text_xref};
7use crate::sequence::{FileSequence, Sequence};
8use crate::tokenizer::Tokenizer;
9use crate::vpdf::PDFVersion;
10use log::debug;
11use std::path::PathBuf;
12use crate::constants::PREV;
13
/// An opened PDF document: its version, its cross-reference entries, and the
/// tokenizer used to read objects from the underlying input.
pub struct PDFDocument {
    /// Cross-reference entries gathered by `merge_xref_table` during `new`.
    xrefs: Vec<XEntry>,
    /// Version produced by `parse_version` from the start of the input.
    version: PDFVersion,
    /// Tokenizer wrapping the input sequence; `read_object` seeks it to an
    /// entry's offset before parsing.
    tokenizer: Tokenizer,
}
23
24impl PDFDocument {
25 pub fn open(path: PathBuf) -> Result<PDFDocument> {
27 let file = std::fs::File::open(path)?;
28 let sequence = FileSequence::new(file);
29 Self::new(sequence)
30 }
31
32 pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
34 let version = parse_version(&mut sequence)?;
35 let offset = cal_xref_table_offset(&mut sequence)?;
36 let mut tokenizer = Tokenizer::new(sequence);
37 tokenizer.seek(offset)?;
38 let mut xrefs = Vec::<XEntry>::new();
39 merge_xref_table(&mut tokenizer,&mut xrefs)?;
41 let document = PDFDocument {
42 xrefs,
43 version,
44 tokenizer,
45 };
46 Ok(document)
47 }
48 pub fn get_xref_slice(&self) -> &[XEntry] {
50 &self.xrefs
51 }
52 pub fn get_version(&self) -> &PDFVersion {
54 &self.version
55 }
56 pub fn read_object(&mut self,index: usize) -> Result<Option<PDFObject>> {
58 if index >= self.xrefs.len() {
59 return Ok(None);
60 }
61 let entry = &self.xrefs[index];
62 if entry.is_freed() {
63 return Ok(None);
64 }
65 self.tokenizer.seek(entry.get_value())?;
66 let object = parse(&mut self.tokenizer)?;
67 Ok(Some(object))
68 }
69}
70
71fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
72 let mut buf = [0u8; 1024];
73 let n = sequence.read(&mut buf)?;
74 if n < 8 {
75 return Err(INVALID_PDF_FILE.into());
76 }
77 if buf.len() < 8
78 || buf[0] != b'%'
79 || buf[1] != b'P'
80 || buf[2] != b'D'
81 || buf[3] != b'F'
82 || buf[4] != b'-'
83 {
84 return Err(INVALID_PDF_FILE.into());
85 }
86 let version = String::from_utf8(buf[5..8].to_vec())?;
87 Ok(version.try_into()?)
88}
89
90fn merge_xref_table(mut tokenizer: &mut Tokenizer,mut xrefs: &mut Vec<XEntry>) -> Result<()> {
91 let is_xref = tokenizer.check_next_token0(false, |token| token.key_was(XREF))?;
92 if !is_xref {
93 return Err(NO_XREF_TABLE_FOUND.into());
94 }
95 let entries = parse_text_xref(tokenizer)?;
96 if xrefs.is_empty() {
97 xrefs.extend_from_slice(&entries);
98 } else {
99 for entry in entries {
100 if let None = xrefs.iter().find(|it| it.obj_num == entry.obj_num) {
101 xrefs.push(entry);
102 }
103 }
104 }
105 let is_trailer = tokenizer.check_next_token0(false, |token| token.key_was(TRAILER))?;
106 if is_trailer {
107 match parse(&mut tokenizer)?.as_dict() {
108 Some(dict) => {
109 match dict.get(PREV) {
110 Some(PDFObject::Number(PDFNumber::Unsigned(offset)))=>{
111 tokenizer.seek(*offset)?;
112 merge_xref_table(tokenizer,xrefs)?;
113 }
114 _ => {}
115 }
116 }
117 None => {}
118 }
119 }
120 Ok(())
121}
122fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
123 let size = sequence.size()?;
124 let pos = if size > 1024 { size - 1024 } else { 0 };
125 let mut buf = [0u8; 1024];
126 sequence.seek(pos)?;
127 let n = sequence.read(&mut buf)?;
128 let chars = START_XREF.as_bytes();
129 let mut tx = chars.len();
130 let mut index = n;
131 for i in (0..n).rev() {
132 let b = buf[i];
133 if chars[tx - 1] == b {
134 tx -= 1;
135 if tx == 0 {
136 index = i;
137 break;
138 }
139 }
140 }
141 if index == n {
143 return Err(INVALID_PDF_FILE.into());
144 }
145 index = index + chars.len();
146 let crlf_num = count_leading_line_endings(&buf[index..n]);
147 let start = index + (crlf_num as usize);
148 let mut end = 0usize;
149 for i in start..n {
150 if line_ending(buf[i]) {
151 end = i;
152 break;
153 }
154 }
155 if end == 0 || start == end {
156 debug!("Start-Xref offset not normal end");
157 return Err(INVALID_PDF_FILE.into());
158 }
159 let offset = literal_to_u64(&buf[start..end]);
160 Ok(offset)
161}