pdf_rs/
document.rs

1use crate::bytes::{count_leading_line_endings, line_ending, literal_to_u64};
2use crate::constants::pdf_key::{START_XREF, TRAILER, XREF};
3use crate::error::Result;
4use crate::error::error_kind::{INVALID_PDF_FILE, NO_XREF_TABLE_FOUND};
5use crate::objects::{PDFNumber, PDFObject, XEntry};
6use crate::parser::{parse, parse_text_xref};
7use crate::sequence::{FileSequence, Sequence};
8use crate::tokenizer::Tokenizer;
9use crate::vpdf::PDFVersion;
10use log::debug;
11use std::path::PathBuf;
12use crate::constants::PREV;
13
/// Represents an opened PDF document.
///
/// Holds the merged cross-reference entries, the version parsed from the file
/// header, and the tokenizer that is repositioned whenever an object is read.
pub struct PDFDocument {
    /// Cross-reference entries collected by `merge_xref_table` when the
    /// document is created.
    xrefs: Vec<XEntry>,
    /// PDF version parsed from the file header by `parse_version`.
    version: PDFVersion,
    // Tokenizer over the underlying sequence; `read_object` seeks it to an
    // entry's value before parsing the object.
    tokenizer: Tokenizer,
}
23
24impl PDFDocument {
25    /// Open a pdf document
26    pub  fn open(path: PathBuf) -> Result<PDFDocument> {
27        let file = std::fs::File::open(path)?;
28        let sequence = FileSequence::new(file);
29        Self::new(sequence)
30    }
31
32    /// Create a pdf document from sequence
33    pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
34        let version = parse_version(&mut sequence)?;
35        let offset = cal_xref_table_offset(&mut sequence)?;
36        let mut tokenizer = Tokenizer::new(sequence);
37        tokenizer.seek(offset)?;
38        let mut xrefs = Vec::<XEntry>::new();
39        // Merge all xref table
40        merge_xref_table(&mut tokenizer,&mut xrefs)?;
41        let document = PDFDocument {
42            xrefs,
43            version,
44            tokenizer,
45        };
46        Ok(document)
47    }
48    /// Get xref slice
49    pub fn get_xref_slice(&self) -> &[XEntry] {
50        &self.xrefs
51    }
52    /// Find xref index
53    pub fn find_xref_index<F>(&self, visit: F) -> Option<usize>
54    where
55        F: Fn(&XEntry) -> bool,
56    {
57        self.xrefs.iter().position(visit)
58    }
59    /// Get PDF version
60    pub fn get_version(&self) -> &PDFVersion {
61        &self.version
62    }
63    /// Read object from PDFDocument
64    pub fn read_object(&mut self,index: usize) -> Result<Option<PDFObject>> {
65        if index >= self.xrefs.len() {
66            return Ok(None);
67        }
68        let entry = &self.xrefs[index];
69        if entry.is_freed() {
70            return Ok(None);
71        }
72        self.tokenizer.seek(entry.get_value())?;
73        let object = parse(&mut self.tokenizer)?;
74        Ok(Some(object))
75    }
76}
77
78fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
79    let mut buf = [0u8; 1024];
80    let n = sequence.read(&mut buf)?;
81    if n < 8 {
82        return Err(INVALID_PDF_FILE.into());
83    }
84    if buf.len() < 8
85        || buf[0] != b'%'
86        || buf[1] != b'P'
87        || buf[2] != b'D'
88        || buf[3] != b'F'
89        || buf[4] != b'-'
90    {
91        return Err(INVALID_PDF_FILE.into());
92    }
93    let version = String::from_utf8(buf[5..8].to_vec())?;
94    Ok(version.try_into()?)
95}
96
97fn merge_xref_table(mut tokenizer: &mut Tokenizer,mut xrefs: &mut Vec<XEntry>) -> Result<()> {
98    let is_xref = tokenizer.check_next_token0(false, |token| token.key_was(XREF))?;
99    if !is_xref {
100        return Err(NO_XREF_TABLE_FOUND.into());
101    }
102    let entries = parse_text_xref(tokenizer)?;
103    if xrefs.is_empty() {
104        xrefs.extend_from_slice(&entries);
105    } else {
106        for entry in entries {
107            if let None = xrefs.iter().find(|it| it.obj_num == entry.obj_num) {
108                xrefs.push(entry);
109            }
110        }
111    }
112    let is_trailer = tokenizer.check_next_token0(false, |token| token.key_was(TRAILER))?;
113    if is_trailer {
114        match parse(&mut tokenizer)?.as_dict() {
115            Some(dict) => {
116                match dict.get(PREV) {
117                    Some(PDFObject::Number(PDFNumber::Unsigned(offset)))=>{
118                        tokenizer.seek(*offset)?;
119                        merge_xref_table(tokenizer,xrefs)?;
120                    }
121                    _ => {}
122                }
123            }
124            None => {}
125        }
126    }
127    Ok(())
128}
129fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
130    let size = sequence.size()?;
131    let pos = if size > 1024 { size - 1024 } else { 0 };
132    let mut buf = [0u8; 1024];
133    sequence.seek(pos)?;
134    let n = sequence.read(&mut buf)?;
135    let chars = START_XREF.as_bytes();
136    let mut tx = chars.len();
137    let mut index = n;
138    for i in (0..n).rev() {
139        let b = buf[i];
140        if chars[tx - 1] == b {
141            tx -= 1;
142            if tx == 0 {
143                index = i;
144                break;
145            }
146        }
147    }
148    // Can't find start xref
149    if index == n {
150        return Err(INVALID_PDF_FILE.into());
151    }
152    index = index + chars.len();
153    let crlf_num = count_leading_line_endings(&buf[index..n]);
154    let start = index + (crlf_num as usize);
155    let mut end = 0usize;
156    for i in start..n {
157        if line_ending(buf[i]) {
158            end = i;
159            break;
160        }
161    }
162    if end == 0 || start == end {
163        debug!("Start-Xref offset not normal end");
164        return Err(INVALID_PDF_FILE.into());
165    }
166    let offset = literal_to_u64(&buf[start..end]);
167    Ok(offset)
168}