pdf_rs/
document.rs

1use crate::constants::pdf_key::{START_XREF, TRAILER, XREF};
2use crate::error::error_kind::INVALID_PDF_FILE;
3use crate::error::Result;
4use crate::objects::{PDFObject, XEntry};
5use crate::parser::parse;
6use crate::sequence::{FileSequence, Sequence};
7use crate::tokenizer::Tokenizer;
8use crate::vpdf::PDFVersion;
9use std::path::PathBuf;
10use log::debug;
11use crate::bytes::{count_leading_line_endings, line_ending, literal_to_u64};
12
13/// Represent a PDF document
14pub struct PDFDocument {
15    /// Cross-reference table
16    xrefs: Vec<XEntry>,
17    /// PDF version
18    version: PDFVersion,
19    // Tokenizer
20    tokenizer: Tokenizer
21}
22
23impl PDFDocument {
24
25    /// Open a pdf document
26    pub fn open(path: PathBuf) -> Result<PDFDocument> {
27        let file = std::fs::File::open(path)?;
28        let sequence = FileSequence::new(file);
29        Self::new(sequence)
30    }
31
32    /// Create a pdf document from sequence
33    pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
34        let version = parse_version(&mut sequence)?;
35        let offset = cal_xref_table_offset(&mut sequence)?;
36        let mut tokenizer = Tokenizer::new(sequence);
37        tokenizer.seek(offset)?;
38        let xrefs = parse_xref(&mut tokenizer)?;
39        let document = PDFDocument {
40            xrefs,
41            version,
42            tokenizer,
43        };
44        Ok(document)
45    }
46    pub fn get_xref(&self) -> &Vec<XEntry> {
47        &self.xrefs
48    }
49    pub fn get_version(&self) -> &PDFVersion {
50        &self.version
51    }
52}
53
54fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
55    let mut buf = [0u8; 1024];
56    let n = sequence.read(&mut buf)?;
57    if n < 8 {
58        return Err(INVALID_PDF_FILE.into());
59    }
60    if buf.len() < 8
61        || buf[0] != 37
62        || buf[1] != 80
63        || buf[2] != 68
64        || buf[3] != 70
65        || buf[4] != 45
66    {
67        return Err(INVALID_PDF_FILE.into());
68    }
69    let version = String::from_utf8(buf[5..8].to_vec())?;
70    Ok(version.try_into()?)
71}
72
73fn parse_xref(mut tokenizer: &mut Tokenizer) -> Result<Vec<XEntry>> {
74    if let Some(PDFObject::Xref(entries)) = parse(&mut tokenizer, |token| token.key_was(XREF))? {
75        if let Some(PDFObject::Dict(dict)) = parse(&mut tokenizer, |token| token.key_was(TRAILER))? {
76            return Ok(entries)
77        }
78    }
79    Ok(vec![])
80}
81
82fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
83    let size = sequence.size()?;
84    let pos = if size > 1024 { size - 1024 } else { 0 };
85    let mut buf = [0u8; 1024];
86    sequence.seek(pos)?;
87    let n = sequence.read(&mut buf)?;
88    let chars = START_XREF.as_bytes();
89    let mut tx = chars.len();
90    let mut index = n;
91    for i in (0..n).rev() {
92        let b = buf[i];
93        if chars[tx - 1] == b {
94            tx -= 1;
95            if tx == 0 {
96                index = i;
97                break
98            }
99        }
100    }
101    // Can't find start xref
102    if index == n {
103        return Err(INVALID_PDF_FILE.into())
104    }
105    index = index + chars.len();
106    let crlf_num = count_leading_line_endings(&buf[index..n]);
107    let start = index + (crlf_num as usize);
108    let mut end = 0usize;
109    for i in start..n {
110        if line_ending(buf[i]) {
111            end = i;
112            break;
113        }
114    }
115    if end == 0 || start == end {
116        debug!("Start-Xref offset not normal end");
117        return Err(INVALID_PDF_FILE.into())
118    }
119    let offset = literal_to_u64(&buf[start..end]);
120    Ok(offset)
121}