pdf_rs/
document.rs

1use crate::bytes::{count_leading_line_endings, line_ending, literal_to_u64};
2use crate::constants::pdf_key::{START_XREF, TRAILER, XREF};
3use crate::error::Result;
4use crate::error::error_kind::{INVALID_PDF_FILE, NO_XREF_TABLE_FOUND};
5use crate::objects::{PDFNumber, PDFObject, XEntry};
6use crate::parser::{parse, parse_text_xref};
7use crate::sequence::{FileSequence, Sequence};
8use crate::tokenizer::Tokenizer;
9use crate::vpdf::PDFVersion;
10use log::debug;
11use std::path::PathBuf;
12use crate::constants::PREV;
13
14/// Represent a PDF document
15pub struct PDFDocument {
16    /// Cross-reference table
17    xrefs: Vec<XEntry>,
18    /// PDF version
19    version: PDFVersion,
20    // Tokenizer
21    tokenizer: Tokenizer,
22}
23
24impl PDFDocument {
25    /// Open a pdf document
26    pub  fn open(path: PathBuf) -> Result<PDFDocument> {
27        let file = std::fs::File::open(path)?;
28        let sequence = FileSequence::new(file);
29        Self::new(sequence)
30    }
31
32    /// Create a pdf document from sequence
33    pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
34        let version = parse_version(&mut sequence)?;
35        let offset = cal_xref_table_offset(&mut sequence)?;
36        let mut tokenizer = Tokenizer::new(sequence);
37        tokenizer.seek(offset)?;
38        let mut xrefs = Vec::<XEntry>::new();
39        // Merge all xref table
40        merge_xref_table(&mut tokenizer,&mut xrefs)?;
41        let document = PDFDocument {
42            xrefs,
43            version,
44            tokenizer,
45        };
46        Ok(document)
47    }
48    /// Get xref slice
49    pub fn get_xref_slice(&self) -> &[XEntry] {
50        &self.xrefs
51    }
52    /// Get PDF version
53    pub fn get_version(&self) -> &PDFVersion {
54        &self.version
55    }
56    /// Read object from PDFDocument
57    pub fn read_object(&mut self,index: usize) -> Result<Option<PDFObject>> {
58        if index >= self.xrefs.len() {
59            return Ok(None);
60        }
61        let entry = &self.xrefs[index];
62        if entry.is_freed() {
63            return Ok(None);
64        }
65        self.tokenizer.seek(entry.get_value())?;
66        let object = parse(&mut self.tokenizer)?;
67        Ok(Some(object))
68    }
69}
70
71fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
72    let mut buf = [0u8; 1024];
73    let n = sequence.read(&mut buf)?;
74    if n < 8 {
75        return Err(INVALID_PDF_FILE.into());
76    }
77    if buf.len() < 8
78        || buf[0] != b'%'
79        || buf[1] != b'P'
80        || buf[2] != b'D'
81        || buf[3] != b'F'
82        || buf[4] != b'-'
83    {
84        return Err(INVALID_PDF_FILE.into());
85    }
86    let version = String::from_utf8(buf[5..8].to_vec())?;
87    Ok(version.try_into()?)
88}
89
90fn merge_xref_table(mut tokenizer: &mut Tokenizer,mut xrefs: &mut Vec<XEntry>) -> Result<()> {
91    let is_xref = tokenizer.check_next_token0(false, |token| token.key_was(XREF))?;
92    if !is_xref {
93        return Err(NO_XREF_TABLE_FOUND.into());
94    }
95    let entries = parse_text_xref(tokenizer)?;
96    if xrefs.is_empty() {
97        xrefs.extend_from_slice(&entries);
98    } else {
99        for entry in entries {
100            if let None = xrefs.iter().find(|it| it.obj_num == entry.obj_num) {
101                xrefs.push(entry);
102            }
103        }
104    }
105    let is_trailer = tokenizer.check_next_token0(false, |token| token.key_was(TRAILER))?;
106    if is_trailer {
107        match parse(&mut tokenizer)?.as_dict() {
108            Some(dict) => {
109                match dict.get(PREV) {
110                    Some(PDFObject::Number(PDFNumber::Unsigned(offset)))=>{
111                        tokenizer.seek(*offset)?;
112                        merge_xref_table(tokenizer,xrefs)?;
113                    }
114                    _ => {}
115                }
116            }
117            None => {}
118        }
119    }
120    Ok(())
121}
122fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
123    let size = sequence.size()?;
124    let pos = if size > 1024 { size - 1024 } else { 0 };
125    let mut buf = [0u8; 1024];
126    sequence.seek(pos)?;
127    let n = sequence.read(&mut buf)?;
128    let chars = START_XREF.as_bytes();
129    let mut tx = chars.len();
130    let mut index = n;
131    for i in (0..n).rev() {
132        let b = buf[i];
133        if chars[tx - 1] == b {
134            tx -= 1;
135            if tx == 0 {
136                index = i;
137                break;
138            }
139        }
140    }
141    // Can't find start xref
142    if index == n {
143        return Err(INVALID_PDF_FILE.into());
144    }
145    index = index + chars.len();
146    let crlf_num = count_leading_line_endings(&buf[index..n]);
147    let start = index + (crlf_num as usize);
148    let mut end = 0usize;
149    for i in start..n {
150        if line_ending(buf[i]) {
151            end = i;
152            break;
153        }
154    }
155    if end == 0 || start == end {
156        debug!("Start-Xref offset not normal end");
157        return Err(INVALID_PDF_FILE.into());
158    }
159    let offset = literal_to_u64(&buf[start..end]);
160    Ok(offset)
161}