pdf_rs/
document.rs

1use crate::utils::{count_leading_line_endings, line_ending, literal_to_u64};
2use crate::constants::pdf_key::{START_XREF, TRAILER, XREF};
3use crate::constants::{PREV, ROOT};
4use crate::error::error_kind::{
5    CANT_FIND_ROOT, EXCEPT_TRAILER, INVALID_PDF_FILE, NO_XREF_TABLE_FOUND,
6};
7use crate::error::{Result};
8use crate::objects::{PDFNumber, PDFObject, XEntry};
9use crate::parser::{parse, parse_text_xref};
10use crate::sequence::{FileSequence, Sequence};
11use crate::tokenizer::Tokenizer;
12use crate::vpdf::PDFVersion;
13use log::debug;
14use std::path::PathBuf;
15use crate::catalog::{create_page_tree_arena, PageTreeArean};
16
/// Represent a PDF document
///
/// Instances are built by [`PDFDocument::new`] (or [`PDFDocument::open`] for a
/// file on disk), which parses the header version, merges the cross-reference
/// tables and builds the page tree before returning.
pub struct PDFDocument {
    /// Cross-reference table: the entries merged from every xref section that
    /// was reached while loading the document.
    xrefs: Vec<XEntry>,
    /// PDF version parsed from the file header.
    version: PDFVersion,
    /// Tokenizer wrapping the underlying sequence; it is re-positioned with
    /// `seek` each time an object is read.
    tokenizer: Tokenizer,
    /// Page tree arena created from the document catalog reference; it backs
    /// the page count reported by `get_page_num`.
    page_tree_arena: PageTreeArean
}
28
29impl PDFDocument {
30    /// Open a pdf document
31    pub fn open(path: PathBuf) -> Result<PDFDocument> {
32        let file = std::fs::File::open(path)?;
33        let sequence = FileSequence::new(file);
34        Self::new(sequence)
35    }
36
37    /// Create a pdf document from sequence
38    pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
39        let version = parse_version(&mut sequence)?;
40        let offset = cal_xref_table_offset(&mut sequence)?;
41        let mut tokenizer = Tokenizer::new(sequence);
42        tokenizer.seek(offset)?;
43        // Merge all xref table
44        let (xrefs, catalog) = merge_xref_table(&mut tokenizer)?;
45        let page_tree_arena = create_page_tree_arena(&mut tokenizer,catalog,&xrefs)?;
46        let document = PDFDocument {
47            xrefs,
48            version,
49            tokenizer,
50            page_tree_arena
51        };
52        Ok(document)
53    }
54    /// Get xref slice
55    pub fn get_xref_slice(&self) -> &[XEntry] {
56        &self.xrefs
57    }
58    /// Find xref index
59    pub fn find_xref_index<F>(&self, visit: F) -> Option<usize>
60    where
61        F: Fn(&XEntry) -> bool,
62    {
63        self.xrefs.iter().position(visit)
64    }
65    /// Get PDF version
66    pub fn get_version(&self) -> &PDFVersion {
67        &self.version
68    }
69    /// Read object from PDFDocument
70    pub fn read_object(&mut self, index: usize) -> Result<Option<PDFObject>> {
71        if index >= self.xrefs.len() {
72            return Ok(None);
73        }
74        let entry = &self.xrefs[index];
75        if entry.is_freed() {
76            return Ok(None);
77        }
78        self.tokenizer.seek(entry.get_value())?;
79        let object = parse(&mut self.tokenizer)?;
80        Ok(Some(object))
81    }
82
83    /// Get pdf page number
84    pub fn get_page_num(&self) -> usize {
85        self.page_tree_arena.get_page_num()
86    }
87}
88
89fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
90    let mut buf = [0u8; 1024];
91    let n = sequence.read(&mut buf)?;
92    if n < 8 {
93        return Err(INVALID_PDF_FILE.into());
94    }
95    if buf.len() < 8
96        || buf[0] != b'%'
97        || buf[1] != b'P'
98        || buf[2] != b'D'
99        || buf[3] != b'F'
100        || buf[4] != b'-'
101    {
102        return Err(INVALID_PDF_FILE.into());
103    }
104    let version = String::from_utf8(buf[5..8].to_vec())?;
105    Ok(version.try_into()?)
106}
107
108fn merge_xref_table(mut tokenizer: &mut Tokenizer) -> Result<(Vec<XEntry>, (u64, u64))> {
109    let mut xrefs = Vec::<XEntry>::new();
110    let mut root = None;
111    loop {
112        let is_xref = tokenizer.check_next_token0(false, |token| token.key_was(XREF))?;
113        if !is_xref {
114            return Err(NO_XREF_TABLE_FOUND.into());
115        }
116        let entries = parse_text_xref(tokenizer)?;
117        if xrefs.is_empty() {
118            xrefs.extend_from_slice(&entries);
119        } else {
120            for entry in entries {
121                if let None = xrefs.iter().find(|it| it.obj_num == entry.obj_num) {
122                    xrefs.push(entry);
123                }
124            }
125        }
126        if let PDFObject::Dict(mut dictionary) = parse(&mut tokenizer)? {
127            if let Some(obj) = dictionary.remove(ROOT) {
128                root = Some(obj);
129            }
130            if let Some(PDFObject::Number(PDFNumber::Unsigned(prev))) = dictionary.get(PREV) {
131                tokenizer.seek(*prev)?;
132                continue;
133            }
134            if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = root {
135                return Ok((xrefs, (obj_num, gen_num)));
136            }
137            return Err(CANT_FIND_ROOT.into());
138        }
139        return Err(EXCEPT_TRAILER.into());
140    }
141}
142fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
143    let size = sequence.size()?;
144    let pos = if size > 1024 { size - 1024 } else { 0 };
145    let mut buf = [0u8; 1024];
146    sequence.seek(pos)?;
147    let n = sequence.read(&mut buf)?;
148    let chars = START_XREF.as_bytes();
149    let mut tx = chars.len();
150    let mut index = n;
151    for i in (0..n).rev() {
152        let b = buf[i];
153        if chars[tx - 1] == b {
154            tx -= 1;
155            if tx == 0 {
156                index = i;
157                break;
158            }
159        }
160    }
161    // Can't find start xref
162    if index == n {
163        return Err(INVALID_PDF_FILE.into());
164    }
165    index = index + chars.len();
166    let crlf_num = count_leading_line_endings(&buf[index..n]);
167    let start = index + (crlf_num as usize);
168    let mut end = 0usize;
169    for i in start..n {
170        if line_ending(buf[i]) {
171            end = i;
172            break;
173        }
174    }
175    if end == 0 || start == end {
176        debug!("Start-Xref offset not normal end");
177        return Err(INVALID_PDF_FILE.into());
178    }
179    let offset = literal_to_u64(&buf[start..end]);
180    Ok(offset)
181}