pdf_rs/
document.rs

1use crate::catalog::{create_page_tree_arena, PageTreeArean};
2use crate::constants::pdf_key::{START_XREF, XREF};
3use crate::constants::{PREV, ROOT};
4use crate::error::PDFError::{InvalidPDFDocument, PDFParseError, XrefTableNotFound};
5use crate::error::Result;
6use crate::objects::{PDFNumber, PDFObject, XEntry};
7use crate::parser::{parse, parse_text_xref};
8use crate::sequence::{FileSequence, Sequence};
9use crate::tokenizer::Tokenizer;
10use crate::utils::{count_leading_line_endings, line_ending, literal_to_u64};
11use std::path::PathBuf;
12use crate::vpdf::PDFVersion;
13
14/// Represent a PDF document
15pub struct PDFDocument {
16    /// Cross-reference table
17    xrefs: Vec<XEntry>,
18    /// PDF version
19    version: PDFVersion,
20    /// Tokenizer
21    tokenizer: Tokenizer,
22    // Page Tree Arena
23    page_tree_arena: PageTreeArean
24}
25
26impl PDFDocument {
27    /// Open a pdf document
28    pub fn open(path: PathBuf) -> Result<PDFDocument> {
29        let file = std::fs::File::open(path)?;
30        let sequence = FileSequence::new(file);
31        Self::new(sequence)
32    }
33
34    /// Create a pdf document from sequence
35    pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
36        let version = parse_version(&mut sequence)?;
37        let offset = cal_xref_table_offset(&mut sequence)?;
38        let mut tokenizer = Tokenizer::new(sequence);
39        tokenizer.seek(offset)?;
40        // Merge all xref table
41        let (xrefs, catalog) = merge_xref_table(&mut tokenizer)?;
42        let page_tree_arena = create_page_tree_arena(&mut tokenizer,catalog,&xrefs)?;
43        let document = PDFDocument {
44            xrefs,
45            version,
46            tokenizer,
47            page_tree_arena
48        };
49        Ok(document)
50    }
51    /// Get xref slice
52    pub fn get_xref_slice(&self) -> &[XEntry] {
53        &self.xrefs
54    }
55    /// Find xref index
56    pub fn find_xref_index<F>(&self, visit: F) -> Option<usize>
57    where
58        F: Fn(&XEntry) -> bool,
59    {
60        self.xrefs.iter().position(visit)
61    }
62    /// Get PDF version
63    pub fn get_version(&self) -> &PDFVersion {
64        &self.version
65    }
66    /// Read object from PDFDocument
67    pub fn read_object(&mut self, index: usize) -> Result<Option<PDFObject>> {
68        if index >= self.xrefs.len() {
69            return Ok(None);
70        }
71        let entry = &self.xrefs[index];
72        if entry.is_freed() {
73            return Ok(None);
74        }
75        self.tokenizer.seek(entry.get_value())?;
76        let object = parse(&mut self.tokenizer)?;
77        Ok(Some(object))
78    }
79
80    /// Get pdf page number
81    pub fn get_page_num(&self) -> usize {
82        self.page_tree_arena.get_page_num()
83    }
84}
85
86fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
87    let mut buf = [0u8; 1024];
88    let n = sequence.read(&mut buf)?;
89    if n < 8 {
90        return Err(InvalidPDFDocument);
91    }
92    if buf.len() < 8
93        || buf[0] != b'%'
94        || buf[1] != b'P'
95        || buf[2] != b'D'
96        || buf[3] != b'F'
97        || buf[4] != b'-'
98    {
99        return Err(InvalidPDFDocument);
100    }
101    let version = String::from_utf8(buf[5..8].to_vec())?;
102    Ok(version.try_into()?)
103}
104
105fn merge_xref_table(mut tokenizer: &mut Tokenizer) -> Result<(Vec<XEntry>, (u64, u64))> {
106    let mut xrefs = Vec::<XEntry>::new();
107    let mut root = None;
108    loop {
109        let is_xref = tokenizer.check_next_token0(false, |token| token.key_was(XREF))?;
110        if !is_xref {
111            return Err(XrefTableNotFound);
112        }
113        let entries = parse_text_xref(tokenizer)?;
114        if xrefs.is_empty() {
115            xrefs.extend_from_slice(&entries);
116        } else {
117            for entry in entries {
118                if let None = xrefs.iter().find(|it| it.obj_num == entry.obj_num) {
119                    xrefs.push(entry);
120                }
121            }
122        }
123        if let PDFObject::Dict(mut dictionary) = parse(&mut tokenizer)? {
124            if let Some(obj) = dictionary.remove(ROOT) {
125                root = Some(obj);
126            }
127            // Recursive previous xref
128            if let Some(PDFObject::Number(PDFNumber::Unsigned(prev))) = dictionary.get(PREV) {
129                tokenizer.seek(*prev)?;
130                continue;
131            }
132            if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = root {
133                return Ok((xrefs, (obj_num, gen_num)));
134            }
135        }
136        return Err(PDFParseError("Xref table broken."));
137    }
138}
139fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
140    let size = sequence.size()?;
141    let pos = if size > 1024 { size - 1024 } else { 0 };
142    let mut buf = [0u8; 1024];
143    sequence.seek(pos)?;
144    let n = sequence.read(&mut buf)?;
145    let chars = START_XREF.as_bytes();
146    let mut tx = chars.len();
147    let mut index = n;
148    for i in (0..n).rev() {
149        let b = buf[i];
150        if chars[tx - 1] == b {
151            tx -= 1;
152            if tx == 0 {
153                index = i;
154                break;
155            }
156        }
157    }
158    // Can't find start xref
159    if index == n {
160        return Err(InvalidPDFDocument);
161    }
162    index = index + chars.len();
163    let crlf_num = count_leading_line_endings(&buf[index..n]);
164    let start = index + (crlf_num as usize);
165    let mut end = 0usize;
166    for i in start..n {
167        if line_ending(buf[i]) {
168            end = i;
169            break;
170        }
171    }
172    if end == 0 || start == end {
173        return Err(InvalidPDFDocument);
174    }
175    let offset = literal_to_u64(&buf[start..end]);
176    Ok(offset)
177}