pdf_rs/
document.rs

1use crate::catalog::{NodeId, OutlineTreeArean, PageTreeArean, decode_catalog_data, PageNode};
2use crate::constants::pdf_key::{START_XREF, XREF};
3use crate::constants::{
4    AUTHOR, CREATION_DATE, CREATOR, INFO, MOD_DATE, PREV, PRODUCER, ROOT, TITLE,
5};
6use crate::convert_glyph_from_dict;
7use crate::date::Date;
8use crate::encoding::PreDefinedEncoding;
9use crate::error::PDFError::{
10    InvalidPDFDocument, ObjectAttrMiss, PDFParseError, XrefTableNotFound,
11};
12use crate::error::Result;
13use crate::objects::{Dictionary, ObjRefTuple, PDFNumber, PDFObject, XEntry};
14use crate::parser::{parse, parse_text_xref, parse_with_offset};
15use crate::pstr::convert_glyph_text;
16use crate::sequence::{FileSequence, Sequence};
17use crate::tokenizer::Tokenizer;
18use crate::utils::{count_leading_line_endings, line_ending, literal_to_u64, xrefs_search};
19use crate::vpdf::PDFVersion;
20use std::path::PathBuf;
21use std::str::FromStr;
22
23pub struct PDFDescribe {
24    /// (Optional) The name of the application that converted the document from its native format to
25    /// PDF.
26    producer: Option<String>,
27    /// (Optional) If the document was converted into a PDF document from another form, the name
28    /// of the application that created the original document.
29    creator: Option<String>,
30    /// The date the document was created. It should be stored in an unambiguous format.
31    /// For example, 11 October 1992 13:11 is preferable to 11/10/92 1:11 pm. The date should
32    /// be in the same language as the document content.
33    creation_date: Option<Date>,
34    /// (Optional) The name of the person who created the document.
35    author: Option<String>,
36    /// (Optional; PDF 1.1) The document’s title.
37    title: Option<String>,
38    /// (Required if PieceInfo is present in the document catalogue;
39    /// otherwise optional; PDF 1.1) The date and time the document was
40    /// most recently modified, in human-readable form (see 7.9.4, “Dates”).
41    mod_date: Option<Date>,
42}
43
44/// Represents a PDF document with all its components and functionality.
45///
46/// This struct encapsulates a parsed PDF document, providing access to its cross-reference
47/// table, version information, tokenizer, and page structure.
48pub struct PDFDocument {
49    /// Cross-reference table containing references to all objects in the PDF.
50    xrefs: Vec<XEntry>,
51    /// PDF version information.
52    version: PDFVersion,
53    /// Tokenizer for parsing the PDF content.
54    tokenizer: Tokenizer,
55    /// Page tree arena containing the hierarchical page structure.
56    page_tree_arena: PageTreeArean,
57    /// Outline tree arena containing the hierarchical outline structure.
58    outline_tree_arean: Option<OutlineTreeArean>,
59    /// Document info
60    describe: Option<PDFDescribe>,
61}
62
63impl PDFDocument {
64    /// Opens a PDF document from a file path.
65    ///
66    /// This function opens a PDF file, reads its content, and parses it into a PDFDocument.
67    ///
68    /// # Arguments
69    ///
70    /// * `path` - The path to the PDF file to open
71    ///
72    /// # Returns
73    ///
74    /// A `Result` containing the parsed `PDFDocument` or an error if the file cannot be opened
75    /// or parsed correctly
76    pub fn open(path: PathBuf) -> Result<PDFDocument> {
77        let file = std::fs::File::open(path)?;
78        let sequence = FileSequence::new(file);
79        Self::new(sequence)
80    }
81
82    /// Creates a PDF document from a sequence of bytes.
83    ///
84    /// This function parses a sequence of bytes representing a PDF document and constructs
85    /// a PDFDocument instance with all its components.
86    ///
87    /// # Arguments
88    ///
89    /// * `sequence` - A sequence implementation providing access to the PDF bytes
90    ///
91    /// # Returns
92    ///
93    /// A `Result` containing the parsed `PDFDocument` or an error if parsing fails
94    pub fn new(mut sequence: impl Sequence + 'static) -> Result<PDFDocument> {
95        let version = parse_version(&mut sequence)?;
96        let offset = cal_xref_table_offset(&mut sequence)?;
97        let mut tokenizer = Tokenizer::new(sequence);
98        tokenizer.seek(offset)?;
99        // Merge all xref table
100        let (xrefs, catalog, info) = merge_xref_table(&mut tokenizer)?;
101        let (page_tree_arena, outline_tree_arean) = match catalog {
102            Some(catalog) => decode_catalog_data(&mut tokenizer, catalog, &xrefs)?,
103            None => return Err(ObjectAttrMiss("Trailer can't found catalog attr.")),
104        };
105        let mut describe = None;
106        // Parse document info
107        if let Some(obj) = info {
108            let entry = xrefs_search(&xrefs, obj)?;
109            if let PDFObject::IndirectObject(_, _, value) =
110                parse_with_offset(&mut tokenizer, entry.value)?
111            {
112                if let PDFObject::Dict(dict) = *value {
113                    describe = Some(PDFDescribe::new(dict));
114                }
115            }
116        }
117        let document = PDFDocument {
118            xrefs,
119            version,
120            tokenizer,
121            page_tree_arena,
122            outline_tree_arean,
123            describe,
124        };
125        Ok(document)
126    }
127
128    /// Gets a reference to the cross-reference table slice.
129    ///
130    /// # Returns
131    ///
132    /// A slice reference to the vector of cross-reference entries
133    pub fn get_xref_slice(&self) -> &[XEntry] {
134        &self.xrefs
135    }
136
137    /// Finds the index of a cross-reference entry that matches a condition.
138    ///
139    /// # Arguments
140    ///
141    /// * `visit` - A closure that takes a reference to an XEntry and returns a boolean
142    ///
143    /// # Returns
144    ///
145    /// An optional index of the first matching entry, or None if no entry matches
146    pub fn find_xref_index<F>(&self, visit: F) -> Option<usize>
147    where
148        F: Fn(&XEntry) -> bool,
149    {
150        self.xrefs.iter().position(visit)
151    }
152
153    /// Gets the PDF version information.
154    ///
155    /// # Returns
156    ///
157    /// A reference to the PDFVersion struct containing version information
158    pub fn get_version(&self) -> &PDFVersion {
159        &self.version
160    }
161
162    /// Reads an object from the PDF document by its index.
163    ///
164    /// # Arguments
165    ///
166    /// * `index` - The index of the object to read from the cross-reference table
167    ///
168    /// # Returns
169    ///
170    /// A `Result` containing an optional PDFObject (None if the index is out of bounds
171    /// or the object is freed) or an error if reading/parsing fails
172    pub fn read_object(&mut self, index: usize) -> Result<Option<PDFObject>> {
173        if index >= self.xrefs.len() {
174            return Ok(None);
175        }
176        let entry = &self.xrefs[index];
177        if entry.is_freed() {
178            return Ok(None);
179        }
180        self.tokenizer.seek(entry.get_value())?;
181        let object = parse(&mut self.tokenizer)?;
182        Ok(Some(object))
183    }
184
185    pub fn read_object_with_ref(&mut self, tuple: ObjRefTuple) -> Result<Option<PDFObject>> {
186        self.xrefs
187            .iter()
188            .position(|entry| entry.obj_num == tuple.0 && entry.gen_num == tuple.1)
189            .map(|index| self.read_object(index))
190            .unwrap_or(Ok(None))
191    }
192
193    pub fn get_page_num(&self) -> usize {
194        self.page_tree_arena.get_page_num()
195    }
196
197    pub fn get_page_ids(&self) -> Vec<NodeId> {
198        self.page_tree_arena.get_leaf_page_ids()
199    }
200
201    pub fn get_page(&self, node_id: NodeId) -> Option<&PageNode> {
202        self.page_tree_arena.get_page_node(node_id)
203    }
204}
205
206/// Parses the PDF version from the beginning of the document.
207///
208/// This function reads the first few bytes of a PDF document to extract and validate
209/// the PDF version information.
210///
211/// # Arguments
212///
213/// * `sequence` - A mutable reference to a sequence implementation for reading bytes
214///
215/// # Returns
216///
217/// A `Result` containing the parsed PDFVersion or an error if the version cannot be
218/// parsed or is invalid
219fn parse_version(sequence: &mut impl Sequence) -> Result<PDFVersion> {
220    let mut buf = [0u8; 1024];
221    let n = sequence.read(&mut buf)?;
222    if n < 8 {
223        return Err(InvalidPDFDocument);
224    }
225    if buf.len() < 8 || !buf.starts_with(b"%PDF-") {
226        return Err(InvalidPDFDocument);
227    }
228    let version = String::from_utf8(buf[5..8].to_vec())?;
229    Ok(version.try_into()?)
230}
231
232/// Merges cross-reference tables from a PDF document.
233///
234/// This function parses and merges multiple cross-reference tables that may exist
235/// in a PDF document, handling cases where there are previous xref tables referenced
236/// in the document trailer.
237///
238/// # Arguments
239///
240/// * `tokenizer` - A mutable reference to the tokenizer for parsing PDF content
241///
242/// # Returns
243///
244/// A `Result` containing a tuple with the merged vector of XEntry objects and
245/// a tuple of the catalog object number and generation number, or an error if
246/// parsing fails
247fn merge_xref_table(
248    mut tokenizer: &mut Tokenizer,
249) -> Result<(Vec<XEntry>, Option<(u32, u16)>, Option<(u32, u16)>)> {
250    let mut xrefs = Vec::<XEntry>::new();
251    let mut info = None;
252    let mut catalog = None;
253    loop {
254        let is_xref = tokenizer.check_next_token0(false, |token| token.key_was(XREF))?;
255        if !is_xref {
256            return Err(XrefTableNotFound);
257        }
258        let entries = parse_text_xref(tokenizer)?;
259        if xrefs.is_empty() {
260            xrefs.extend_from_slice(&entries);
261        } else {
262            for entry in entries {
263                if let None = xrefs.iter().find(|it| it.obj_num == entry.obj_num) {
264                    xrefs.push(entry);
265                }
266            }
267        }
268        if let PDFObject::Dict(dictionary) = parse(&mut tokenizer)? {
269            if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = dictionary.get(ROOT) {
270                catalog = Some((*obj_num, *gen_num));
271                if let Some(PDFObject::ObjectRef(obj_num, gen_num)) = dictionary.get(INFO) {
272                    info = Some((*obj_num, *gen_num));
273                }
274            }
275            // Recursive previous xref
276            if let Some(PDFObject::Number(PDFNumber::Unsigned(prev))) = dictionary.get(PREV) {
277                tokenizer.seek(*prev)?;
278                continue;
279            }
280            return Ok((xrefs, catalog, info));
281        }
282        return Err(PDFParseError("Xref table broken."));
283    }
284}
285
286/// Calculates the offset of the cross-reference table in the PDF document.
287///
288/// This function searches for the "startxref" keyword near the end of the document
289/// and extracts the offset value that points to the beginning of the cross-reference table.
290///
291/// # Arguments
292///
293/// * `sequence` - A mutable reference to a sequence implementation for reading bytes
294///
295/// # Returns
296///
297/// A `Result` containing the calculated offset as a u64 value, or an error if the
298/// startxref keyword cannot be found or the offset cannot be parsed
299fn cal_xref_table_offset(sequence: &mut impl Sequence) -> Result<u64> {
300    let size = sequence.size()?;
301    let pos = if size > 1024 { size - 1024 } else { 0 };
302    let mut buf = [0u8; 1024];
303    sequence.seek(pos)?;
304    let n = sequence.read(&mut buf)?;
305    let chars = START_XREF.as_bytes();
306    let mut tx = chars.len();
307    let mut index = n;
308    for i in (0..n).rev() {
309        let b = buf[i];
310        if chars[tx - 1] == b {
311            tx -= 1;
312            if tx == 0 {
313                index = i;
314                break;
315            }
316        }
317    }
318    // Can't find start xref
319    if index == n {
320        return Err(InvalidPDFDocument);
321    }
322    index = index + chars.len();
323    let crlf_num = count_leading_line_endings(&buf[index..n]);
324    let start = index + (crlf_num as usize);
325    let mut end = 0usize;
326    for i in start..n {
327        if line_ending(buf[i]) {
328            end = i;
329            break;
330        }
331    }
332    if end == 0 || start == end {
333        return Err(InvalidPDFDocument);
334    }
335    let offset = literal_to_u64(&buf[start..end]);
336    Ok(offset)
337}
338
339impl PDFDescribe {
340    pub(crate) fn new(dictionary: Dictionary) -> PDFDescribe {
341        let encoding = PreDefinedEncoding::PDFDoc;
342        let producer = convert_glyph_from_dict!(dictionary, PRODUCER, &encoding);
343        let creator = convert_glyph_from_dict!(dictionary, CREATOR, &encoding);
344        let creation_date =
345            convert_glyph_from_dict!(dictionary, CREATION_DATE, &encoding).map_or(None, |text| {
346                match Date::from_str(text.as_str()) {
347                    Ok(date) => Some(date),
348                    Err(_) => None,
349                }
350            });
351        let mod_date =
352            convert_glyph_from_dict!(dictionary, MOD_DATE, &encoding).map_or(None, |text| {
353                match Date::from_str(text.as_str()) {
354                    Ok(date) => Some(date),
355                    Err(_) => None,
356                }
357            });
358        let author = convert_glyph_from_dict!(dictionary, AUTHOR, &encoding);
359        let title = convert_glyph_from_dict!(dictionary, TITLE, &encoding);
360        PDFDescribe {
361            producer,
362            creator,
363            creation_date,
364            author,
365            title,
366            mod_date,
367        }
368    }
369}