pdf_rs/
objects.rs

1use std::collections::HashMap;
2use crate::constants::FILTER;
3
/// Identifies an indirect object as an `(object number, generation number)` pair.
pub type ObjRefTuple = (u32, u16);
6
/// Represents a numeric value in a PDF document.
///
/// PDF distinguishes integer and real numbers; integers are stored here as either a signed
/// or an unsigned value, which is why the enum has three variants.
///
/// The type is a small plain value, so it derives `Debug` and `Copy` in addition to `Clone`
/// and `PartialEq`. `Eq` is not derived because `Real` wraps an `f64`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum PDFNumber {
    /// A signed integer value.
    Signed(i64),
    /// An unsigned integer value.
    Unsigned(u64),
    /// A real (floating-point) value.
    Real(f64),
}
19
/// Represents a cross-reference table entry.
///
/// XRef entries map object numbers to their file positions and track whether objects are in use.
/// `Debug`, `PartialEq` and `Eq` are derived so entries can be logged and compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct XEntry {
    /// The value of the entry.
    ///
    /// NOTE(review): presumably the file position of the object for in-use entries — confirm
    /// against the code that builds the table.
    pub(crate) value: u64,
    /// Whether the entry is in use (`true`) or deleted (`false`).
    pub(crate) using: bool,
    /// The object number of the entry.
    pub(crate) obj_num: u32,
    /// The generation number of the entry.
    pub(crate) gen_num: u16,
}
34
35/// Represents a PDF dictionary object.
36///
37/// Dictionaries are associative tables containing key-value pairs where keys are names
38/// and values can be any PDF object type.
39pub struct Dictionary {
40    entries: HashMap<String, PDFObject>,
41}
42
43/// Represents a PDF stream object.
44///
45/// Streams contain large amounts of data (like images or page content) with associated metadata.
46pub struct Stream {
47    buf: Vec<u8>,
48    metadata: Dictionary,
49}
50
/// Represents the kind of PDF string encoding.
///
/// This is a fieldless tag, so it derives `Debug`, `Clone`, `Copy` and `Eq` alongside
/// `PartialEq`; it can be copied and compared freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum PDFStrKind {
    /// Literal string enclosed in parentheses.
    Literal,
    /// Hexadecimal string enclosed in angle brackets.
    Hexadecimal,
}
59
60/// Represents a PDF string object.
61///
62/// Strings can be either literal or hexadecimal encoded.
63pub struct PDFString {
64    kind: PDFStrKind,
65    buf: Vec<u8>,
66}
67
/// A PDF object of any of the kinds documented on the variants below.
pub enum PDFObject {
    /// The keywords true and false represent boolean objects with values true and false.
    Bool(bool),
    /// ## Numbers
    /// PDF provides two types of numbers, integer and real. Integers may be specified by
    /// signed or unsigned constants. Reals may only be in decimal format. Throughout
    /// this book, number means an object whose type is either integer or real.
    ///
    /// Note: exponential format for numbers (such as `1.0E3`) is not supported.
    Number(PDFNumber),
    /// ## Names
    /// A name, like a string, is a sequence of characters. It must begin with a slash
    /// followed by a letter, followed by a sequence of characters. Names may contain any
    /// characters except linefeed, carriage return, `%`, `(`, `)`, `<`, `>`, `[`, `]`, `{`,
    /// and `}`. Examples of names are:
    /// ```plaintext
    ///  /Name1
    ///  /ASomewhatLongerName2
    ///  /A;Name_With-various***characters?.
    /// ```
    Named(String),
    /// ## Strings
    /// A string object; see [`PDFString`].
    String(PDFString),
    /// ## Arrays
    /// An array is a sequence of PDF objects. An array may contain a mixture of object
    /// types. An array is represented as a left square bracket ( `[` ), followed by a sequence
    /// of objects, followed by a right square bracket ( `]` ). An example of an array is:
    /// ```plaintext
    /// [ 0 (Higgs) false 3.14 3 549 /SomeName ]
    /// ```
    Array(Vec<PDFObject>),
    /// ## Dictionaries
    /// A dictionary is an associative table containing pairs of objects. The first element of
    /// each pair is called the key and the second element is called the value. Unlike
    /// dictionaries in the PostScript language, a key must be a name. A value can be any kind
    /// of object, including a dictionary.
    /// A dictionary is generally used to collect and tie together the attributes of a complex
    /// object, with each key-value pair specifying the name and value of an attribute.
    ///
    /// A dictionary is represented by two left angle brackets (`<<`), followed by a sequence
    /// of key-value pairs, followed by two right angle brackets (`>>`). For example:
    ///
    /// Example 4.1 Dictionary
    /// ```plaintext
    /// << /Type /Example /Key2 12 /Key3 (a string) >>
    /// ```
    /// Or, in an example of a dictionary within a dictionary:
    /// ```plaintext
    /// << /Type /AlsoAnExample
    /// /Subtype /Bad
    /// /Reason (unsure)
    /// /Version 0.01
    /// /MyInfo <<
    /// /Item1 0.4
    /// /Item2 true
    /// /LastItem (not!)
    /// /VeryLastItem (OK)
    /// >>
    /// >>
    /// ```
    /// Dictionary objects are the main building blocks of a PDF document. Many parts of
    /// a PDF document, such as pages and fonts, are represented using dictionaries. By
    /// convention, the **Type** key of such a dictionary specifies the type of object being
    /// described by the dictionary. Its value is always a name. In some cases, the **Subtype**
    /// key is used to describe a specialization of a particular type. Its value is always a
    /// name. For a font, Type is **Font** and four Subtypes exist: Type1, MMType1,
    /// Type3, and TrueType.
    Dict(Dictionary),
    /// The null object.
    Null,
    /// ## Indirect references
    /// Any object used as an element of an array or as a value in a dictionary may be
    /// specified by either a direct object or an indirect reference. An indirect reference is a
    /// reference to an indirect object, and consists of the indirect object's object number,
    /// generation number, and the **R** keyword:
    /// ```plaintext
    /// <indirect reference> ::= <object number> <generation number> R
    /// ```
    /// Using an indirect reference to the stream's length, a stream could be written as:
    /// ```plaintext
    /// 7 0 obj
    /// <<
    /// /Length 8 0 R
    /// >>
    /// stream
    /// BT
    /// /F1 12 Tf
    /// 72 712 Td (A stream with an indirect Length) Tj
    /// ET
    /// endstream
    /// endobj
    /// 8 0 obj
    /// 64
    /// endobj
    /// ```
    ///
    /// The fields are the object number and the generation number, in that order.
    ObjectRef(u32, u16),
    /// ## Indirect objects
    /// A direct object is a boolean, number, string, name, array, dictionary, stream, or null,
    /// as described in the previous sections. An indirect object is an object that has been
    /// labeled so that it can be referenced by other objects. Any type of object may be an
    /// indirect object. Indirect objects are very useful; for example, if the length of a
    /// stream is not known before it is written, the value of the stream's **Length** key may
    /// be specified as an indirect object that is stored in the file after the stream.
    ///
    /// An indirect object consists of an object identifier, a direct object, and the **endobj**
    /// keyword. The object identifier consists of an integer object number, an integer
    /// generation number, and the **obj** keyword:
    /// ```plaintext
    /// <indirect object> ::= <object ID> <direct object> endobj
    /// <object ID> ::= <object number> <generation number> obj
    /// ```
    /// The combination of object number and generation number serves as a unique
    /// identifier for an indirect object. Throughout its existence, an indirect object retains
    /// the object number and generation number it was initially assigned, even if the object is
    /// modified.
    ///
    /// Each indirect object has a unique object number, and indirect objects are often but
    /// not necessarily numbered sequentially in the file, beginning with one.
    ///
    /// The fields are the object number, the generation number, and the boxed object itself.
    IndirectObject(u32, u16, Box<PDFObject>),
    /// ## Streams
    /// A stream, like a string, is a sequence of characters. However, an application can
    /// read a small portion of a stream at a time, while a string must be read in its entirety.
    /// For this reason, objects with potentially large amounts of data, such as images and
    /// page descriptions, are represented as streams.
    ///
    /// A stream consists of a dictionary that describes a sequence of characters, followed
    /// by the keyword stream, followed by one or more lines of characters, followed by
    /// the keyword endstream.
    /// ```plaintext
    /// <stream> ::= <dictionary>
    /// stream
    /// {<lines of characters>}*
    /// endstream
    /// ```
    Stream(Stream),
}
202
203impl PDFObject {
204    /// Returns true if the object is a boolean.
205    pub fn is_bool(&self) -> bool {
206        match self {
207            PDFObject::Bool(_) => true,
208            _ => false,
209        }
210    }
211    /// Returns the boolean value of the object if it is a boolean.
212    pub fn as_bool(&self) -> Option<bool> {
213        match self {
214            PDFObject::Bool(b) => Some(*b),
215            _ => None,
216        }
217    }
218
219    /// Returns true if the object is a number.
220    pub fn is_number(&self) -> bool {
221        match self {
222            PDFObject::Number(_) => true,
223            _ => false,
224        }
225    }
226    /// Returns the number value of the object if it is a number.
227    pub fn as_number(&self) -> Option<&PDFNumber> {
228        match self {
229            PDFObject::Number(n) => Some(n),
230            _ => None,
231        }
232    }
233    /// Returns true if the object is a string.
234    pub fn is_string(&self) -> bool {
235        match self {
236            PDFObject::String(_) => true,
237            _ => false,
238        }
239    }
240
241    /// Returns the string value of the object if it is a string.
242    pub fn as_string(&self) -> Option<&PDFString> {
243        match self {
244            PDFObject::String(s) => Some(s),
245            _ => None,
246        }
247    }
248
249    /// Returns the string value of the object if it is a string.
250    pub fn is_array(&self) -> bool {
251        match self {
252            PDFObject::Array(_) => true,
253            _ => false,
254        }
255    }
256    /// Returns the array of objects if it is an array.
257    pub fn as_array(&self) -> Option<&[PDFObject]> {
258        match self {
259            PDFObject::Array(a) => Some(a),
260            _ => None,
261        }
262    }
263    /// Returns true if the object is a dictionary.
264    pub fn is_dict(&self) -> bool {
265        match self {
266            PDFObject::Dict(_) => true,
267            _ => false,
268        }
269    }
270    /// Returns the dictionary if it is one.
271    pub fn as_dict(&self) -> Option<&Dictionary> {
272        match self {
273            PDFObject::Dict(d) => Some(d),
274            _ => None,
275        }
276    }
277    /// Returns the dictionary if it is one.
278    pub fn to_dict(self) -> Option<Dictionary> {
279        match self {
280            PDFObject::Dict(d) => Some(d),
281            _ => None,
282        }
283    }
284    /// Returns true if the object is an indirect object.
285    pub fn is_object_ref(&self) -> bool {
286        match self {
287            PDFObject::ObjectRef(_, ..) => true,
288            _ => false,
289        }
290    }
291    /// Returns the object reference if it is one.
292    pub fn as_object_ref(&self) -> Option<(u32, u16)> {
293        match self {
294            PDFObject::ObjectRef(n, g) => Some((*n, *g)),
295            _ => None,
296        }
297    }
298
299    /// Returns true if the object is an indirect object.
300    pub fn is_indirect_object(&self) -> bool {
301        match self {
302            PDFObject::IndirectObject(_, _, _) => true,
303            _ => false,
304        }
305    }
306    /// Returns the indirect object if it is one.
307    pub fn as_indirect_object(&self) -> Option<(u32, u16, &PDFObject)> {
308        match self {
309            PDFObject::IndirectObject(n, g, data) => Some((*n, *g, data)),
310            _ => None,
311        }
312    }
313
314    /// Returns true if the object is null.
315    pub fn is_null(&self) -> bool {
316        match self {
317            PDFObject::Null => true,
318            _ => false,
319        }
320    }
321    /// Returns true if the object is a stream.
322    pub fn is_stream(&self)->bool{
323        match self {
324            PDFObject::Stream(_) => true,
325            _ => false,
326        }
327    }
328
329    /// Returns the stream if it is one.
330    pub fn as_stream(&self)->Option<&Stream>{
331        match self {
332            PDFObject::Stream(s) => Some(s),
333            _ => None,
334        }
335    }
336    /// Returns true if the object is a name.
337    pub fn is_name(&self)->bool{
338        match self {
339            PDFObject::Named(_) => true,
340            _ => false,
341        }
342    }
343    /// Returns the name if it is one.
344    pub fn as_name(&self)->Option<&String>{
345        match self {
346            PDFObject::Named(s) => Some(s),
347            _ => None,
348        }
349    }
350
351}
352
353impl Dictionary {
354    /// Creates a new dictionary with the given entries.
355    pub(crate) fn new(entries: HashMap<String, PDFObject>) -> Self {
356        Dictionary { entries }
357    }
358    /// Returns the value of the entry with the given key.
359    pub fn get(&self, key: &str)-> Option<&PDFObject> {
360        self.entries.get(key)
361    }
362
363    /// Removes the entry with the given key.
364    pub fn remove(&mut self,key:&str)->Option<PDFObject>{
365        self.entries.remove(key)
366    }
367    /// Returns true if the dictionary contains the given key.
368    pub fn contain(&self, key: &str)->bool{
369        self.entries.contains_key(key)
370    }
371
372    /// Returns the value of the entry with the given key as a name.
373    pub fn get_named_value(&self, key: &str) -> Option<&String> {
374        self.get(key).and_then(|it| it.as_name())
375    }
376
377
378    /// Returns the value of the entry with the given key as a u64.
379    pub fn get_u64_num(&self, key: &str) -> Option<u64> {
380        self.get(key)
381            .and_then(|it| it.as_number())
382            .and_then(|it| if let PDFNumber::Unsigned(num) = it { Some(*num) } else { None })
383    }
384
385    /// Returns true if the value of the entry with the given key is the given name.
386    pub fn named_value_was(&self, keys: &str,except:&str) -> bool {
387        if let Some(value) = self.get_named_value(keys) {
388            value == except
389        } else {
390            false
391        }
392    }
393
394    /// Returns the value of the entry with the given key as an array.
395    pub fn get_array_value(&self, key: &str) -> Option<&[PDFObject]> {
396        self.get(key).and_then(|it| it.as_array())
397    }
398}
399
400impl XEntry {
401    pub(crate) fn new(obj_num: u32, gen_num: u16, value: u64, using: bool) -> Self {
402        XEntry {
403            obj_num,
404            gen_num,
405            using,
406            value,
407        }
408    }
409    /// Returns the object number of the entry.
410    pub fn get_obj_num(&self)->u32{
411        self.obj_num
412    }
413    /// Returns the generation number of the entry.
414    pub fn get_gen_num(&self)->u16{
415        self.gen_num
416    }
417    /// Returns true if the entry is currently being used.
418    pub fn is_using(&self) -> bool {
419        self.using
420    }
421
422    /// Returns true if the entry is freed.
423    pub fn is_freed(&self)->bool{
424        !self.using
425    }
426    /// Returns the value of the entry.
427    pub fn get_value(&self)->u64{
428        self.value
429    }
430}
431
432impl Stream {
433    /// Creates a new stream with the given metadata and buffer.
434    ///
435    /// # Arguments
436    ///
437    /// * `metadata` - A dictionary containing stream metadata
438    /// * `buf` - The byte buffer containing the stream data
439    ///
440    /// # Returns
441    ///
442    /// A new `Stream` instance
443    pub(crate) fn new(metadata: Dictionary,buf:Vec<u8>) -> Self {
444        Stream { buf, metadata }
445    }
446
447    /// Returns a slice reference to the stream's byte buffer.
448    ///
449    /// # Returns
450    ///
451    /// A slice reference to the internal byte buffer
452    pub(crate) fn as_slice(&self) -> &[u8] {
453        &self.buf
454    }
455
456
457    pub(crate) fn get_filters(&self) -> Vec<String> {
458        match self.metadata.get(FILTER){
459            Some(PDFObject::Array(arr)) => {
460                arr.iter()
461                    .filter_map(|it| it.as_name())
462                    .map(|it| it.clone())
463                    .collect()
464            }
465            Some(PDFObject::Named(name)) => {
466                vec![name.clone()]
467            }
468            _ => vec![]
469        }
470    } 
471}
472
473impl PDFString {
474    /// Creates a new PDF string with the specified kind and buffer.
475    ///
476    /// # Arguments
477    ///
478    /// * `kind` - The encoding kind of the string (Literal or Hexadecimal)
479    /// * `buf` - The byte buffer containing the string data
480    ///
481    /// # Returns
482    ///
483    /// A new `PDFString` instance
484    pub(crate) fn new(kind: PDFStrKind, buf: Vec<u8>) -> Self {
485        PDFString { kind, buf }
486    }
487
488    /// Returns a reference to the string's byte buffer.
489    ///
490    /// # Returns
491    ///
492    /// A reference to the internal byte buffer
493    pub(crate) fn get_buf(&self) -> &Vec<u8> {
494        &self.buf
495    }
496
497    /// Returns the encoding kind of the string.
498    ///
499    /// # Returns
500    ///
501    /// A reference to the `PDFStrKind` indicating the encoding type
502    pub(crate) fn get_kind(&self) -> &PDFStrKind {
503        &self.kind
504    }
505
506    /// Returns true if the string is in UTF-16BE encoding.
507    ///
508    /// This checks if the string is hexadecimal encoded and starts with the
509    /// UTF-16BE byte order mark (BOM) 0xFE 0xFF.
510    ///
511    /// # Returns
512    ///
513    /// True if the string is UTF-16BE encoded, false otherwise
514    pub(crate) fn is_utf16be(&self) -> bool {
515        if self.kind == PDFStrKind::Literal {
516            return false;
517        }
518        self.buf.starts_with(b"\xFE\xFF")
519    }
520}