// pdf_rs/objects.rs
use std::collections::HashMap;

use crate::constants::FILTER;

/// An indirect-object identifier: `(object number, generation number)`.
///
/// The element types match the payload of [`PDFObject::ObjectRef`].
pub type ObjRefTuple = (u32, u16);

/// A numeric value read from a PDF document.
///
/// Integers are kept in one of two variants depending on signedness, and
/// reals are stored as `f64`.
///
/// Equality is the derived, variant-wise comparison: `Signed(1)` and
/// `Unsigned(1)` are *not* equal, and `Real(f64::NAN)` is not equal to itself.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum PDFNumber {
    /// A signed integer value.
    Signed(i64),
    /// An unsigned integer value.
    Unsigned(u64),
    /// A real (floating-point) value.
    Real(f64),
}

/// A single cross-reference (xref) table entry.
///
/// Each entry records an object's number and generation, whether the object
/// is in use, and one associated `u64` value.
#[derive(Debug, Clone)]
pub struct XEntry {
    /// The value stored for the entry.
    // NOTE(review): presumably the object's file position for in-use entries —
    // confirm against the xref parser.
    pub(crate) value: u64,
    /// `true` while the entry is in use, `false` once it has been freed.
    pub(crate) using: bool,
    /// The object number of the entry.
    pub(crate) obj_num: u32,
    /// The generation number of the entry.
    pub(crate) gen_num: u16,
}

35/// Represents a PDF dictionary object.
36///
37/// Dictionaries are associative tables containing key-value pairs where keys are names
38/// and values can be any PDF object type.
39pub struct Dictionary {
40 entries: HashMap<String, PDFObject>,
41}
42
43/// Represents a PDF stream object.
44///
45/// Streams contain large amounts of data (like images or page content) with associated metadata.
46pub struct Stream {
47 buf: Vec<u8>,
48 metadata: Dictionary,
49}
50
/// The syntactic form in which a PDF string was written.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum PDFStrKind {
    /// Literal string enclosed in parentheses.
    Literal,
    /// Hexadecimal string enclosed in angle brackets.
    Hexadecimal,
}

60/// Represents a PDF string object.
61///
62/// Strings can be either literal or hexadecimal encoded.
63pub struct PDFString {
64 kind: PDFStrKind,
65 buf: Vec<u8>,
66}
67
68pub enum PDFObject {
69 /// The keywords true and false represent boolean objects with values true and false.
70 Bool(bool),
71 /// ## Numbers
72 /// PDF provides two types of numbers, integer and real. Integers may be specified by
73 /// signed or unsigned constants. Reals may only be in decimal format. Throughout
74 /// this book, number means an object whose type is either integer or real.</br>
75 /// `Note Exponential format for numbers (such as 1.0E3) is not supported.`
76 Number(PDFNumber),
77 /// ## Names
78 /// A name, like a string, is a sequence of characters. It must begin with a slash fol-
79 /// lowed by a letter, followed by a sequence of characters. Names may contain any
80 /// characters except linefeed, carriage return, %, (, ), <, >, [, ], {, and }. Examples of
81 /// names are:
82 /// ```plaintext
83 /// /Name1
84 /// /ASomewhatLongerName2
85 /// /A;Name_With-various***characters?.
86 /// ```
87 Named(String),
88 String(PDFString),
89 /// ## Arrays
90 /// An array is a sequence of PDF objects. An array may contain a mixture of object
91 /// types. An array is represented as a left square bracket ( [ ), followed by a sequence
92 /// of objects, followed by a right square bracket ( ] ). An example of an array is:</br>
93 /// ```plaintext
94 /// [ 0 (Higgs) false 3.14 3 549 /SomeName ]
95 /// ```
96 Array(Vec<PDFObject>),
97 /// A dictionary is an associative table containing pairs of objects. The first element of
98 /// each pair is called the key and the second element is called the value. Unlike dictio-
99 /// naries in the PostScript language, a key must
100 /// be a name. A value can be any kind of object, including a dictionary.
101 /// A dictionary is generally used to collect and tie together the attributes of a complex
102 /// object, with each key–value pair specifying the name and value of an attribute.
103 ///
104 /// A dictionary is represented by two left angle brackets (<<), followed by a sequence
105 /// of key–value pairs, followed by two right angle brackets (>>). For example:
106 /// Example 4.1 Dictionary
107 /// << /Type /Example /Key2 12 /Key3 (a string) >>
108 /// Or, in an example of a dictionary within a dictionary:
109 /// ```plaintext
110 /// << /Type /AlsoAnExample
111 /// /Subtype /Bad
112 /// /Reason (unsure)
113 /// /Version 0.01
114 /// /MyInfo <<
115 /// /Item1 0.4
116 /// /Item2 true
117 /// /LastItem (not!)
118 /// /VeryLastItem (OK)
119 /// >>
120 /// >>
121 /// ```
122 /// Dictionary objects are the main building blocks of a PDF document. Many parts of
123 /// a PDF document, such as pages and fonts, are represented using dictionaries. By
124 /// convention, the **Type** key of such a dictionary specifies the type of object being
125 /// described by the dictionary. Its value is always a name. In some cases, the **Subtype**
126 /// key is used to describe a specialization of a particular type. Its value is always a
127 /// name. For a font, Type is **Font** and four Subtypes exist: Type1, MMType1,
128 /// Type3, and TrueType.
129 Dict(Dictionary),
130 Null,
131 /// Any object used as an element of an array or as a value in a dictionary may be
132 /// specified by either a direct object or an indirect reference. An indirect reference is a
133 /// reference to an indirect object, and consists of the indirect object’s object number,
134 /// generation number, and the **R** keyword:
135 /// ```plaintext
136 /// <indirect reference> ::=
137 /// <object number>
138 /// <generation number>
139 /// R
140 /// ```
141 /// Using an indirect reference to the stream’s length, a stream could be written as:
142 /// ```plaintext
143 /// 7 0 obj
144 /// <<
145 /// /Length 8 0 R
146 /// >>
147 /// stream
148 /// BT
149 /// /F1 12 Tf
150 /// 72 712 Td (A stream with an indirect Length) Tj
151 /// ET
152 /// endstream
153 /// endobj
154 /// 8 0 obj
155 /// 64
156 /// endobj
157 /// ```
158 ObjectRef(u32, u16),
159 /// A direct object is a boolean, number, string, name, array, dictionary, stream, or null,
160 /// as described in the previous sections. An indirect object is an object that has been
161 /// labeled so that it can be referenced by other objects. Any type of object may be an
162 /// indirect object. Indirect objects are very useful; for example, if the length of a
163 /// stream is not known before it is written, the value of the stream’s **Length** key may
164 /// be specified as an indirect object that is stored in the file after the stream.</br>
165 /// An indirect object consists of an object identifier, a direct object, and the **endobj**
166 /// keyword. The object identifier consists of an integer object number, an integer gen-
167 /// eration number, and the **obj** keyword:
168 /// ```plaintext
169 /// <indirect object> ::=
170 /// <object ID> ::=
171 /// <object ID>
172 /// <direct object>
173 /// endobj
174 /// <object number>
175 /// <generation number>
176 /// obj
177 /// ```
178 /// The combination of object number and generation number serves as a unique iden-
179 /// tifier for an indirect object. Throughout its existence, an indirect object retains the
180 /// object number and generation number it was initially assigned, even if the object is
181 /// modified.</br>
182 /// Each indirect object has a unique object number, and indirect objects are often but
183 /// not necessarily numbered sequentially in the file, beginning with o
184 IndirectObject(u32, u16, Box<PDFObject>),
185 /// ## Streams
186 /// A stream, like a string, is a sequence of characters. However, an application can
187 /// read a small portion of a stream at a time, while a string must be read in its entirety.
188 /// For this reason, objects with potentially large amounts of data, such as images and
189 /// page descriptions, are represented as streams.
190 ///
191 /// A stream consists of a dictionary that describes a sequence of characters, followed
192 /// by the keyword stream, followed by one or more lines of characters, followed by
193 /// the keyword endstream.
194 /// ```plaintext
195 /// <stream> ::= <dictionary>
196 /// stream
197 /// {<lines of characters>}*
198 /// endstream
199 /// ```
200 Stream(Stream),
201}
202
203impl PDFObject {
204 /// Returns true if the object is a boolean.
205 pub fn is_bool(&self) -> bool {
206 match self {
207 PDFObject::Bool(_) => true,
208 _ => false,
209 }
210 }
211 /// Returns the boolean value of the object if it is a boolean.
212 pub fn as_bool(&self) -> Option<bool> {
213 match self {
214 PDFObject::Bool(b) => Some(*b),
215 _ => None,
216 }
217 }
218
219 /// Returns true if the object is a number.
220 pub fn is_number(&self) -> bool {
221 match self {
222 PDFObject::Number(_) => true,
223 _ => false,
224 }
225 }
226 /// Returns the number value of the object if it is a number.
227 pub fn as_number(&self) -> Option<&PDFNumber> {
228 match self {
229 PDFObject::Number(n) => Some(n),
230 _ => None,
231 }
232 }
233 /// Returns true if the object is a string.
234 pub fn is_string(&self) -> bool {
235 match self {
236 PDFObject::String(_) => true,
237 _ => false,
238 }
239 }
240
241 /// Returns the string value of the object if it is a string.
242 pub fn as_string(&self) -> Option<&PDFString> {
243 match self {
244 PDFObject::String(s) => Some(s),
245 _ => None,
246 }
247 }
248
249 /// Returns the string value of the object if it is a string.
250 pub fn is_array(&self) -> bool {
251 match self {
252 PDFObject::Array(_) => true,
253 _ => false,
254 }
255 }
256 /// Returns the array of objects if it is an array.
257 pub fn as_array(&self) -> Option<&[PDFObject]> {
258 match self {
259 PDFObject::Array(a) => Some(a),
260 _ => None,
261 }
262 }
263 /// Returns true if the object is a dictionary.
264 pub fn is_dict(&self) -> bool {
265 match self {
266 PDFObject::Dict(_) => true,
267 _ => false,
268 }
269 }
270 /// Returns the dictionary if it is one.
271 pub fn as_dict(&self) -> Option<&Dictionary> {
272 match self {
273 PDFObject::Dict(d) => Some(d),
274 _ => None,
275 }
276 }
277 /// Returns the dictionary if it is one.
278 pub fn to_dict(self) -> Option<Dictionary> {
279 match self {
280 PDFObject::Dict(d) => Some(d),
281 _ => None,
282 }
283 }
284 /// Returns true if the object is an indirect object.
285 pub fn is_object_ref(&self) -> bool {
286 match self {
287 PDFObject::ObjectRef(_, ..) => true,
288 _ => false,
289 }
290 }
291 /// Returns the object reference if it is one.
292 pub fn as_object_ref(&self) -> Option<(u32, u16)> {
293 match self {
294 PDFObject::ObjectRef(n, g) => Some((*n, *g)),
295 _ => None,
296 }
297 }
298
299 /// Returns true if the object is an indirect object.
300 pub fn is_indirect_object(&self) -> bool {
301 match self {
302 PDFObject::IndirectObject(_, _, _) => true,
303 _ => false,
304 }
305 }
306 /// Returns the indirect object if it is one.
307 pub fn as_indirect_object(&self) -> Option<(u32, u16, &PDFObject)> {
308 match self {
309 PDFObject::IndirectObject(n, g, data) => Some((*n, *g, data)),
310 _ => None,
311 }
312 }
313
314 /// Returns true if the object is null.
315 pub fn is_null(&self) -> bool {
316 match self {
317 PDFObject::Null => true,
318 _ => false,
319 }
320 }
321 /// Returns true if the object is a stream.
322 pub fn is_stream(&self)->bool{
323 match self {
324 PDFObject::Stream(_) => true,
325 _ => false,
326 }
327 }
328
329 /// Returns the stream if it is one.
330 pub fn as_stream(&self)->Option<&Stream>{
331 match self {
332 PDFObject::Stream(s) => Some(s),
333 _ => None,
334 }
335 }
336 /// Returns true if the object is a name.
337 pub fn is_name(&self)->bool{
338 match self {
339 PDFObject::Named(_) => true,
340 _ => false,
341 }
342 }
343 /// Returns the name if it is one.
344 pub fn as_name(&self)->Option<&String>{
345 match self {
346 PDFObject::Named(s) => Some(s),
347 _ => None,
348 }
349 }
350
351}
352
353impl Dictionary {
354 /// Creates a new dictionary with the given entries.
355 pub(crate) fn new(entries: HashMap<String, PDFObject>) -> Self {
356 Dictionary { entries }
357 }
358 /// Returns the value of the entry with the given key.
359 pub fn get(&self, key: &str)-> Option<&PDFObject> {
360 self.entries.get(key)
361 }
362
363 /// Removes the entry with the given key.
364 pub fn remove(&mut self,key:&str)->Option<PDFObject>{
365 self.entries.remove(key)
366 }
367 /// Returns true if the dictionary contains the given key.
368 pub fn contain(&self, key: &str)->bool{
369 self.entries.contains_key(key)
370 }
371
372 /// Returns the value of the entry with the given key as a name.
373 pub fn get_named_value(&self, key: &str) -> Option<&String> {
374 self.get(key).and_then(|it| it.as_name())
375 }
376
377
378 /// Returns the value of the entry with the given key as a u64.
379 pub fn get_u64_num(&self, key: &str) -> Option<u64> {
380 self.get(key)
381 .and_then(|it| it.as_number())
382 .and_then(|it| if let PDFNumber::Unsigned(num) = it { Some(*num) } else { None })
383 }
384
385 /// Returns true if the value of the entry with the given key is the given name.
386 pub fn named_value_was(&self, keys: &str,except:&str) -> bool {
387 if let Some(value) = self.get_named_value(keys) {
388 value == except
389 } else {
390 false
391 }
392 }
393
394 /// Returns the value of the entry with the given key as an array.
395 pub fn get_array_value(&self, key: &str) -> Option<&[PDFObject]> {
396 self.get(key).and_then(|it| it.as_array())
397 }
398}
399
400impl XEntry {
401 pub(crate) fn new(obj_num: u32, gen_num: u16, value: u64, using: bool) -> Self {
402 XEntry {
403 obj_num,
404 gen_num,
405 using,
406 value,
407 }
408 }
409 /// Returns the object number of the entry.
410 pub fn get_obj_num(&self)->u32{
411 self.obj_num
412 }
413 /// Returns the generation number of the entry.
414 pub fn get_gen_num(&self)->u16{
415 self.gen_num
416 }
417 /// Returns true if the entry is currently being used.
418 pub fn is_using(&self) -> bool {
419 self.using
420 }
421
422 /// Returns true if the entry is freed.
423 pub fn is_freed(&self)->bool{
424 !self.using
425 }
426 /// Returns the value of the entry.
427 pub fn get_value(&self)->u64{
428 self.value
429 }
430}
431
432impl Stream {
433 /// Creates a new stream with the given metadata and buffer.
434 ///
435 /// # Arguments
436 ///
437 /// * `metadata` - A dictionary containing stream metadata
438 /// * `buf` - The byte buffer containing the stream data
439 ///
440 /// # Returns
441 ///
442 /// A new `Stream` instance
443 pub(crate) fn new(metadata: Dictionary,buf:Vec<u8>) -> Self {
444 Stream { buf, metadata }
445 }
446
447 /// Returns a slice reference to the stream's byte buffer.
448 ///
449 /// # Returns
450 ///
451 /// A slice reference to the internal byte buffer
452 pub(crate) fn as_slice(&self) -> &[u8] {
453 &self.buf
454 }
455
456
457 pub(crate) fn get_filters(&self) -> Vec<String> {
458 match self.metadata.get(FILTER){
459 Some(PDFObject::Array(arr)) => {
460 arr.iter()
461 .filter_map(|it| it.as_name())
462 .map(|it| it.clone())
463 .collect()
464 }
465 Some(PDFObject::Named(name)) => {
466 vec![name.clone()]
467 }
468 _ => vec![]
469 }
470 }
471}
472
473impl PDFString {
474 /// Creates a new PDF string with the specified kind and buffer.
475 ///
476 /// # Arguments
477 ///
478 /// * `kind` - The encoding kind of the string (Literal or Hexadecimal)
479 /// * `buf` - The byte buffer containing the string data
480 ///
481 /// # Returns
482 ///
483 /// A new `PDFString` instance
484 pub(crate) fn new(kind: PDFStrKind, buf: Vec<u8>) -> Self {
485 PDFString { kind, buf }
486 }
487
488 /// Returns a reference to the string's byte buffer.
489 ///
490 /// # Returns
491 ///
492 /// A reference to the internal byte buffer
493 pub(crate) fn get_buf(&self) -> &Vec<u8> {
494 &self.buf
495 }
496
497 /// Returns the encoding kind of the string.
498 ///
499 /// # Returns
500 ///
501 /// A reference to the `PDFStrKind` indicating the encoding type
502 pub(crate) fn get_kind(&self) -> &PDFStrKind {
503 &self.kind
504 }
505
506 /// Returns true if the string is in UTF-16BE encoding.
507 ///
508 /// This checks if the string is hexadecimal encoded and starts with the
509 /// UTF-16BE byte order mark (BOM) 0xFE 0xFF.
510 ///
511 /// # Returns
512 ///
513 /// True if the string is UTF-16BE encoded, false otherwise
514 pub(crate) fn is_utf16be(&self) -> bool {
515 if self.kind == PDFStrKind::Literal {
516 return false;
517 }
518 self.buf.starts_with(b"\xFE\xFF")
519 }
520}