oxidize_pdf/parser/
objects.rs

1//! PDF Object Parser - Core PDF data types and parsing
2//!
3//! This module implements parsing of all PDF object types according to ISO 32000-1 Section 7.3.
4//! PDF files are built from a small set of basic object types that can be combined to form
5//! complex data structures.
6//!
7//! # Object Types
8//!
9//! PDF supports the following basic object types:
10//! - **Null**: Represents an undefined value
11//! - **Boolean**: true or false
12//! - **Integer**: Whole numbers
13//! - **Real**: Floating-point numbers
14//! - **String**: Text data (literal or hexadecimal)
15//! - **Name**: Unique atomic symbols (e.g., /Type, /Pages)
16//! - **Array**: Ordered collections of objects
17//! - **Dictionary**: Key-value mappings where keys are names
18//! - **Stream**: Dictionary + binary data
19//! - **Reference**: Indirect reference to another object
20//!
21//! # Example
22//!
23//! ```rust
24//! use oxidize_pdf::parser::objects::{PdfObject, PdfDictionary, PdfName, PdfArray};
25//!
26//! // Create a simple page dictionary
27//! let mut dict = PdfDictionary::new();
28//! dict.insert("Type".to_string(), PdfObject::Name(PdfName::new("Page".to_string())));
29//! dict.insert("MediaBox".to_string(), PdfObject::Array(PdfArray::new()));
30//!
31//! // Check dictionary type
32//! assert_eq!(dict.get_type(), Some("Page"));
33//! ```
34
35use super::lexer::{Lexer, Token};
36use super::{ParseError, ParseResult};
37use std::collections::HashMap;
38use std::io::Read;
39
/// A PDF name: an atomic symbol such as `/Type` or `/Pages`.
///
/// Names act as dictionary keys and as identifiers for many PDF constructs.
/// PDF syntax writes them with a leading slash (`/`); the wrapped `String`
/// holds the name without that slash.
///
/// # Examples
///
/// Names that show up in most PDF files:
/// - `/Type` - Object type identifier
/// - `/Pages` - Page tree root
/// - `/Font` - Font resource
/// - `/MediaBox` - Page dimensions
///
/// ```rust
/// use oxidize_pdf::parser::objects::PdfName;
///
/// let name = PdfName::new("Type".to_string());
/// assert_eq!(name.as_str(), "Type");
/// ```
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct PdfName(pub String);
61
/// A PDF string: a sequence of bytes taken from a PDF file.
///
/// PDF strings may hold arbitrary binary data in a variety of encodings.
/// In PDF syntax they appear either as literal strings `(text)` or as
/// hexadecimal strings `<48656C6C6F>`. The wrapped `Vec<u8>` holds the bytes.
///
/// # Encoding
///
/// How the bytes should be interpreted depends on where the string is used:
/// - Text strings: Usually PDFDocEncoding or UTF-16BE
/// - Font strings: Encoding specified by the font
/// - Binary data: No encoding, raw bytes
///
/// # Example
///
/// ```rust
/// use oxidize_pdf::parser::objects::PdfString;
///
/// // Build a string from UTF-8 bytes
/// let string = PdfString::new(b"Hello World".to_vec());
///
/// // Decoding as UTF-8 can fail, so handle the Result
/// if let Ok(text) = string.as_str() {
///     println!("Text: {}", text);
/// }
/// ```
#[derive(Clone, Debug, PartialEq)]
pub struct PdfString(pub Vec<u8>);
89
90/// PDF Array object - Ordered collection of PDF objects.
91///
92/// Arrays can contain any PDF object type, including other arrays and dictionaries.
93/// They are written in PDF syntax as `[item1 item2 ... itemN]`.
94///
95/// # Common Uses
96///
97/// - Rectangle specifications: `[llx lly urx ury]`
98/// - Color values: `[r g b]`
99/// - Matrix transformations: `[a b c d e f]`
100/// - Resource lists
101///
102/// # Example
103///
104/// ```rust
105/// use oxidize_pdf::parser::objects::{PdfArray, PdfObject};
106///
107/// // Create a MediaBox array [0 0 612 792]
108/// let mut media_box = PdfArray::new();
109/// media_box.push(PdfObject::Integer(0));
110/// media_box.push(PdfObject::Integer(0));
111/// media_box.push(PdfObject::Integer(612));
112/// media_box.push(PdfObject::Integer(792));
113///
114/// assert_eq!(media_box.len(), 4);
115/// ```
116#[derive(Debug, Clone, PartialEq)]
117pub struct PdfArray(pub Vec<PdfObject>);
118
119/// PDF Dictionary object - Key-value mapping with name keys.
120///
121/// Dictionaries are the primary way to represent complex data structures in PDF.
122/// Keys must be PdfName objects, values can be any PDF object type.
123///
124/// # Common Dictionary Types
125///
126/// - **Catalog**: Document root (`/Type /Catalog`)
127/// - **Page**: Individual page (`/Type /Page`)
128/// - **Font**: Font definition (`/Type /Font`)
129/// - **Stream**: Binary data with metadata
130///
131/// # Example
132///
133/// ```rust
134/// use oxidize_pdf::parser::objects::{PdfDictionary, PdfObject, PdfName};
135///
136/// let mut page_dict = PdfDictionary::new();
137/// page_dict.insert("Type".to_string(),
138///     PdfObject::Name(PdfName::new("Page".to_string())));
139/// page_dict.insert("Parent".to_string(),
140///     PdfObject::Reference(2, 0)); // Reference to pages tree
141///
142/// // Access values
143/// assert_eq!(page_dict.get_type(), Some("Page"));
144/// assert!(page_dict.contains_key("Parent"));
145/// ```
146#[derive(Debug, Clone, PartialEq)]
147pub struct PdfDictionary(pub HashMap<PdfName, PdfObject>);
148
149/// PDF Stream object - Dictionary with associated binary data.
150///
151/// Streams are used for large data blocks like page content, images, fonts, etc.
152/// The dictionary describes the stream's properties (length, filters, etc.).
153///
154/// # Structure
155///
156/// - `dict`: Stream dictionary with metadata
157/// - `data`: Raw stream bytes (possibly compressed)
158///
159/// # Common Stream Types
160///
161/// - **Content streams**: Page drawing instructions
162/// - **Image XObjects**: Embedded images
163/// - **Font programs**: Embedded font data
164/// - **Form XObjects**: Reusable graphics
165///
166/// # Example
167///
168/// ```rust
169/// use oxidize_pdf::parser::objects::{PdfStream, PdfDictionary};
170///
171/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
172/// # let stream = PdfStream { dict: PdfDictionary::new(), data: vec![] };
173/// // Get decompressed data
174/// let decoded = stream.decode()?;
175/// println!("Decoded {} bytes", decoded.len());
176///
177/// // Access raw data
178/// let raw = stream.raw_data();
179/// println!("Raw {} bytes", raw.len());
180/// # Ok(())
181/// # }
182/// ```
183#[derive(Debug, Clone, PartialEq)]
184pub struct PdfStream {
185    /// Stream dictionary containing Length, Filter, and other properties
186    pub dict: PdfDictionary,
187    /// Raw stream data (may be compressed)
188    pub data: Vec<u8>,
189}
190
191impl PdfStream {
192    /// Get the decompressed stream data.
193    ///
194    /// Automatically applies filters specified in the stream dictionary
195    /// (FlateDecode, ASCIIHexDecode, etc.) to decompress the data.
196    ///
197    /// # Returns
198    ///
199    /// The decoded/decompressed stream bytes.
200    ///
201    /// # Errors
202    ///
203    /// Returns an error if:
204    /// - Unknown filter is specified
205    /// - Decompression fails
206    /// - Filter parameters are invalid
207    ///
208    /// # Example
209    ///
210    /// ```rust,no_run
211    /// # use oxidize_pdf::parser::objects::PdfStream;
212    /// # fn example(stream: &PdfStream) -> Result<(), Box<dyn std::error::Error>> {
213    /// match stream.decode() {
214    ///     Ok(data) => println!("Decoded {} bytes", data.len()),
215    ///     Err(e) => println!("Decode error: {}", e),
216    /// }
217    /// # Ok(())
218    /// # }
219    /// ```
220    pub fn decode(&self) -> ParseResult<Vec<u8>> {
221        super::filters::decode_stream(&self.data, &self.dict)
222    }
223
224    /// Get the raw (possibly compressed) stream data.
225    ///
226    /// Returns the stream data exactly as stored in the PDF file,
227    /// without applying any filters or decompression.
228    ///
229    /// # Example
230    ///
231    /// ```rust
232    /// # use oxidize_pdf::parser::objects::PdfStream;
233    /// # let stream = PdfStream { dict: Default::default(), data: vec![1, 2, 3] };
234    /// let raw_data = stream.raw_data();
235    /// println!("Raw stream: {} bytes", raw_data.len());
236    /// ```
237    pub fn raw_data(&self) -> &[u8] {
238        &self.data
239    }
240}
241
242/// PDF Object types - The fundamental data types in PDF.
243///
244/// All data in a PDF file is represented using these basic types.
245/// Objects can be direct (embedded) or indirect (referenced).
246///
247/// # Object Types
248///
249/// - `Null` - Undefined/absent value
250/// - `Boolean` - true or false
251/// - `Integer` - Signed integers
252/// - `Real` - Floating-point numbers
253/// - `String` - Text or binary data
254/// - `Name` - Atomic symbols like /Type
255/// - `Array` - Ordered collections
256/// - `Dictionary` - Key-value maps
257/// - `Stream` - Dictionary + binary data
258/// - `Reference` - Indirect object reference (num gen R)
259///
260/// # Example
261///
262/// ```rust
263/// use oxidize_pdf::parser::objects::{PdfObject, PdfName, PdfString};
264///
265/// // Different object types
266/// let null = PdfObject::Null;
267/// let bool_val = PdfObject::Boolean(true);
268/// let int_val = PdfObject::Integer(42);
269/// let real_val = PdfObject::Real(3.14159);
270/// let name = PdfObject::Name(PdfName::new("Type".to_string()));
271/// let reference = PdfObject::Reference(10, 0); // 10 0 R
272///
273/// // Type checking
274/// assert!(int_val.as_integer().is_some());
275/// assert_eq!(int_val.as_integer(), Some(42));
276/// ```
277#[derive(Debug, Clone, PartialEq)]
278pub enum PdfObject {
279    /// Null object - represents undefined or absent values
280    Null,
281    /// Boolean value - true or false
282    Boolean(bool),
283    /// Integer number
284    Integer(i64),
285    /// Real (floating-point) number
286    Real(f64),
287    /// String data (literal or hexadecimal)
288    String(PdfString),
289    /// Name object - unique identifier
290    Name(PdfName),
291    /// Array - ordered collection of objects
292    Array(PdfArray),
293    /// Dictionary - unordered key-value pairs
294    Dictionary(PdfDictionary),
295    /// Stream - dictionary with binary data
296    Stream(PdfStream),
297    /// Indirect object reference (object_number, generation_number)
298    Reference(u32, u16),
299}
300
301impl PdfObject {
302    /// Parse a PDF object from a lexer.
303    ///
304    /// Reads tokens from the lexer and constructs the appropriate PDF object.
305    /// Handles all PDF object types including indirect references.
306    ///
307    /// # Arguments
308    ///
309    /// * `lexer` - Token source for parsing
310    ///
311    /// # Returns
312    ///
313    /// The parsed PDF object.
314    ///
315    /// # Errors
316    ///
317    /// Returns an error if:
318    /// - Invalid syntax is encountered
319    /// - Unexpected end of input
320    /// - Malformed object structure
321    ///
322    /// # Example
323    ///
324    /// ```rust,no_run
325    /// use oxidize_pdf::parser::lexer::Lexer;
326    /// use oxidize_pdf::parser::objects::PdfObject;
327    /// use std::io::Cursor;
328    ///
329    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
330    /// let input = b"42";
331    /// let mut lexer = Lexer::new(Cursor::new(input));
332    /// let obj = PdfObject::parse(&mut lexer)?;
333    /// assert_eq!(obj, PdfObject::Integer(42));
334    /// # Ok(())
335    /// # }
336    /// ```
337    pub fn parse<R: Read>(lexer: &mut Lexer<R>) -> ParseResult<Self> {
338        let token = lexer.next_token()?;
339        Self::parse_from_token(lexer, token)
340    }
341
342    /// Parse a PDF object starting from a specific token
343    fn parse_from_token<R: Read>(lexer: &mut Lexer<R>, token: Token) -> ParseResult<Self> {
344        match token {
345            Token::Null => Ok(PdfObject::Null),
346            Token::Boolean(b) => Ok(PdfObject::Boolean(b)),
347            Token::Integer(i) => {
348                // For negative numbers or large values, don't check for references
349                if !(0..=9999999).contains(&i) {
350                    return Ok(PdfObject::Integer(i));
351                }
352
353                // Check if this is part of a reference (e.g., "1 0 R")
354                match lexer.next_token()? {
355                    Token::Integer(gen) if (0..=65535).contains(&gen) => {
356                        // Might be a reference, check for 'R'
357                        match lexer.next_token()? {
358                            Token::Name(s) if s == "R" => {
359                                Ok(PdfObject::Reference(i as u32, gen as u16))
360                            }
361                            token => {
362                                // Not a reference, push back the tokens
363                                lexer.push_token(token);
364                                lexer.push_token(Token::Integer(gen));
365                                Ok(PdfObject::Integer(i))
366                            }
367                        }
368                    }
369                    token => {
370                        // Not a reference, just an integer
371                        lexer.push_token(token);
372                        Ok(PdfObject::Integer(i))
373                    }
374                }
375            }
376            Token::Real(r) => Ok(PdfObject::Real(r)),
377            Token::String(s) => Ok(PdfObject::String(PdfString(s))),
378            Token::Name(n) => Ok(PdfObject::Name(PdfName(n))),
379            Token::ArrayStart => Self::parse_array(lexer),
380            Token::DictStart => Self::parse_dictionary_or_stream(lexer),
381            Token::Comment(_) => {
382                // Skip comments and parse next object
383                Self::parse(lexer)
384            }
385            Token::StartXRef => {
386                // This is a PDF structure marker, not a parseable object
387                Err(ParseError::SyntaxError {
388                    position: 0,
389                    message: "StartXRef encountered - this is not a PDF object".to_string(),
390                })
391            }
392            Token::Eof => Err(ParseError::SyntaxError {
393                position: 0,
394                message: "Unexpected end of file".to_string(),
395            }),
396            _ => Err(ParseError::UnexpectedToken {
397                expected: "PDF object".to_string(),
398                found: format!("{token:?}"),
399            }),
400        }
401    }
402
403    /// Parse a PDF array
404    fn parse_array<R: Read>(lexer: &mut Lexer<R>) -> ParseResult<Self> {
405        let mut elements = Vec::new();
406
407        loop {
408            let token = lexer.next_token()?;
409            match token {
410                Token::ArrayEnd => break,
411                Token::Comment(_) => continue, // Skip comments
412                _ => {
413                    let obj = Self::parse_from_token(lexer, token)?;
414                    elements.push(obj);
415                }
416            }
417        }
418
419        Ok(PdfObject::Array(PdfArray(elements)))
420    }
421
422    /// Parse a PDF dictionary and check if it's followed by a stream
423    fn parse_dictionary_or_stream<R: Read>(lexer: &mut Lexer<R>) -> ParseResult<Self> {
424        let dict = Self::parse_dictionary_inner(lexer)?;
425
426        // Check if this is followed by a stream
427        loop {
428            let token = lexer.next_token()?;
429            // Check for stream
430            match token {
431                Token::Stream => {
432                    // Parse stream data
433                    let stream_data = Self::parse_stream_data(lexer, &dict)?;
434                    return Ok(PdfObject::Stream(PdfStream {
435                        dict,
436                        data: stream_data,
437                    }));
438                }
439                Token::Comment(_) => {
440                    // Skip comment and continue checking
441                    continue;
442                }
443                Token::StartXRef => {
444                    // This is the end of the PDF structure, not a stream
445                    // Push the token back for later processing
446                    // Push back StartXRef token
447                    lexer.push_token(token);
448                    return Ok(PdfObject::Dictionary(dict));
449                }
450                _ => {
451                    // Not a stream, just a dictionary
452                    // Push the token back for later processing
453                    // Push back token
454                    lexer.push_token(token);
455                    return Ok(PdfObject::Dictionary(dict));
456                }
457            }
458        }
459    }
460
461    /// Parse the inner dictionary
462    fn parse_dictionary_inner<R: Read>(lexer: &mut Lexer<R>) -> ParseResult<PdfDictionary> {
463        let mut dict = HashMap::new();
464
465        loop {
466            let token = lexer.next_token()?;
467            match token {
468                Token::DictEnd => break,
469                Token::Comment(_) => continue, // Skip comments
470                Token::Name(key) => {
471                    let value = Self::parse(lexer)?;
472                    dict.insert(PdfName(key), value);
473                }
474                _ => {
475                    return Err(ParseError::UnexpectedToken {
476                        expected: "dictionary key (name) or >>".to_string(),
477                        found: format!("{token:?}"),
478                    });
479                }
480            }
481        }
482
483        Ok(PdfDictionary(dict))
484    }
485
486    /// Parse stream data
487    fn parse_stream_data<R: Read>(
488        lexer: &mut Lexer<R>,
489        dict: &PdfDictionary,
490    ) -> ParseResult<Vec<u8>> {
491        // Get the stream length from the dictionary
492        let length = dict
493            .0
494            .get(&PdfName("Length".to_string()))
495            .ok_or_else(|| ParseError::MissingKey("Length".to_string()))?;
496
497        let length = match length {
498            PdfObject::Integer(len) => *len as usize,
499            PdfObject::Reference(_, _) => {
500                // In a full implementation, we'd need to resolve this reference
501                // For now, we'll return an error
502                return Err(ParseError::SyntaxError {
503                    position: lexer.position(),
504                    message: "Stream length references not yet supported".to_string(),
505                });
506            }
507            _ => {
508                return Err(ParseError::SyntaxError {
509                    position: lexer.position(),
510                    message: "Invalid stream length type".to_string(),
511                });
512            }
513        };
514
515        // Skip the newline after 'stream' keyword
516        lexer.read_newline()?;
517
518        // Read the actual stream data
519        let stream_data = lexer.read_bytes(length)?;
520
521        // Skip optional whitespace before endstream
522        lexer.skip_whitespace()?;
523
524        // Read 'endstream' keyword
525        let token = lexer.next_token()?;
526        match token {
527            Token::EndStream => Ok(stream_data),
528            _ => Err(ParseError::UnexpectedToken {
529                expected: "endstream".to_string(),
530                found: format!("{token:?}"),
531            }),
532        }
533    }
534
535    /// Check if this object is null.
536    ///
537    /// # Example
538    ///
539    /// ```rust
540    /// use oxidize_pdf::parser::objects::PdfObject;
541    ///
542    /// assert!(PdfObject::Null.is_null());
543    /// assert!(!PdfObject::Integer(42).is_null());
544    /// ```
545    pub fn is_null(&self) -> bool {
546        matches!(self, PdfObject::Null)
547    }
548
549    /// Get the value as a boolean if this is a Boolean object.
550    ///
551    /// # Returns
552    ///
553    /// Some(bool) if this is a Boolean object, None otherwise.
554    ///
555    /// # Example
556    ///
557    /// ```rust
558    /// use oxidize_pdf::parser::objects::PdfObject;
559    ///
560    /// let obj = PdfObject::Boolean(true);
561    /// assert_eq!(obj.as_bool(), Some(true));
562    ///
563    /// let obj = PdfObject::Integer(1);
564    /// assert_eq!(obj.as_bool(), None);
565    /// ```
566    pub fn as_bool(&self) -> Option<bool> {
567        match self {
568            PdfObject::Boolean(b) => Some(*b),
569            _ => None,
570        }
571    }
572
573    /// Get as integer
574    pub fn as_integer(&self) -> Option<i64> {
575        match self {
576            PdfObject::Integer(i) => Some(*i),
577            _ => None,
578        }
579    }
580
581    /// Get the value as a real number.
582    ///
583    /// Returns the value for both Real and Integer objects,
584    /// converting integers to floating-point.
585    ///
586    /// # Returns
587    ///
588    /// Some(f64) if this is a numeric object, None otherwise.
589    ///
590    /// # Example
591    ///
592    /// ```rust
593    /// use oxidize_pdf::parser::objects::PdfObject;
594    ///
595    /// let real_obj = PdfObject::Real(3.14);
596    /// assert_eq!(real_obj.as_real(), Some(3.14));
597    ///
598    /// let int_obj = PdfObject::Integer(42);
599    /// assert_eq!(int_obj.as_real(), Some(42.0));
600    /// ```
601    pub fn as_real(&self) -> Option<f64> {
602        match self {
603            PdfObject::Real(r) => Some(*r),
604            PdfObject::Integer(i) => Some(*i as f64),
605            _ => None,
606        }
607    }
608
609    /// Get as string
610    pub fn as_string(&self) -> Option<&PdfString> {
611        match self {
612            PdfObject::String(s) => Some(s),
613            _ => None,
614        }
615    }
616
617    /// Get as name
618    pub fn as_name(&self) -> Option<&PdfName> {
619        match self {
620            PdfObject::Name(n) => Some(n),
621            _ => None,
622        }
623    }
624
625    /// Get as array
626    pub fn as_array(&self) -> Option<&PdfArray> {
627        match self {
628            PdfObject::Array(a) => Some(a),
629            _ => None,
630        }
631    }
632
633    /// Get as dictionary
634    pub fn as_dict(&self) -> Option<&PdfDictionary> {
635        match self {
636            PdfObject::Dictionary(d) => Some(d),
637            PdfObject::Stream(s) => Some(&s.dict),
638            _ => None,
639        }
640    }
641
642    /// Get as stream
643    pub fn as_stream(&self) -> Option<&PdfStream> {
644        match self {
645            PdfObject::Stream(s) => Some(s),
646            _ => None,
647        }
648    }
649
650    /// Get the object reference if this is a Reference object.
651    ///
652    /// # Returns
653    ///
654    /// Some((object_number, generation_number)) if this is a Reference, None otherwise.
655    ///
656    /// # Example
657    ///
658    /// ```rust
659    /// use oxidize_pdf::parser::objects::PdfObject;
660    ///
661    /// let obj = PdfObject::Reference(10, 0);
662    /// assert_eq!(obj.as_reference(), Some((10, 0)));
663    ///
664    /// // Use for resolving references
665    /// if let Some((obj_num, gen_num)) = obj.as_reference() {
666    ///     println!("Reference to {} {} R", obj_num, gen_num);
667    /// }
668    /// ```
669    pub fn as_reference(&self) -> Option<(u32, u16)> {
670        match self {
671            PdfObject::Reference(obj, gen) => Some((*obj, *gen)),
672            _ => None,
673        }
674    }
675}
676
677impl Default for PdfDictionary {
678    fn default() -> Self {
679        Self::new()
680    }
681}
682
683impl PdfDictionary {
684    /// Create a new empty dictionary.
685    ///
686    /// # Example
687    ///
688    /// ```rust
689    /// use oxidize_pdf::parser::objects::{PdfDictionary, PdfObject, PdfName};
690    ///
691    /// let mut dict = PdfDictionary::new();
692    /// dict.insert("Type".to_string(), PdfObject::Name(PdfName::new("Font".to_string())));
693    /// ```
694    pub fn new() -> Self {
695        PdfDictionary(HashMap::new())
696    }
697
698    /// Get a value by key name.
699    ///
700    /// # Arguments
701    ///
702    /// * `key` - The key name (without leading slash)
703    ///
704    /// # Returns
705    ///
706    /// Reference to the value if the key exists, None otherwise.
707    ///
708    /// # Example
709    ///
710    /// ```rust
711    /// use oxidize_pdf::parser::objects::{PdfDictionary, PdfObject};
712    ///
713    /// let mut dict = PdfDictionary::new();
714    /// dict.insert("Length".to_string(), PdfObject::Integer(1000));
715    ///
716    /// if let Some(length) = dict.get("Length").and_then(|o| o.as_integer()) {
717    ///     println!("Stream length: {}", length);
718    /// }
719    /// ```
720    pub fn get(&self, key: &str) -> Option<&PdfObject> {
721        self.0.get(&PdfName(key.to_string()))
722    }
723
724    /// Insert a key-value pair
725    pub fn insert(&mut self, key: String, value: PdfObject) {
726        self.0.insert(PdfName(key), value);
727    }
728
729    /// Check if dictionary contains a key
730    pub fn contains_key(&self, key: &str) -> bool {
731        self.0.contains_key(&PdfName(key.to_string()))
732    }
733
734    /// Get the dictionary type (value of /Type key).
735    ///
736    /// Many PDF dictionaries have a /Type entry that identifies their purpose.
737    ///
738    /// # Returns
739    ///
740    /// The type name if present, None otherwise.
741    ///
742    /// # Common Types
743    ///
744    /// - "Catalog" - Document catalog
745    /// - "Page" - Page object
746    /// - "Pages" - Page tree node
747    /// - "Font" - Font dictionary
748    /// - "XObject" - External object
749    ///
750    /// # Example
751    ///
752    /// ```rust
753    /// use oxidize_pdf::parser::objects::{PdfDictionary, PdfObject, PdfName};
754    ///
755    /// let mut dict = PdfDictionary::new();
756    /// dict.insert("Type".to_string(), PdfObject::Name(PdfName::new("Page".to_string())));
757    /// assert_eq!(dict.get_type(), Some("Page"));
758    /// ```
759    pub fn get_type(&self) -> Option<&str> {
760        self.get("Type")
761            .and_then(|obj| obj.as_name())
762            .map(|n| n.0.as_str())
763    }
764}
765
766impl Default for PdfArray {
767    fn default() -> Self {
768        Self::new()
769    }
770}
771
772impl PdfArray {
773    /// Create a new empty array
774    pub fn new() -> Self {
775        PdfArray(Vec::new())
776    }
777
778    /// Get array length
779    pub fn len(&self) -> usize {
780        self.0.len()
781    }
782
783    /// Check if array is empty
784    pub fn is_empty(&self) -> bool {
785        self.0.is_empty()
786    }
787
788    /// Get element at index.
789    ///
790    /// # Arguments
791    ///
792    /// * `index` - Zero-based index
793    ///
794    /// # Returns
795    ///
796    /// Reference to the element if index is valid, None otherwise.
797    ///
798    /// # Example
799    ///
800    /// ```rust
801    /// use oxidize_pdf::parser::objects::{PdfArray, PdfObject};
802    ///
803    /// let mut array = PdfArray::new();
804    /// array.push(PdfObject::Integer(10));
805    /// array.push(PdfObject::Integer(20));
806    ///
807    /// assert_eq!(array.get(0).and_then(|o| o.as_integer()), Some(10));
808    /// assert_eq!(array.get(1).and_then(|o| o.as_integer()), Some(20));
809    /// assert!(array.get(2).is_none());
810    /// ```
811    pub fn get(&self, index: usize) -> Option<&PdfObject> {
812        self.0.get(index)
813    }
814
815    /// Push an element
816    pub fn push(&mut self, obj: PdfObject) {
817        self.0.push(obj);
818    }
819}
820
821impl PdfString {
822    /// Create a new PDF string
823    pub fn new(data: Vec<u8>) -> Self {
824        PdfString(data)
825    }
826
827    /// Get as UTF-8 string if possible.
828    ///
829    /// Attempts to decode the string bytes as UTF-8.
830    /// Note that PDF strings may use other encodings.
831    ///
832    /// # Returns
833    ///
834    /// Ok(&str) if valid UTF-8, Err otherwise.
835    ///
836    /// # Example
837    ///
838    /// ```rust
839    /// use oxidize_pdf::parser::objects::PdfString;
840    ///
841    /// let string = PdfString::new(b"Hello".to_vec());
842    /// assert_eq!(string.as_str(), Ok("Hello"));
843    ///
844    /// let binary = PdfString::new(vec![0xFF, 0xFE]);
845    /// assert!(binary.as_str().is_err());
846    /// ```
847    pub fn as_str(&self) -> Result<&str, std::str::Utf8Error> {
848        std::str::from_utf8(&self.0)
849    }
850
851    /// Get as bytes
852    pub fn as_bytes(&self) -> &[u8] {
853        &self.0
854    }
855}
856
857impl PdfName {
858    /// Create a new PDF name
859    pub fn new(name: String) -> Self {
860        PdfName(name)
861    }
862
863    /// Get the name as a string
864    pub fn as_str(&self) -> &str {
865        &self.0
866    }
867}
868
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Scalars, a name and a literal string are parsed one after another
    /// from a single input.
    #[test]
    fn test_parse_simple_objects() {
        let input = b"null true false 123 -456 3.14 /Name (Hello)";
        let mut lexer = Lexer::new(Cursor::new(input));

        // Expected objects, in the order they appear in the input
        let expected = vec![
            PdfObject::Null,
            PdfObject::Boolean(true),
            PdfObject::Boolean(false),
            PdfObject::Integer(123),
            PdfObject::Integer(-456),
            PdfObject::Real(3.14),
            PdfObject::Name(PdfName("Name".to_string())),
            PdfObject::String(PdfString(b"Hello".to_vec())),
        ];

        for want in expected {
            assert_eq!(PdfObject::parse(&mut lexer).unwrap(), want);
        }
    }

    /// An array of integers, a name and a string keeps all five elements.
    #[test]
    fn test_parse_array() {
        // Test simple array without potential references
        let input = b"[100 200 300 /Name (test)]";
        let mut lexer = Lexer::new(Cursor::new(input));

        let parsed = PdfObject::parse(&mut lexer).unwrap();
        let items = parsed.as_array().unwrap();

        assert_eq!(items.len(), 5);
        for (idx, want) in [100, 200, 300].iter().enumerate() {
            assert_eq!(items.get(idx).and_then(|o| o.as_integer()), Some(*want));
        }
        assert_eq!(items.get(3).unwrap().as_name().unwrap().as_str(), "Name");
        assert_eq!(
            items.get(4).unwrap().as_string().unwrap().as_bytes(),
            b"test"
        );
    }

    /// `num gen R` triples inside an array each collapse into one reference.
    #[test]
    fn test_parse_array_with_references() {
        // Test array with references
        let input = b"[1 0 R 2 0 R]";
        let mut lexer = Lexer::new(Cursor::new(input));

        let parsed = PdfObject::parse(&mut lexer).unwrap();
        let items = parsed.as_array().unwrap();

        assert_eq!(items.len(), 2);
        for idx in 0..2 {
            assert!(items.get(idx).unwrap().as_reference().is_some());
        }
    }

    /// A page dictionary exposes its type, parent reference and media box.
    #[test]
    fn test_parse_dictionary() {
        let input = b"<< /Type /Page /Parent 1 0 R /MediaBox [0 0 612 792] >>";
        let mut lexer = Lexer::new(Cursor::new(input));

        let parsed = PdfObject::parse(&mut lexer).unwrap();
        let page = parsed.as_dict().unwrap();

        assert_eq!(page.get_type(), Some("Page"));
        assert!(page.get("Parent").unwrap().as_reference().is_some());
        assert!(page.get("MediaBox").unwrap().as_array().is_some());
    }
}
oxidize_pdf/parser/objects.rs

oxidize_pdf/parser/
objects.rs