oxidize_pdf/parser/objects.rs
1//! PDF Object Parser - Core PDF data types and parsing
2//!
3//! This module implements parsing of all PDF object types according to ISO 32000-1 Section 7.3.
4//! PDF files are built from a small set of basic object types that can be combined to form
5//! complex data structures.
6//!
7//! # Object Types
8//!
9//! PDF supports the following basic object types:
10//! - **Null**: Represents an undefined value
11//! - **Boolean**: true or false
12//! - **Integer**: Whole numbers
13//! - **Real**: Floating-point numbers
14//! - **String**: Text data (literal or hexadecimal)
15//! - **Name**: Unique atomic symbols (e.g., /Type, /Pages)
16//! - **Array**: Ordered collections of objects
17//! - **Dictionary**: Key-value mappings where keys are names
18//! - **Stream**: Dictionary + binary data
19//! - **Reference**: Indirect reference to another object
20//!
21//! # Example
22//!
23//! ```rust
24//! use oxidize_pdf::parser::objects::{PdfObject, PdfDictionary, PdfName, PdfArray};
25//!
26//! // Create a simple page dictionary
27//! let mut dict = PdfDictionary::new();
28//! dict.insert("Type".to_string(), PdfObject::Name(PdfName::new("Page".to_string())));
29//! dict.insert("MediaBox".to_string(), PdfObject::Array(PdfArray::new()));
30//!
31//! // Check dictionary type
32//! assert_eq!(dict.get_type(), Some("Page"));
33//! ```
34
35use super::lexer::{Lexer, Token};
36use super::{ParseError, ParseResult};
37use std::collections::HashMap;
38use std::io::Read;
39
/// PDF Name object - Unique atomic symbols in PDF.
///
/// Names are used as keys in dictionaries and to identify various PDF constructs.
/// They are written with a leading slash (/) in PDF syntax but stored without it.
///
/// The type is a thin newtype over a public `String`. It derives `Eq` and `Hash`,
/// which is what lets it act as the key type of the `HashMap` inside
/// [`PdfDictionary`].
///
/// # Examples
///
/// Common PDF names:
/// - `/Type` - Object type identifier
/// - `/Pages` - Page tree root
/// - `/Font` - Font resource
/// - `/MediaBox` - Page dimensions
///
/// ```rust
/// use oxidize_pdf::parser::objects::PdfName;
///
/// let name = PdfName::new("Type".to_string());
/// assert_eq!(name.as_str(), "Type");
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PdfName(pub String);
61
/// PDF String object - Text data in PDF files.
///
/// PDF strings can contain arbitrary binary data and use various encodings.
/// They can be written as literal strings `(text)` or hexadecimal strings `<48656C6C6F>`.
///
/// The type is a newtype over the raw bytes; no decoding is performed on
/// construction. Equality and hashing operate on those raw bytes, so two strings
/// compare equal only when their byte sequences are identical. Because the
/// wrapper derives `Eq` and `Hash`, it can be stored in a `HashSet` or used as a
/// `HashMap` key.
///
/// # Encoding
///
/// String encoding depends on context:
/// - Text strings: Usually PDFDocEncoding or UTF-16BE
/// - Font strings: Encoding specified by the font
/// - Binary data: No encoding, raw bytes
///
/// # Example
///
/// ```rust
/// use oxidize_pdf::parser::objects::PdfString;
///
/// // Create from raw bytes (here: ASCII, which is also valid UTF-8)
/// let string = PdfString::new(b"Hello World".to_vec());
///
/// // Try to decode as UTF-8
/// if let Ok(text) = string.as_str() {
///     println!("Text: {}", text);
/// }
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PdfString(pub Vec<u8>);
89
/// PDF Array object - Ordered collection of PDF objects.
///
/// Arrays can contain any PDF object type, including other arrays and dictionaries.
/// They are written in PDF syntax as `[item1 item2 ... itemN]`.
///
/// The type is a newtype over a public `Vec<PdfObject>`, so element order is the
/// insertion (parse) order.
///
/// # Common Uses
///
/// - Rectangle specifications: `[llx lly urx ury]`
/// - Color values: `[r g b]`
/// - Matrix transformations: `[a b c d e f]`
/// - Resource lists
///
/// # Example
///
/// ```rust
/// use oxidize_pdf::parser::objects::{PdfArray, PdfObject};
///
/// // Create a MediaBox array [0 0 612 792]
/// let mut media_box = PdfArray::new();
/// media_box.push(PdfObject::Integer(0));
/// media_box.push(PdfObject::Integer(0));
/// media_box.push(PdfObject::Integer(612));
/// media_box.push(PdfObject::Integer(792));
///
/// assert_eq!(media_box.len(), 4);
/// ```
#[derive(Debug, Clone, PartialEq)]
pub struct PdfArray(pub Vec<PdfObject>);
118
/// PDF Dictionary object - Key-value mapping with name keys.
///
/// Dictionaries are the primary way to represent complex data structures in PDF.
/// Keys must be PdfName objects, values can be any PDF object type.
///
/// The entries live in a public `HashMap<PdfName, PdfObject>`. A `HashMap` does
/// not keep insertion order, so the order in which keys appeared in the file is
/// not retained.
///
/// # Common Dictionary Types
///
/// - **Catalog**: Document root (`/Type /Catalog`)
/// - **Page**: Individual page (`/Type /Page`)
/// - **Font**: Font definition (`/Type /Font`)
/// - **Stream**: Binary data with metadata
///
/// # Example
///
/// ```rust
/// use oxidize_pdf::parser::objects::{PdfDictionary, PdfObject, PdfName};
///
/// let mut page_dict = PdfDictionary::new();
/// page_dict.insert("Type".to_string(),
///     PdfObject::Name(PdfName::new("Page".to_string())));
/// page_dict.insert("Parent".to_string(),
///     PdfObject::Reference(2, 0)); // Reference to pages tree
///
/// // Access values
/// assert_eq!(page_dict.get_type(), Some("Page"));
/// assert!(page_dict.contains_key("Parent"));
/// ```
#[derive(Debug, Clone, PartialEq)]
pub struct PdfDictionary(pub HashMap<PdfName, PdfObject>);
148
/// PDF Stream object - Dictionary with associated binary data.
///
/// Streams are used for large data blocks like page content, images, fonts, etc.
/// The dictionary describes the stream's properties (length, filters, etc.).
///
/// # Structure
///
/// - `dict`: Stream dictionary with metadata
/// - `data`: Raw stream bytes (possibly compressed)
///
/// Both fields are public, so a stream can be built directly with a struct
/// literal, as the example below does.
///
/// # Common Stream Types
///
/// - **Content streams**: Page drawing instructions
/// - **Image XObjects**: Embedded images
/// - **Font programs**: Embedded font data
/// - **Form XObjects**: Reusable graphics
///
/// # Example
///
/// ```rust
/// use oxidize_pdf::parser::objects::{PdfStream, PdfDictionary};
///
/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
/// # let stream = PdfStream { dict: PdfDictionary::new(), data: vec![] };
/// // Get decompressed data
/// let decoded = stream.decode()?;
/// println!("Decoded {} bytes", decoded.len());
///
/// // Access raw data
/// let raw = stream.raw_data();
/// println!("Raw {} bytes", raw.len());
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone, PartialEq)]
pub struct PdfStream {
    /// Stream dictionary containing Length, Filter, and other properties
    pub dict: PdfDictionary,
    /// Raw stream data (may be compressed), exactly as read by the parser
    pub data: Vec<u8>,
}
190
191impl PdfStream {
192 /// Get the decompressed stream data.
193 ///
194 /// Automatically applies filters specified in the stream dictionary
195 /// (FlateDecode, ASCIIHexDecode, etc.) to decompress the data.
196 ///
197 /// # Returns
198 ///
199 /// The decoded/decompressed stream bytes.
200 ///
201 /// # Errors
202 ///
203 /// Returns an error if:
204 /// - Unknown filter is specified
205 /// - Decompression fails
206 /// - Filter parameters are invalid
207 ///
208 /// # Example
209 ///
210 /// ```rust,no_run
211 /// # use oxidize_pdf::parser::objects::PdfStream;
212 /// # fn example(stream: &PdfStream) -> Result<(), Box<dyn std::error::Error>> {
213 /// match stream.decode() {
214 /// Ok(data) => println!("Decoded {} bytes", data.len()),
215 /// Err(e) => println!("Decode error: {}", e),
216 /// }
217 /// # Ok(())
218 /// # }
219 /// ```
220 pub fn decode(&self) -> ParseResult<Vec<u8>> {
221 super::filters::decode_stream(&self.data, &self.dict)
222 }
223
224 /// Get the raw (possibly compressed) stream data.
225 ///
226 /// Returns the stream data exactly as stored in the PDF file,
227 /// without applying any filters or decompression.
228 ///
229 /// # Example
230 ///
231 /// ```rust
232 /// # use oxidize_pdf::parser::objects::PdfStream;
233 /// # let stream = PdfStream { dict: Default::default(), data: vec![1, 2, 3] };
234 /// let raw_data = stream.raw_data();
235 /// println!("Raw stream: {} bytes", raw_data.len());
236 /// ```
237 pub fn raw_data(&self) -> &[u8] {
238 &self.data
239 }
240}
241
/// PDF Object types - The fundamental data types in PDF.
///
/// All data in a PDF file is represented using these basic types.
/// Objects can be direct (embedded) or indirect (referenced).
///
/// The enum derives `PartialEq` but not `Eq`: the `Real` variant holds an `f64`,
/// which has no total equality.
///
/// # Object Types
///
/// - `Null` - Undefined/absent value
/// - `Boolean` - true or false
/// - `Integer` - Signed integers
/// - `Real` - Floating-point numbers
/// - `String` - Text or binary data
/// - `Name` - Atomic symbols like /Type
/// - `Array` - Ordered collections
/// - `Dictionary` - Key-value maps
/// - `Stream` - Dictionary + binary data
/// - `Reference` - Indirect object reference (num gen R)
///
/// # Example
///
/// ```rust
/// use oxidize_pdf::parser::objects::{PdfObject, PdfName, PdfString};
///
/// // Different object types
/// let null = PdfObject::Null;
/// let bool_val = PdfObject::Boolean(true);
/// let int_val = PdfObject::Integer(42);
/// let real_val = PdfObject::Real(3.14159);
/// let name = PdfObject::Name(PdfName::new("Type".to_string()));
/// let reference = PdfObject::Reference(10, 0); // 10 0 R
///
/// // Type checking
/// assert!(int_val.as_integer().is_some());
/// assert_eq!(int_val.as_integer(), Some(42));
/// ```
#[derive(Debug, Clone, PartialEq)]
pub enum PdfObject {
    /// Null object - represents undefined or absent values
    Null,
    /// Boolean value - true or false
    Boolean(bool),
    /// Integer number (stored as a signed 64-bit value)
    Integer(i64),
    /// Real (floating-point) number (stored as a 64-bit float)
    Real(f64),
    /// String data (literal or hexadecimal), kept as raw bytes
    String(PdfString),
    /// Name object - unique identifier, stored without the leading slash
    Name(PdfName),
    /// Array - ordered collection of objects
    Array(PdfArray),
    /// Dictionary - unordered key-value pairs
    Dictionary(PdfDictionary),
    /// Stream - dictionary with binary data
    Stream(PdfStream),
    /// Indirect object reference (object_number, generation_number)
    Reference(u32, u16),
}
300
301impl PdfObject {
302 /// Parse a PDF object from a lexer.
303 ///
304 /// Reads tokens from the lexer and constructs the appropriate PDF object.
305 /// Handles all PDF object types including indirect references.
306 ///
307 /// # Arguments
308 ///
309 /// * `lexer` - Token source for parsing
310 ///
311 /// # Returns
312 ///
313 /// The parsed PDF object.
314 ///
315 /// # Errors
316 ///
317 /// Returns an error if:
318 /// - Invalid syntax is encountered
319 /// - Unexpected end of input
320 /// - Malformed object structure
321 ///
322 /// # Example
323 ///
324 /// ```rust,no_run
325 /// use oxidize_pdf::parser::lexer::Lexer;
326 /// use oxidize_pdf::parser::objects::PdfObject;
327 /// use std::io::Cursor;
328 ///
329 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
330 /// let input = b"42";
331 /// let mut lexer = Lexer::new(Cursor::new(input));
332 /// let obj = PdfObject::parse(&mut lexer)?;
333 /// assert_eq!(obj, PdfObject::Integer(42));
334 /// # Ok(())
335 /// # }
336 /// ```
337 pub fn parse<R: Read>(lexer: &mut Lexer<R>) -> ParseResult<Self> {
338 let token = lexer.next_token()?;
339 Self::parse_from_token(lexer, token)
340 }
341
342 /// Parse a PDF object starting from a specific token
343 fn parse_from_token<R: Read>(lexer: &mut Lexer<R>, token: Token) -> ParseResult<Self> {
344 match token {
345 Token::Null => Ok(PdfObject::Null),
346 Token::Boolean(b) => Ok(PdfObject::Boolean(b)),
347 Token::Integer(i) => {
348 // For negative numbers or large values, don't check for references
349 if !(0..=9999999).contains(&i) {
350 return Ok(PdfObject::Integer(i));
351 }
352
353 // Check if this is part of a reference (e.g., "1 0 R")
354 match lexer.next_token()? {
355 Token::Integer(gen) if (0..=65535).contains(&gen) => {
356 // Might be a reference, check for 'R'
357 match lexer.next_token()? {
358 Token::Name(s) if s == "R" => {
359 Ok(PdfObject::Reference(i as u32, gen as u16))
360 }
361 token => {
362 // Not a reference, push back the tokens
363 lexer.push_token(token);
364 lexer.push_token(Token::Integer(gen));
365 Ok(PdfObject::Integer(i))
366 }
367 }
368 }
369 token => {
370 // Not a reference, just an integer
371 lexer.push_token(token);
372 Ok(PdfObject::Integer(i))
373 }
374 }
375 }
376 Token::Real(r) => Ok(PdfObject::Real(r)),
377 Token::String(s) => Ok(PdfObject::String(PdfString(s))),
378 Token::Name(n) => Ok(PdfObject::Name(PdfName(n))),
379 Token::ArrayStart => Self::parse_array(lexer),
380 Token::DictStart => Self::parse_dictionary_or_stream(lexer),
381 Token::Comment(_) => {
382 // Skip comments and parse next object
383 Self::parse(lexer)
384 }
385 Token::StartXRef => {
386 // This is a PDF structure marker, not a parseable object
387 Err(ParseError::SyntaxError {
388 position: 0,
389 message: "StartXRef encountered - this is not a PDF object".to_string(),
390 })
391 }
392 Token::Eof => Err(ParseError::SyntaxError {
393 position: 0,
394 message: "Unexpected end of file".to_string(),
395 }),
396 _ => Err(ParseError::UnexpectedToken {
397 expected: "PDF object".to_string(),
398 found: format!("{token:?}"),
399 }),
400 }
401 }
402
403 /// Parse a PDF array
404 fn parse_array<R: Read>(lexer: &mut Lexer<R>) -> ParseResult<Self> {
405 let mut elements = Vec::new();
406
407 loop {
408 let token = lexer.next_token()?;
409 match token {
410 Token::ArrayEnd => break,
411 Token::Comment(_) => continue, // Skip comments
412 _ => {
413 let obj = Self::parse_from_token(lexer, token)?;
414 elements.push(obj);
415 }
416 }
417 }
418
419 Ok(PdfObject::Array(PdfArray(elements)))
420 }
421
422 /// Parse a PDF dictionary and check if it's followed by a stream
423 fn parse_dictionary_or_stream<R: Read>(lexer: &mut Lexer<R>) -> ParseResult<Self> {
424 let dict = Self::parse_dictionary_inner(lexer)?;
425
426 // Check if this is followed by a stream
427 loop {
428 let token = lexer.next_token()?;
429 // Check for stream
430 match token {
431 Token::Stream => {
432 // Parse stream data
433 let stream_data = Self::parse_stream_data(lexer, &dict)?;
434 return Ok(PdfObject::Stream(PdfStream {
435 dict,
436 data: stream_data,
437 }));
438 }
439 Token::Comment(_) => {
440 // Skip comment and continue checking
441 continue;
442 }
443 Token::StartXRef => {
444 // This is the end of the PDF structure, not a stream
445 // Push the token back for later processing
446 // Push back StartXRef token
447 lexer.push_token(token);
448 return Ok(PdfObject::Dictionary(dict));
449 }
450 _ => {
451 // Not a stream, just a dictionary
452 // Push the token back for later processing
453 // Push back token
454 lexer.push_token(token);
455 return Ok(PdfObject::Dictionary(dict));
456 }
457 }
458 }
459 }
460
461 /// Parse the inner dictionary
462 fn parse_dictionary_inner<R: Read>(lexer: &mut Lexer<R>) -> ParseResult<PdfDictionary> {
463 let mut dict = HashMap::new();
464
465 loop {
466 let token = lexer.next_token()?;
467 match token {
468 Token::DictEnd => break,
469 Token::Comment(_) => continue, // Skip comments
470 Token::Name(key) => {
471 let value = Self::parse(lexer)?;
472 dict.insert(PdfName(key), value);
473 }
474 _ => {
475 return Err(ParseError::UnexpectedToken {
476 expected: "dictionary key (name) or >>".to_string(),
477 found: format!("{token:?}"),
478 });
479 }
480 }
481 }
482
483 Ok(PdfDictionary(dict))
484 }
485
486 /// Parse stream data
487 fn parse_stream_data<R: Read>(
488 lexer: &mut Lexer<R>,
489 dict: &PdfDictionary,
490 ) -> ParseResult<Vec<u8>> {
491 // Get the stream length from the dictionary
492 let length = dict
493 .0
494 .get(&PdfName("Length".to_string()))
495 .ok_or_else(|| ParseError::MissingKey("Length".to_string()))?;
496
497 let length = match length {
498 PdfObject::Integer(len) => *len as usize,
499 PdfObject::Reference(_, _) => {
500 // In a full implementation, we'd need to resolve this reference
501 // For now, we'll return an error
502 return Err(ParseError::SyntaxError {
503 position: lexer.position(),
504 message: "Stream length references not yet supported".to_string(),
505 });
506 }
507 _ => {
508 return Err(ParseError::SyntaxError {
509 position: lexer.position(),
510 message: "Invalid stream length type".to_string(),
511 });
512 }
513 };
514
515 // Skip the newline after 'stream' keyword
516 lexer.read_newline()?;
517
518 // Read the actual stream data
519 let stream_data = lexer.read_bytes(length)?;
520
521 // Skip optional whitespace before endstream
522 lexer.skip_whitespace()?;
523
524 // Read 'endstream' keyword
525 let token = lexer.next_token()?;
526 match token {
527 Token::EndStream => Ok(stream_data),
528 _ => Err(ParseError::UnexpectedToken {
529 expected: "endstream".to_string(),
530 found: format!("{token:?}"),
531 }),
532 }
533 }
534
535 /// Check if this object is null.
536 ///
537 /// # Example
538 ///
539 /// ```rust
540 /// use oxidize_pdf::parser::objects::PdfObject;
541 ///
542 /// assert!(PdfObject::Null.is_null());
543 /// assert!(!PdfObject::Integer(42).is_null());
544 /// ```
545 pub fn is_null(&self) -> bool {
546 matches!(self, PdfObject::Null)
547 }
548
549 /// Get the value as a boolean if this is a Boolean object.
550 ///
551 /// # Returns
552 ///
553 /// Some(bool) if this is a Boolean object, None otherwise.
554 ///
555 /// # Example
556 ///
557 /// ```rust
558 /// use oxidize_pdf::parser::objects::PdfObject;
559 ///
560 /// let obj = PdfObject::Boolean(true);
561 /// assert_eq!(obj.as_bool(), Some(true));
562 ///
563 /// let obj = PdfObject::Integer(1);
564 /// assert_eq!(obj.as_bool(), None);
565 /// ```
566 pub fn as_bool(&self) -> Option<bool> {
567 match self {
568 PdfObject::Boolean(b) => Some(*b),
569 _ => None,
570 }
571 }
572
573 /// Get as integer
574 pub fn as_integer(&self) -> Option<i64> {
575 match self {
576 PdfObject::Integer(i) => Some(*i),
577 _ => None,
578 }
579 }
580
581 /// Get the value as a real number.
582 ///
583 /// Returns the value for both Real and Integer objects,
584 /// converting integers to floating-point.
585 ///
586 /// # Returns
587 ///
588 /// Some(f64) if this is a numeric object, None otherwise.
589 ///
590 /// # Example
591 ///
592 /// ```rust
593 /// use oxidize_pdf::parser::objects::PdfObject;
594 ///
595 /// let real_obj = PdfObject::Real(3.14);
596 /// assert_eq!(real_obj.as_real(), Some(3.14));
597 ///
598 /// let int_obj = PdfObject::Integer(42);
599 /// assert_eq!(int_obj.as_real(), Some(42.0));
600 /// ```
601 pub fn as_real(&self) -> Option<f64> {
602 match self {
603 PdfObject::Real(r) => Some(*r),
604 PdfObject::Integer(i) => Some(*i as f64),
605 _ => None,
606 }
607 }
608
609 /// Get as string
610 pub fn as_string(&self) -> Option<&PdfString> {
611 match self {
612 PdfObject::String(s) => Some(s),
613 _ => None,
614 }
615 }
616
617 /// Get as name
618 pub fn as_name(&self) -> Option<&PdfName> {
619 match self {
620 PdfObject::Name(n) => Some(n),
621 _ => None,
622 }
623 }
624
625 /// Get as array
626 pub fn as_array(&self) -> Option<&PdfArray> {
627 match self {
628 PdfObject::Array(a) => Some(a),
629 _ => None,
630 }
631 }
632
633 /// Get as dictionary
634 pub fn as_dict(&self) -> Option<&PdfDictionary> {
635 match self {
636 PdfObject::Dictionary(d) => Some(d),
637 PdfObject::Stream(s) => Some(&s.dict),
638 _ => None,
639 }
640 }
641
642 /// Get as stream
643 pub fn as_stream(&self) -> Option<&PdfStream> {
644 match self {
645 PdfObject::Stream(s) => Some(s),
646 _ => None,
647 }
648 }
649
650 /// Get the object reference if this is a Reference object.
651 ///
652 /// # Returns
653 ///
654 /// Some((object_number, generation_number)) if this is a Reference, None otherwise.
655 ///
656 /// # Example
657 ///
658 /// ```rust
659 /// use oxidize_pdf::parser::objects::PdfObject;
660 ///
661 /// let obj = PdfObject::Reference(10, 0);
662 /// assert_eq!(obj.as_reference(), Some((10, 0)));
663 ///
664 /// // Use for resolving references
665 /// if let Some((obj_num, gen_num)) = obj.as_reference() {
666 /// println!("Reference to {} {} R", obj_num, gen_num);
667 /// }
668 /// ```
669 pub fn as_reference(&self) -> Option<(u32, u16)> {
670 match self {
671 PdfObject::Reference(obj, gen) => Some((*obj, *gen)),
672 _ => None,
673 }
674 }
675}
676
677impl Default for PdfDictionary {
678 fn default() -> Self {
679 Self::new()
680 }
681}
682
683impl PdfDictionary {
684 /// Create a new empty dictionary.
685 ///
686 /// # Example
687 ///
688 /// ```rust
689 /// use oxidize_pdf::parser::objects::{PdfDictionary, PdfObject, PdfName};
690 ///
691 /// let mut dict = PdfDictionary::new();
692 /// dict.insert("Type".to_string(), PdfObject::Name(PdfName::new("Font".to_string())));
693 /// ```
694 pub fn new() -> Self {
695 PdfDictionary(HashMap::new())
696 }
697
698 /// Get a value by key name.
699 ///
700 /// # Arguments
701 ///
702 /// * `key` - The key name (without leading slash)
703 ///
704 /// # Returns
705 ///
706 /// Reference to the value if the key exists, None otherwise.
707 ///
708 /// # Example
709 ///
710 /// ```rust
711 /// use oxidize_pdf::parser::objects::{PdfDictionary, PdfObject};
712 ///
713 /// let mut dict = PdfDictionary::new();
714 /// dict.insert("Length".to_string(), PdfObject::Integer(1000));
715 ///
716 /// if let Some(length) = dict.get("Length").and_then(|o| o.as_integer()) {
717 /// println!("Stream length: {}", length);
718 /// }
719 /// ```
720 pub fn get(&self, key: &str) -> Option<&PdfObject> {
721 self.0.get(&PdfName(key.to_string()))
722 }
723
724 /// Insert a key-value pair
725 pub fn insert(&mut self, key: String, value: PdfObject) {
726 self.0.insert(PdfName(key), value);
727 }
728
729 /// Check if dictionary contains a key
730 pub fn contains_key(&self, key: &str) -> bool {
731 self.0.contains_key(&PdfName(key.to_string()))
732 }
733
734 /// Get the dictionary type (value of /Type key).
735 ///
736 /// Many PDF dictionaries have a /Type entry that identifies their purpose.
737 ///
738 /// # Returns
739 ///
740 /// The type name if present, None otherwise.
741 ///
742 /// # Common Types
743 ///
744 /// - "Catalog" - Document catalog
745 /// - "Page" - Page object
746 /// - "Pages" - Page tree node
747 /// - "Font" - Font dictionary
748 /// - "XObject" - External object
749 ///
750 /// # Example
751 ///
752 /// ```rust
753 /// use oxidize_pdf::parser::objects::{PdfDictionary, PdfObject, PdfName};
754 ///
755 /// let mut dict = PdfDictionary::new();
756 /// dict.insert("Type".to_string(), PdfObject::Name(PdfName::new("Page".to_string())));
757 /// assert_eq!(dict.get_type(), Some("Page"));
758 /// ```
759 pub fn get_type(&self) -> Option<&str> {
760 self.get("Type")
761 .and_then(|obj| obj.as_name())
762 .map(|n| n.0.as_str())
763 }
764}
765
766impl Default for PdfArray {
767 fn default() -> Self {
768 Self::new()
769 }
770}
771
772impl PdfArray {
773 /// Create a new empty array
774 pub fn new() -> Self {
775 PdfArray(Vec::new())
776 }
777
778 /// Get array length
779 pub fn len(&self) -> usize {
780 self.0.len()
781 }
782
783 /// Check if array is empty
784 pub fn is_empty(&self) -> bool {
785 self.0.is_empty()
786 }
787
788 /// Get element at index.
789 ///
790 /// # Arguments
791 ///
792 /// * `index` - Zero-based index
793 ///
794 /// # Returns
795 ///
796 /// Reference to the element if index is valid, None otherwise.
797 ///
798 /// # Example
799 ///
800 /// ```rust
801 /// use oxidize_pdf::parser::objects::{PdfArray, PdfObject};
802 ///
803 /// let mut array = PdfArray::new();
804 /// array.push(PdfObject::Integer(10));
805 /// array.push(PdfObject::Integer(20));
806 ///
807 /// assert_eq!(array.get(0).and_then(|o| o.as_integer()), Some(10));
808 /// assert_eq!(array.get(1).and_then(|o| o.as_integer()), Some(20));
809 /// assert!(array.get(2).is_none());
810 /// ```
811 pub fn get(&self, index: usize) -> Option<&PdfObject> {
812 self.0.get(index)
813 }
814
815 /// Push an element
816 pub fn push(&mut self, obj: PdfObject) {
817 self.0.push(obj);
818 }
819}
820
821impl PdfString {
822 /// Create a new PDF string
823 pub fn new(data: Vec<u8>) -> Self {
824 PdfString(data)
825 }
826
827 /// Get as UTF-8 string if possible.
828 ///
829 /// Attempts to decode the string bytes as UTF-8.
830 /// Note that PDF strings may use other encodings.
831 ///
832 /// # Returns
833 ///
834 /// Ok(&str) if valid UTF-8, Err otherwise.
835 ///
836 /// # Example
837 ///
838 /// ```rust
839 /// use oxidize_pdf::parser::objects::PdfString;
840 ///
841 /// let string = PdfString::new(b"Hello".to_vec());
842 /// assert_eq!(string.as_str(), Ok("Hello"));
843 ///
844 /// let binary = PdfString::new(vec![0xFF, 0xFE]);
845 /// assert!(binary.as_str().is_err());
846 /// ```
847 pub fn as_str(&self) -> Result<&str, std::str::Utf8Error> {
848 std::str::from_utf8(&self.0)
849 }
850
851 /// Get as bytes
852 pub fn as_bytes(&self) -> &[u8] {
853 &self.0
854 }
855}
856
857impl PdfName {
858 /// Create a new PDF name
859 pub fn new(name: String) -> Self {
860 PdfName(name)
861 }
862
863 /// Get the name as a string
864 pub fn as_str(&self) -> &str {
865 &self.0
866 }
867}
868
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Whitespace-separated scalars are returned one object per `parse` call,
    /// in input order.
    #[test]
    fn test_parse_simple_objects() {
        let input = b"null true false 123 -456 3.14 /Name (Hello)";
        let mut lexer = Lexer::new(Cursor::new(input));

        let expected = vec![
            PdfObject::Null,
            PdfObject::Boolean(true),
            PdfObject::Boolean(false),
            PdfObject::Integer(123),
            PdfObject::Integer(-456),
            PdfObject::Real(3.14),
            PdfObject::Name(PdfName("Name".to_string())),
            PdfObject::String(PdfString(b"Hello".to_vec())),
        ];

        for want in expected {
            assert_eq!(PdfObject::parse(&mut lexer).unwrap(), want);
        }
    }

    /// Consecutive integers that are not followed by `R` stay plain integers.
    #[test]
    fn test_parse_array() {
        // Test simple array without potential references
        let input = b"[100 200 300 /Name (test)]";
        let mut lexer = Lexer::new(Cursor::new(input));

        let parsed = PdfObject::parse(&mut lexer).unwrap();
        let items = parsed.as_array().unwrap();

        assert_eq!(items.len(), 5);
        for (index, want) in [100, 200, 300].iter().enumerate() {
            assert_eq!(items.get(index).unwrap().as_integer(), Some(*want));
        }
        assert_eq!(items.get(3).unwrap().as_name().unwrap().as_str(), "Name");
        assert_eq!(
            items.get(4).unwrap().as_string().unwrap().as_bytes(),
            b"test"
        );
    }

    /// Each `num gen R` triple collapses into a single reference element.
    #[test]
    fn test_parse_array_with_references() {
        // Test array with references
        let input = b"[1 0 R 2 0 R]";
        let mut lexer = Lexer::new(Cursor::new(input));

        let parsed = PdfObject::parse(&mut lexer).unwrap();
        let items = parsed.as_array().unwrap();

        assert_eq!(items.len(), 2);
        for index in 0..2 {
            assert!(items.get(index).unwrap().as_reference().is_some());
        }
    }

    /// A dictionary mixing a name, a reference and an array value.
    #[test]
    fn test_parse_dictionary() {
        let input = b"<< /Type /Page /Parent 1 0 R /MediaBox [0 0 612 792] >>";
        let mut lexer = Lexer::new(Cursor::new(input));

        let parsed = PdfObject::parse(&mut lexer).unwrap();
        let page = parsed.as_dict().unwrap();

        assert_eq!(page.get_type(), Some("Page"));
        assert!(page.get("Parent").unwrap().as_reference().is_some());
        assert!(page.get("MediaBox").unwrap().as_array().is_some());
    }
}
953}