Skip to main content

oxidize_pdf/parser/
content.rs

1//! PDF Content Stream Parser - Complete support for PDF graphics operators
2//!
3//! This module implements comprehensive parsing of PDF content streams according to the PDF specification.
4//! Content streams contain the actual drawing instructions (operators) that render text, graphics, and images
5//! on PDF pages.
6//!
7//! # Overview
8//!
9//! Content streams are sequences of PDF operators that describe:
10//! - Text positioning and rendering
11//! - Path construction and painting
12//! - Color and graphics state management
13//! - Image and XObject placement
14//! - Coordinate transformations
15//!
16//! # Architecture
17//!
18//! The parser is divided into two main components:
19//! - `ContentTokenizer`: Low-level tokenization of content stream bytes
20//! - `ContentParser`: High-level parsing of tokens into structured operations
21//!
22//! # Example
23//!
24//! ```rust,no_run
25//! use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
26//!
27//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
28//! // Parse a content stream
29//! let content_stream = b"BT /F1 12 Tf 100 200 Td (Hello World) Tj ET";
30//! let operations = ContentParser::parse_content(content_stream)?;
31//!
32//! // Process operations
33//! for op in operations {
34//!     match op {
35//!         ContentOperation::BeginText => println!("Start text object"),
36//!         ContentOperation::SetFont(name, size) => println!("Font: {} at {}", name, size),
37//!         ContentOperation::ShowText(text) => println!("Text: {:?}", text),
38//!         _ => {}
39//!     }
40//! }
41//! # Ok(())
42//! # }
43//! ```
44//!
45//! # Supported Operators
46//!
47//! This parser supports all standard PDF operators including:
48//! - Text operators (BT, ET, Tj, TJ, Tf, Td, etc.)
49//! - Graphics state operators (q, Q, cm, w, J, etc.)
50//! - Path construction operators (m, l, c, re, h)
51//! - Path painting operators (S, f, B, n, etc.)
52//! - Color operators (g, rg, k, cs, scn, etc.)
53//! - XObject operators (Do)
54//! - Marked content operators (BMC, BDC, EMC, etc.)
55
56use super::{ParseError, ParseResult};
57use crate::objects::Object;
58use std::collections::HashMap;
59
60/// Represents a single operator in a PDF content stream.
61///
62/// Each variant corresponds to a specific PDF operator and carries the associated
63/// operands. These operations form a complete instruction set for rendering PDF content.
64///
65/// # Categories
66///
67/// Operations are grouped into several categories:
68/// - **Text Object**: BeginText, EndText
69/// - **Text State**: Font, spacing, scaling, rendering mode
70/// - **Text Positioning**: Matrix transforms, moves, line advances
71/// - **Text Showing**: Display text with various formatting
72/// - **Graphics State**: Save/restore, transforms, line properties
73/// - **Path Construction**: Move, line, curve, rectangle operations
74/// - **Path Painting**: Stroke, fill, clipping operations
75/// - **Color**: RGB, CMYK, grayscale, and color space operations
76/// - **XObject**: External graphics and form placement
77/// - **Marked Content**: Semantic tagging for accessibility
78///
79/// # Example
80///
81/// ```rust
82/// use oxidize_pdf::parser::content::{ContentOperation};
83///
84/// // Text operation
85/// let op1 = ContentOperation::ShowText(b"Hello".to_vec());
86///
87/// // Graphics operation
88/// let op2 = ContentOperation::SetLineWidth(2.0);
89///
90/// // Path operation
91/// let op3 = ContentOperation::Rectangle(10.0, 10.0, 100.0, 50.0);
92/// ```
93#[derive(Debug, Clone, PartialEq)]
94pub enum ContentOperation {
95    // Text object operators
96    /// Begin a text object (BT operator).
97    /// All text showing operations must occur within a text object.
98    BeginText,
99
100    /// End a text object (ET operator).
101    /// Closes the current text object started with BeginText.
102    EndText,
103
104    // Text state operators
105    /// Set character spacing (Tc operator).
106    /// Additional space between characters in unscaled text units.
107    SetCharSpacing(f32),
108
109    /// Set word spacing (Tw operator).
110    /// Additional space for ASCII space character (0x20) in unscaled text units.
111    SetWordSpacing(f32),
112
113    /// Set horizontal text scaling (Tz operator).
114    /// Percentage of normal width (100 = normal).
115    SetHorizontalScaling(f32),
116
117    /// Set text leading (TL operator).
118    /// Vertical distance between baselines for T* operator.
119    SetLeading(f32),
120
121    /// Set font and size (Tf operator).
122    /// Font name must match a key in the Resources/Font dictionary.
123    SetFont(String, f32),
124
125    /// Set text rendering mode (Tr operator).
126    /// 0=fill, 1=stroke, 2=fill+stroke, 3=invisible, 4=fill+clip, 5=stroke+clip, 6=fill+stroke+clip, 7=clip
127    SetTextRenderMode(i32),
128
129    /// Set text rise (Ts operator).
130    /// Vertical displacement for superscripts/subscripts in text units.
131    SetTextRise(f32),
132
133    // Text positioning operators
134    /// Move text position (Td operator).
135    /// Translates the text matrix by (tx, ty).
136    MoveText(f32, f32),
137
138    /// Move text position and set leading (TD operator).
139    /// Equivalent to: -ty TL tx ty Td
140    MoveTextSetLeading(f32, f32),
141
142    /// Set text matrix directly (Tm operator).
143    /// Parameters: [a, b, c, d, e, f] for transformation matrix.
144    SetTextMatrix(f32, f32, f32, f32, f32, f32),
145
146    /// Move to start of next line (T* operator).
147    /// Uses the current leading value set with TL.
148    NextLine,
149
150    // Text showing operators
151    /// Show text string (Tj operator).
152    /// The bytes are encoded according to the current font's encoding.
153    ShowText(Vec<u8>),
154
155    /// Show text with individual positioning (TJ operator).
156    /// Array elements can be strings or position adjustments.
157    ShowTextArray(Vec<TextElement>),
158
159    /// Move to next line and show text (' operator).
160    /// Equivalent to: T* string Tj
161    NextLineShowText(Vec<u8>),
162
163    /// Set spacing, move to next line, and show text (" operator).
164    /// Equivalent to: word_spacing Tw char_spacing Tc string '
165    SetSpacingNextLineShowText(f32, f32, Vec<u8>),
166
167    // Graphics state operators
168    /// Save current graphics state (q operator).
169    /// Pushes the entire graphics state onto a stack.
170    SaveGraphicsState,
171
172    /// Restore graphics state (Q operator).
173    /// Pops the graphics state from the stack.
174    RestoreGraphicsState,
175
176    /// Concatenate matrix to current transformation matrix (cm operator).
177    /// Modifies the CTM: CTM' = CTM × [a b c d e f]
178    SetTransformMatrix(f32, f32, f32, f32, f32, f32),
179
180    /// Set line width (w operator) in user space units.
181    SetLineWidth(f32),
182
183    /// Set line cap style (J operator).
184    /// 0=butt cap, 1=round cap, 2=projecting square cap
185    SetLineCap(i32),
186
187    /// Set line join style (j operator).
188    /// 0=miter join, 1=round join, 2=bevel join
189    SetLineJoin(i32),
190
191    /// Set miter limit (M operator).
192    /// Maximum ratio of miter length to line width.
193    SetMiterLimit(f32),
194
195    /// Set dash pattern (d operator).
196    /// Array of dash/gap lengths and starting phase.
197    SetDashPattern(Vec<f32>, f32),
198
199    /// Set rendering intent (ri operator).
200    /// Color rendering intent: /AbsoluteColorimetric, /RelativeColorimetric, /Saturation, /Perceptual
201    SetIntent(String),
202
203    /// Set flatness tolerance (i operator).
204    /// Maximum error when rendering curves as line segments.
205    SetFlatness(f32),
206
207    /// Set graphics state from parameter dictionary (gs operator).
208    /// References ExtGState resource dictionary.
209    SetGraphicsStateParams(String),
210
211    // Path construction operators
212    /// Begin new subpath at point (m operator).
213    MoveTo(f32, f32),
214
215    /// Append straight line segment (l operator).
216    LineTo(f32, f32),
217
218    /// Append cubic Bézier curve (c operator).
219    /// Control points: (x1,y1), (x2,y2), endpoint: (x3,y3)
220    CurveTo(f32, f32, f32, f32, f32, f32),
221
222    /// Append cubic Bézier curve with first control point = current point (v operator).
223    CurveToV(f32, f32, f32, f32),
224
225    /// Append cubic Bézier curve with second control point = endpoint (y operator).
226    CurveToY(f32, f32, f32, f32),
227
228    /// Close current subpath (h operator).
229    /// Appends straight line to starting point.
230    ClosePath,
231
232    /// Append rectangle as complete subpath (re operator).
233    /// Parameters: x, y, width, height
234    Rectangle(f32, f32, f32, f32),
235
236    // Path painting operators
237    /// Stroke the path (S operator).
238    Stroke,
239
240    /// Close and stroke the path (s operator).
241    /// Equivalent to: h S
242    CloseStroke,
243
244    /// Fill the path using nonzero winding rule (f or F operator).
245    Fill,
246
247    /// Fill the path using even-odd rule (f* operator).
248    FillEvenOdd,
249
250    /// Fill then stroke the path (B operator).
251    /// Uses nonzero winding rule.
252    FillStroke,
253
254    /// Fill then stroke using even-odd rule (B* operator).
255    FillStrokeEvenOdd,
256
257    /// Close, fill, and stroke the path (b operator).
258    /// Equivalent to: h B
259    CloseFillStroke,
260
261    /// Close, fill, and stroke using even-odd rule (b* operator).
262    CloseFillStrokeEvenOdd,
263
264    /// End path without filling or stroking (n operator).
265    /// Used primarily before clipping.
266    EndPath,
267
268    // Clipping path operators
269    Clip,        // W
270    ClipEvenOdd, // W*
271
272    // Color operators
273    /// Set stroking color space (CS operator).
274    /// References ColorSpace resource dictionary.
275    SetStrokingColorSpace(String),
276
277    /// Set non-stroking color space (cs operator).
278    /// References ColorSpace resource dictionary.
279    SetNonStrokingColorSpace(String),
280
281    /// Set stroking color (SC, SCN operators).
282    /// Number of components depends on current color space.
283    SetStrokingColor(Vec<f32>),
284
285    /// Set non-stroking color (sc, scn operators).
286    /// Number of components depends on current color space.
287    SetNonStrokingColor(Vec<f32>),
288
289    /// Set stroking color to DeviceGray (G operator).
290    /// 0.0 = black, 1.0 = white
291    SetStrokingGray(f32),
292
293    /// Set non-stroking color to DeviceGray (g operator).
294    SetNonStrokingGray(f32),
295
296    /// Set stroking color to DeviceRGB (RG operator).
297    /// Components range from 0.0 to 1.0.
298    SetStrokingRGB(f32, f32, f32),
299
300    /// Set non-stroking color to DeviceRGB (rg operator).
301    SetNonStrokingRGB(f32, f32, f32),
302
303    /// Set stroking color to DeviceCMYK (K operator).
304    SetStrokingCMYK(f32, f32, f32, f32),
305
306    /// Set non-stroking color to DeviceCMYK (k operator).
307    SetNonStrokingCMYK(f32, f32, f32, f32),
308
309    // Shading operators
310    ShadingFill(String), // sh
311
312    // Inline image operators
313    /// Begin inline image (BI operator)
314    BeginInlineImage,
315    /// Inline image with parsed dictionary and data
316    InlineImage {
317        /// Image parameters (width, height, colorspace, etc.)
318        params: HashMap<String, Object>,
319        /// Raw image data
320        data: Vec<u8>,
321    },
322
323    // XObject operators
324    /// Paint external object (Do operator).
325    /// References XObject resource dictionary (images, forms).
326    PaintXObject(String),
327
328    // Marked content operators
329    BeginMarkedContent(String),                                   // BMC
330    BeginMarkedContentWithProps(String, HashMap<String, String>), // BDC
331    EndMarkedContent,                                             // EMC
332    DefineMarkedContentPoint(String),                             // MP
333    DefineMarkedContentPointWithProps(String, HashMap<String, String>), // DP
334
335    // Compatibility operators
336    BeginCompatibility, // BX
337    EndCompatibility,   // EX
338}
339
340/// Represents a text element in a TJ array for ShowTextArray operations.
341///
342/// The TJ operator takes an array of strings and position adjustments,
343/// allowing fine control over character and word spacing.
344///
345/// # Example
346///
347/// ```rust
348/// use oxidize_pdf::parser::content::{TextElement, ContentOperation};
349///
350/// // TJ array: [(Hello) -50 (World)]
351/// let tj_array = vec![
352///     TextElement::Text(b"Hello".to_vec()),
353///     TextElement::Spacing(-50.0), // Move left 50 units
354///     TextElement::Text(b"World".to_vec()),
355/// ];
356/// let op = ContentOperation::ShowTextArray(tj_array);
357/// ```
358#[derive(Debug, Clone, PartialEq)]
359pub enum TextElement {
360    /// Text string to show
361    Text(Vec<u8>),
362    /// Position adjustment in thousandths of text space units
363    /// Negative values move to the right (decrease spacing)
364    Spacing(f32),
365}
366
367/// Token types in content streams
368#[derive(Debug, Clone, PartialEq)]
369pub(super) enum Token {
370    Number(f32),
371    Integer(i32),
372    String(Vec<u8>),
373    HexString(Vec<u8>),
374    Name(String),
375    Operator(String),
376    ArrayStart,
377    ArrayEnd,
378    DictStart,
379    DictEnd,
380    /// Raw binary data between ID and EI in an inline image.
381    /// The tokenizer captures this as opaque bytes to prevent
382    /// binary image data from being mis-parsed as operators.
383    InlineImageData(Vec<u8>),
384}
385
386/// Content stream tokenizer
387pub struct ContentTokenizer<'a> {
388    input: &'a [u8],
389    position: usize,
390    /// Set after returning an "ID" operator token.
391    /// The next call to next_token() will read raw inline image bytes.
392    in_inline_image: bool,
393}
394
395impl<'a> ContentTokenizer<'a> {
396    /// Create a new tokenizer for the given input
397    pub fn new(input: &'a [u8]) -> Self {
398        Self {
399            input,
400            position: 0,
401            in_inline_image: false,
402        }
403    }
404
405    /// Get the next token from the stream
406    pub(super) fn next_token(&mut self) -> ParseResult<Option<Token>> {
407        // If we just returned an "ID" token, read raw inline image binary data
408        if self.in_inline_image {
409            self.in_inline_image = false;
410            return self.read_inline_image_data();
411        }
412
413        self.skip_whitespace();
414
415        if self.position >= self.input.len() {
416            return Ok(None);
417        }
418
419        let ch = self.input[self.position];
420
421        match ch {
422            // Numbers
423            b'+' | b'-' | b'.' | b'0'..=b'9' => self.read_number(),
424
425            // Strings
426            b'(' => self.read_literal_string(),
427            b'<' => {
428                if self.peek_next() == Some(b'<') {
429                    self.position += 2;
430                    Ok(Some(Token::DictStart))
431                } else {
432                    self.read_hex_string()
433                }
434            }
435            b'>' => {
436                if self.peek_next() == Some(b'>') {
437                    self.position += 2;
438                    Ok(Some(Token::DictEnd))
439                } else {
440                    Err(ParseError::SyntaxError {
441                        position: self.position,
442                        message: "Unexpected '>'".to_string(),
443                    })
444                }
445            }
446
447            // Arrays
448            b'[' => {
449                self.position += 1;
450                Ok(Some(Token::ArrayStart))
451            }
452            b']' => {
453                self.position += 1;
454                Ok(Some(Token::ArrayEnd))
455            }
456
457            // Names
458            b'/' => self.read_name(),
459
460            // Skip unhandled delimiters (corrupted content / binary data recovery)
461            // These bytes are delimiters in read_operator() but have no valid meaning
462            // at the top level of a content stream. Skipping them prevents infinite loops
463            // where read_operator() would return an empty operator without advancing.
464            b';' | b')' | b'{' | b'}' => {
465                self.position += 1;
466                self.next_token() // Recursively get next valid token
467            }
468
469            // Operators or other tokens
470            _ => {
471                let token = self.read_operator()?;
472                // After "ID" operator, switch to raw binary mode for inline image data
473                if let Some(Token::Operator(ref op)) = token {
474                    if op == "ID" {
475                        self.in_inline_image = true;
476                    }
477                }
478                Ok(token)
479            }
480        }
481    }
482
483    fn skip_whitespace(&mut self) {
484        while self.position < self.input.len() {
485            match self.input[self.position] {
486                b' ' | b'\t' | b'\r' | b'\n' | b'\x0C' => self.position += 1,
487                b'%' => self.skip_comment(),
488                _ => break,
489            }
490        }
491    }
492
493    fn skip_comment(&mut self) {
494        while self.position < self.input.len() && self.input[self.position] != b'\n' {
495            self.position += 1;
496        }
497    }
498
499    fn peek_next(&self) -> Option<u8> {
500        if self.position + 1 < self.input.len() {
501            Some(self.input[self.position + 1])
502        } else {
503            None
504        }
505    }
506
507    fn read_number(&mut self) -> ParseResult<Option<Token>> {
508        let start = self.position;
509        let mut has_dot = false;
510
511        // Handle optional sign
512        if self.position < self.input.len()
513            && (self.input[self.position] == b'+' || self.input[self.position] == b'-')
514        {
515            self.position += 1;
516        }
517
518        // Read digits and optional decimal point
519        while self.position < self.input.len() {
520            match self.input[self.position] {
521                b'0'..=b'9' => self.position += 1,
522                b'.' if !has_dot => {
523                    has_dot = true;
524                    self.position += 1;
525                }
526                _ => break,
527            }
528        }
529
530        let num_str = std::str::from_utf8(&self.input[start..self.position]).map_err(|_| {
531            ParseError::SyntaxError {
532                position: start,
533                message: "Invalid number format".to_string(),
534            }
535        })?;
536
537        if has_dot {
538            let value = num_str
539                .parse::<f32>()
540                .map_err(|_| ParseError::SyntaxError {
541                    position: start,
542                    message: "Invalid float number".to_string(),
543                })?;
544            Ok(Some(Token::Number(value)))
545        } else {
546            let value = num_str
547                .parse::<i32>()
548                .map_err(|_| ParseError::SyntaxError {
549                    position: start,
550                    message: "Invalid integer number".to_string(),
551                })?;
552            Ok(Some(Token::Integer(value)))
553        }
554    }
555
556    fn read_literal_string(&mut self) -> ParseResult<Option<Token>> {
557        self.position += 1; // Skip opening '('
558        let mut result = Vec::new();
559        let mut paren_depth = 1;
560        let mut escape = false;
561
562        while self.position < self.input.len() && paren_depth > 0 {
563            let ch = self.input[self.position];
564            self.position += 1;
565
566            if escape {
567                match ch {
568                    b'n' => result.push(b'\n'),
569                    b'r' => result.push(b'\r'),
570                    b't' => result.push(b'\t'),
571                    b'b' => result.push(b'\x08'),
572                    b'f' => result.push(b'\x0C'),
573                    b'(' => result.push(b'('),
574                    b')' => result.push(b')'),
575                    b'\\' => result.push(b'\\'),
576                    b'0'..=b'7' => {
577                        // Octal escape sequence
578                        self.position -= 1;
579                        let octal_value = self.read_octal_escape()?;
580                        result.push(octal_value);
581                    }
582                    _ => result.push(ch), // Unknown escape, treat as literal
583                }
584                escape = false;
585            } else {
586                match ch {
587                    b'\\' => escape = true,
588                    b'(' => {
589                        paren_depth += 1;
590                        result.push(ch);
591                    }
592                    b')' => {
593                        paren_depth -= 1;
594                        if paren_depth > 0 {
595                            result.push(ch);
596                        }
597                    }
598                    _ => result.push(ch),
599                }
600            }
601        }
602
603        Ok(Some(Token::String(result)))
604    }
605
606    fn read_octal_escape(&mut self) -> ParseResult<u8> {
607        // Use u16 to avoid overflow panic on malformed octal sequences (e.g. \777).
608        // Per ISO 32000-1:2008 §7.3.4.2: "high-order overflow shall be ignored".
609        let mut value = 0u16;
610        let mut count = 0;
611
612        while count < 3 && self.position < self.input.len() {
613            match self.input[self.position] {
614                b'0'..=b'7' => {
615                    value = value * 8 + u16::from(self.input[self.position] - b'0');
616                    self.position += 1;
617                    count += 1;
618                }
619                _ => break,
620            }
621        }
622
623        Ok(value as u8)
624    }
625
626    fn read_hex_string(&mut self) -> ParseResult<Option<Token>> {
627        self.position += 1; // Skip opening '<'
628        let mut result = Vec::new();
629        let mut nibble = None;
630
631        while self.position < self.input.len() {
632            let ch = self.input[self.position];
633
634            match ch {
635                b'>' => {
636                    self.position += 1;
637                    // Handle odd number of hex digits
638                    if let Some(n) = nibble {
639                        result.push(n << 4);
640                    }
641                    return Ok(Some(Token::HexString(result)));
642                }
643                b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
644                    let digit = if ch <= b'9' {
645                        ch - b'0'
646                    } else if ch <= b'F' {
647                        ch - b'A' + 10
648                    } else {
649                        ch - b'a' + 10
650                    };
651
652                    if let Some(n) = nibble {
653                        result.push((n << 4) | digit);
654                        nibble = None;
655                    } else {
656                        nibble = Some(digit);
657                    }
658                    self.position += 1;
659                }
660                b' ' | b'\t' | b'\r' | b'\n' | b'\x0C' => {
661                    // Skip whitespace in hex strings
662                    self.position += 1;
663                }
664                _ => {
665                    return Err(ParseError::SyntaxError {
666                        position: self.position,
667                        message: format!("Invalid character in hex string: {:?}", ch as char),
668                    });
669                }
670            }
671        }
672
673        Err(ParseError::SyntaxError {
674            position: self.position,
675            message: "Unterminated hex string".to_string(),
676        })
677    }
678
679    fn read_name(&mut self) -> ParseResult<Option<Token>> {
680        self.position += 1; // Skip '/'
681        let start = self.position;
682
683        while self.position < self.input.len() {
684            let ch = self.input[self.position];
685            match ch {
686                b' ' | b'\t' | b'\r' | b'\n' | b'\x0C' | b'(' | b')' | b'<' | b'>' | b'['
687                | b']' | b'{' | b'}' | b'/' | b'%' => break,
688                b'#' => {
689                    // Handle hex escape in name
690                    self.position += 1;
691                    if self.position + 1 < self.input.len() {
692                        self.position += 2;
693                    }
694                }
695                _ => self.position += 1,
696            }
697        }
698
699        let name_bytes = &self.input[start..self.position];
700        let name = self.decode_name(name_bytes)?;
701        Ok(Some(Token::Name(name)))
702    }
703
704    fn decode_name(&self, bytes: &[u8]) -> ParseResult<String> {
705        let mut result = Vec::new();
706        let mut i = 0;
707
708        while i < bytes.len() {
709            if bytes[i] == b'#' && i + 2 < bytes.len() {
710                // Hex escape
711                let hex_str = std::str::from_utf8(&bytes[i + 1..i + 3]).map_err(|_| {
712                    ParseError::SyntaxError {
713                        position: self.position,
714                        message: "Invalid hex escape in name".to_string(),
715                    }
716                })?;
717                let value =
718                    u8::from_str_radix(hex_str, 16).map_err(|_| ParseError::SyntaxError {
719                        position: self.position,
720                        message: "Invalid hex escape in name".to_string(),
721                    })?;
722                result.push(value);
723                i += 3;
724            } else {
725                result.push(bytes[i]);
726                i += 1;
727            }
728        }
729
730        String::from_utf8(result).map_err(|_| ParseError::SyntaxError {
731            position: self.position,
732            message: "Invalid UTF-8 in name".to_string(),
733        })
734    }
735
736    fn read_operator(&mut self) -> ParseResult<Option<Token>> {
737        let start = self.position;
738
739        while self.position < self.input.len() {
740            let ch = self.input[self.position];
741            match ch {
742                b' ' | b'\t' | b'\r' | b'\n' | b'\x0C' | b'(' | b')' | b'<' | b'>' | b'['
743                | b']' | b'{' | b'}' | b'/' | b'%' | b';' => break,
744                _ => self.position += 1,
745            }
746        }
747
748        let op_bytes = &self.input[start..self.position];
749        let op = std::str::from_utf8(op_bytes).map_err(|_| ParseError::SyntaxError {
750            position: start,
751            message: "Invalid operator".to_string(),
752        })?;
753
754        Ok(Some(Token::Operator(op.to_string())))
755    }
756
757    /// Read raw binary data for an inline image (between ID and EI).
758    ///
759    /// Per PDF spec §4.8.6, after the ID operator and a single whitespace byte,
760    /// all subsequent bytes are raw image data until the EI marker is found.
761    /// The EI marker is: whitespace + 'E' + 'I' + (whitespace, delimiter, or EOF).
762    fn read_inline_image_data(&mut self) -> ParseResult<Option<Token>> {
763        // Skip single whitespace byte after ID (per PDF spec §4.8.6)
764        if self.position < self.input.len() {
765            let ch = self.input[self.position];
766            if ch == b' ' || ch == b'\n' || ch == b'\r' || ch == b'\t' {
767                self.position += 1;
768                // Handle \r\n as single whitespace
769                if ch == b'\r'
770                    && self.position < self.input.len()
771                    && self.input[self.position] == b'\n'
772                {
773                    self.position += 1;
774                }
775            }
776        }
777
778        let start = self.position;
779
780        // Scan for EI marker: preceded by whitespace + 'E' + 'I' + (whitespace/delimiter/EOF)
781        while self.position + 1 < self.input.len() {
782            let preceded_by_whitespace = self.position == start
783                || matches!(
784                    self.input[self.position - 1],
785                    b' ' | b'\t' | b'\r' | b'\n' | b'\x0C'
786                );
787
788            if preceded_by_whitespace
789                && self.input[self.position] == b'E'
790                && self.input[self.position + 1] == b'I'
791            {
792                let after_ei = self.position + 2;
793                let followed_by_boundary = after_ei >= self.input.len()
794                    || matches!(
795                        self.input[after_ei],
796                        b' ' | b'\t' | b'\r' | b'\n' | b'\x0C' | b'/' | b'<' | b'(' | b'[' | b'%'
797                    );
798
799                if followed_by_boundary {
800                    // Trim trailing whitespace that preceded EI from the data
801                    let mut end = self.position;
802                    if end > start
803                        && matches!(self.input[end - 1], b' ' | b'\t' | b'\r' | b'\n' | b'\x0C')
804                    {
805                        end -= 1;
806                    }
807                    let data = self.input[start..end].to_vec();
808                    self.position = after_ei; // Skip past "EI"
809                    return Ok(Some(Token::InlineImageData(data)));
810                }
811            }
812            self.position += 1;
813        }
814
815        // No EI found — return remaining bytes as best-effort recovery
816        let data = self.input[start..].to_vec();
817        self.position = self.input.len();
818        Ok(Some(Token::InlineImageData(data)))
819    }
820}
821
822/// High-level content stream parser.
823///
824/// Converts tokenized content streams into structured `ContentOperation` values.
825/// This parser handles the operand stack and operator parsing according to PDF specifications.
826///
827/// # Usage
828///
829/// The parser is typically used through its static methods:
830///
831/// ```rust
832/// use oxidize_pdf::parser::content::ContentParser;
833///
834/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
835/// let content = b"q 1 0 0 1 50 50 cm 100 100 200 150 re S Q";
836/// let operations = ContentParser::parse(content)?;
837/// # Ok(())
838/// # }
839/// ```
840pub struct ContentParser {
841    tokens: Vec<Token>,
842    position: usize,
843}
844
845impl ContentParser {
846    /// Create a new content parser
847    pub fn new(_content: &[u8]) -> Self {
848        Self {
849            tokens: Vec::new(),
850            position: 0,
851        }
852    }
853
854    /// Parse a content stream into a vector of operators.
855    ///
856    /// This is a convenience method that creates a parser and processes the entire stream.
857    ///
858    /// # Arguments
859    ///
860    /// * `content` - Raw content stream bytes (may be compressed)
861    ///
862    /// # Returns
863    ///
864    /// A vector of parsed `ContentOperation` values in the order they appear.
865    ///
866    /// # Errors
867    ///
868    /// Returns an error if:
869    /// - Invalid operator syntax is encountered
870    /// - Operators have incorrect number/type of operands
871    /// - Unknown operators are found
872    ///
873    /// # Example
874    ///
875    /// ```rust
876    /// use oxidize_pdf::parser::content::{ContentParser, ContentOperation};
877    ///
878    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
879    /// let content = b"BT /F1 12 Tf 100 200 Td (Hello) Tj ET";
880    /// let operations = ContentParser::parse(content)?;
881    ///
882    /// assert_eq!(operations.len(), 5);
883    /// assert!(matches!(operations[0], ContentOperation::BeginText));
884    /// # Ok(())
885    /// # }
886    /// ```
887    pub fn parse(content: &[u8]) -> ParseResult<Vec<ContentOperation>> {
888        Self::parse_content(content)
889    }
890
891    /// Parse a content stream into a vector of operators.
892    ///
893    /// This method tokenizes the input and converts it to operations.
894    /// It handles the PDF postfix notation where operands precede operators.
895    pub fn parse_content(content: &[u8]) -> ParseResult<Vec<ContentOperation>> {
896        let mut tokenizer = ContentTokenizer::new(content);
897        let mut tokens = Vec::new();
898
899        // Tokenize the entire stream
900        while let Some(token) = tokenizer.next_token()? {
901            tokens.push(token);
902        }
903
904        let mut parser = Self {
905            tokens,
906            position: 0,
907        };
908
909        parser.parse_operators()
910    }
911
912    fn parse_operators(&mut self) -> ParseResult<Vec<ContentOperation>> {
913        let mut operators = Vec::new();
914        let mut operand_stack: Vec<Token> = Vec::new();
915
916        while self.position < self.tokens.len() {
917            let token = self.tokens[self.position].clone();
918            self.position += 1;
919
920            match &token {
921                Token::Operator(op) => {
922                    let operator = self.parse_operator(op, &mut operand_stack)?;
923                    operators.push(operator);
924                }
925                _ => {
926                    // Not an operator, push to operand stack
927                    operand_stack.push(token);
928                }
929            }
930        }
931
932        Ok(operators)
933    }
934
935    fn parse_operator(
936        &mut self,
937        op: &str,
938        operands: &mut Vec<Token>,
939    ) -> ParseResult<ContentOperation> {
940        let operator = match op {
941            // Text object operators
942            "BT" => ContentOperation::BeginText,
943            "ET" => ContentOperation::EndText,
944
945            // Text state operators
946            "Tc" => {
947                let spacing = self.pop_number(operands)?;
948                ContentOperation::SetCharSpacing(spacing)
949            }
950            "Tw" => {
951                let spacing = self.pop_number(operands)?;
952                ContentOperation::SetWordSpacing(spacing)
953            }
954            "Tz" => {
955                let scale = self.pop_number(operands)?;
956                ContentOperation::SetHorizontalScaling(scale)
957            }
958            "TL" => {
959                let leading = self.pop_number(operands)?;
960                ContentOperation::SetLeading(leading)
961            }
962            "Tf" => {
963                let size = self.pop_number(operands)?;
964                let font = self.pop_name(operands)?;
965                ContentOperation::SetFont(font, size)
966            }
967            "Tr" => {
968                let mode = self.pop_integer(operands)?;
969                ContentOperation::SetTextRenderMode(mode)
970            }
971            "Ts" => {
972                let rise = self.pop_number(operands)?;
973                ContentOperation::SetTextRise(rise)
974            }
975
976            // Text positioning operators
977            "Td" => {
978                let ty = self.pop_number(operands)?;
979                let tx = self.pop_number(operands)?;
980                ContentOperation::MoveText(tx, ty)
981            }
982            "TD" => {
983                let ty = self.pop_number(operands)?;
984                let tx = self.pop_number(operands)?;
985                ContentOperation::MoveTextSetLeading(tx, ty)
986            }
987            "Tm" => {
988                let f = self.pop_number(operands)?;
989                let e = self.pop_number(operands)?;
990                let d = self.pop_number(operands)?;
991                let c = self.pop_number(operands)?;
992                let b = self.pop_number(operands)?;
993                let a = self.pop_number(operands)?;
994                ContentOperation::SetTextMatrix(a, b, c, d, e, f)
995            }
996            "T*" => ContentOperation::NextLine,
997
998            // Text showing operators
999            "Tj" => {
1000                let text = self.pop_string(operands)?;
1001                ContentOperation::ShowText(text)
1002            }
1003            "TJ" => {
1004                let array = self.pop_array(operands)?;
1005                let elements = self.parse_text_array(array)?;
1006                ContentOperation::ShowTextArray(elements)
1007            }
1008            "'" => {
1009                let text = self.pop_string(operands)?;
1010                ContentOperation::NextLineShowText(text)
1011            }
1012            "\"" => {
1013                let text = self.pop_string(operands)?;
1014                let aw = self.pop_number(operands)?;
1015                let ac = self.pop_number(operands)?;
1016                ContentOperation::SetSpacingNextLineShowText(ac, aw, text)
1017            }
1018
1019            // Graphics state operators
1020            "q" => ContentOperation::SaveGraphicsState,
1021            "Q" => ContentOperation::RestoreGraphicsState,
1022            "cm" => {
1023                let f = self.pop_number(operands)?;
1024                let e = self.pop_number(operands)?;
1025                let d = self.pop_number(operands)?;
1026                let c = self.pop_number(operands)?;
1027                let b = self.pop_number(operands)?;
1028                let a = self.pop_number(operands)?;
1029                ContentOperation::SetTransformMatrix(a, b, c, d, e, f)
1030            }
1031            "w" => {
1032                let width = self.pop_number(operands)?;
1033                ContentOperation::SetLineWidth(width)
1034            }
1035            "J" => {
1036                let cap = self.pop_integer(operands)?;
1037                ContentOperation::SetLineCap(cap)
1038            }
1039            "j" => {
1040                let join = self.pop_integer(operands)?;
1041                ContentOperation::SetLineJoin(join)
1042            }
1043            "M" => {
1044                let limit = self.pop_number(operands)?;
1045                ContentOperation::SetMiterLimit(limit)
1046            }
1047            "d" => {
1048                let phase = self.pop_number(operands)?;
1049                let array = self.pop_array(operands)?;
1050                let pattern = self.parse_dash_array(array)?;
1051                ContentOperation::SetDashPattern(pattern, phase)
1052            }
1053            "ri" => {
1054                let intent = self.pop_name(operands)?;
1055                ContentOperation::SetIntent(intent)
1056            }
1057            "i" => {
1058                let flatness = self.pop_number(operands)?;
1059                ContentOperation::SetFlatness(flatness)
1060            }
1061            "gs" => {
1062                let name = self.pop_name(operands)?;
1063                ContentOperation::SetGraphicsStateParams(name)
1064            }
1065
1066            // Path construction operators
1067            "m" => {
1068                let y = self.pop_number(operands)?;
1069                let x = self.pop_number(operands)?;
1070                ContentOperation::MoveTo(x, y)
1071            }
1072            "l" => {
1073                let y = self.pop_number(operands)?;
1074                let x = self.pop_number(operands)?;
1075                ContentOperation::LineTo(x, y)
1076            }
1077            "c" => {
1078                let y3 = self.pop_number(operands)?;
1079                let x3 = self.pop_number(operands)?;
1080                let y2 = self.pop_number(operands)?;
1081                let x2 = self.pop_number(operands)?;
1082                let y1 = self.pop_number(operands)?;
1083                let x1 = self.pop_number(operands)?;
1084                ContentOperation::CurveTo(x1, y1, x2, y2, x3, y3)
1085            }
1086            "v" => {
1087                let y3 = self.pop_number(operands)?;
1088                let x3 = self.pop_number(operands)?;
1089                let y2 = self.pop_number(operands)?;
1090                let x2 = self.pop_number(operands)?;
1091                ContentOperation::CurveToV(x2, y2, x3, y3)
1092            }
1093            "y" => {
1094                let y3 = self.pop_number(operands)?;
1095                let x3 = self.pop_number(operands)?;
1096                let y1 = self.pop_number(operands)?;
1097                let x1 = self.pop_number(operands)?;
1098                ContentOperation::CurveToY(x1, y1, x3, y3)
1099            }
1100            "h" => ContentOperation::ClosePath,
1101            "re" => {
1102                let height = self.pop_number(operands)?;
1103                let width = self.pop_number(operands)?;
1104                let y = self.pop_number(operands)?;
1105                let x = self.pop_number(operands)?;
1106                ContentOperation::Rectangle(x, y, width, height)
1107            }
1108
1109            // Path painting operators
1110            "S" => ContentOperation::Stroke,
1111            "s" => ContentOperation::CloseStroke,
1112            "f" | "F" => ContentOperation::Fill,
1113            "f*" => ContentOperation::FillEvenOdd,
1114            "B" => ContentOperation::FillStroke,
1115            "B*" => ContentOperation::FillStrokeEvenOdd,
1116            "b" => ContentOperation::CloseFillStroke,
1117            "b*" => ContentOperation::CloseFillStrokeEvenOdd,
1118            "n" => ContentOperation::EndPath,
1119
1120            // Clipping path operators
1121            "W" => ContentOperation::Clip,
1122            "W*" => ContentOperation::ClipEvenOdd,
1123
1124            // Color operators
1125            "CS" => {
1126                let name = self.pop_name(operands)?;
1127                ContentOperation::SetStrokingColorSpace(name)
1128            }
1129            "cs" => {
1130                let name = self.pop_name(operands)?;
1131                ContentOperation::SetNonStrokingColorSpace(name)
1132            }
1133            "SC" | "SCN" => {
1134                let components = self.pop_color_components(operands)?;
1135                ContentOperation::SetStrokingColor(components)
1136            }
1137            "sc" | "scn" => {
1138                let components = self.pop_color_components(operands)?;
1139                ContentOperation::SetNonStrokingColor(components)
1140            }
1141            "G" => {
1142                let gray = self.pop_number(operands)?;
1143                ContentOperation::SetStrokingGray(gray)
1144            }
1145            "g" => {
1146                let gray = self.pop_number(operands)?;
1147                ContentOperation::SetNonStrokingGray(gray)
1148            }
1149            "RG" => {
1150                let b = self.pop_number(operands)?;
1151                let g = self.pop_number(operands)?;
1152                let r = self.pop_number(operands)?;
1153                ContentOperation::SetStrokingRGB(r, g, b)
1154            }
1155            "rg" => {
1156                let b = self.pop_number(operands)?;
1157                let g = self.pop_number(operands)?;
1158                let r = self.pop_number(operands)?;
1159                ContentOperation::SetNonStrokingRGB(r, g, b)
1160            }
1161            "K" => {
1162                let k = self.pop_number(operands)?;
1163                let y = self.pop_number(operands)?;
1164                let m = self.pop_number(operands)?;
1165                let c = self.pop_number(operands)?;
1166                ContentOperation::SetStrokingCMYK(c, m, y, k)
1167            }
1168            "k" => {
1169                let k = self.pop_number(operands)?;
1170                let y = self.pop_number(operands)?;
1171                let m = self.pop_number(operands)?;
1172                let c = self.pop_number(operands)?;
1173                ContentOperation::SetNonStrokingCMYK(c, m, y, k)
1174            }
1175
1176            // Shading operators
1177            "sh" => {
1178                let name = self.pop_name(operands)?;
1179                ContentOperation::ShadingFill(name)
1180            }
1181
1182            // XObject operators
1183            "Do" => {
1184                let name = self.pop_name(operands)?;
1185                ContentOperation::PaintXObject(name)
1186            }
1187
1188            // Marked content operators
1189            "BMC" => {
1190                let tag = self.pop_name(operands)?;
1191                ContentOperation::BeginMarkedContent(tag)
1192            }
1193            "BDC" => {
1194                let props = self.pop_dict_or_name(operands)?;
1195                let tag = self.pop_name(operands)?;
1196                ContentOperation::BeginMarkedContentWithProps(tag, props)
1197            }
1198            "EMC" => ContentOperation::EndMarkedContent,
1199            "MP" => {
1200                let tag = self.pop_name(operands)?;
1201                ContentOperation::DefineMarkedContentPoint(tag)
1202            }
1203            "DP" => {
1204                let props = self.pop_dict_or_name(operands)?;
1205                let tag = self.pop_name(operands)?;
1206                ContentOperation::DefineMarkedContentPointWithProps(tag, props)
1207            }
1208
1209            // Compatibility operators
1210            "BX" => ContentOperation::BeginCompatibility,
1211            "EX" => ContentOperation::EndCompatibility,
1212
1213            // Inline images are handled specially
1214            "BI" => {
1215                operands.clear(); // Clear any remaining operands
1216                self.parse_inline_image()?
1217            }
1218
1219            _ => {
1220                return Err(ParseError::SyntaxError {
1221                    position: self.position,
1222                    message: format!("Unknown operator: {op}"),
1223                });
1224            }
1225        };
1226
1227        operands.clear(); // Clear operands after processing
1228        Ok(operator)
1229    }
1230
1231    // Helper methods for popping operands
1232    fn pop_number(&self, operands: &mut Vec<Token>) -> ParseResult<f32> {
1233        match operands.pop() {
1234            Some(Token::Number(n)) => Ok(n),
1235            Some(Token::Integer(i)) => Ok(i as f32),
1236            _ => Err(ParseError::SyntaxError {
1237                position: self.position,
1238                message: "Expected number operand".to_string(),
1239            }),
1240        }
1241    }
1242
1243    fn pop_integer(&self, operands: &mut Vec<Token>) -> ParseResult<i32> {
1244        match operands.pop() {
1245            Some(Token::Integer(i)) => Ok(i),
1246            _ => Err(ParseError::SyntaxError {
1247                position: self.position,
1248                message: "Expected integer operand".to_string(),
1249            }),
1250        }
1251    }
1252
1253    fn pop_name(&self, operands: &mut Vec<Token>) -> ParseResult<String> {
1254        match operands.pop() {
1255            Some(Token::Name(n)) => Ok(n),
1256            _ => Err(ParseError::SyntaxError {
1257                position: self.position,
1258                message: "Expected name operand".to_string(),
1259            }),
1260        }
1261    }
1262
1263    fn pop_string(&self, operands: &mut Vec<Token>) -> ParseResult<Vec<u8>> {
1264        match operands.pop() {
1265            Some(Token::String(s)) => Ok(s),
1266            Some(Token::HexString(s)) => Ok(s),
1267            _ => Err(ParseError::SyntaxError {
1268                position: self.position,
1269                message: "Expected string operand".to_string(),
1270            }),
1271        }
1272    }
1273
1274    fn pop_array(&self, operands: &mut Vec<Token>) -> ParseResult<Vec<Token>> {
1275        // First check if we have an ArrayEnd at the top (which we should for a complete array)
1276        let has_array_end = matches!(operands.last(), Some(Token::ArrayEnd));
1277        if has_array_end {
1278            operands.pop(); // Remove the ArrayEnd
1279        }
1280
1281        let mut array = Vec::new();
1282        let mut found_start = false;
1283
1284        // Pop tokens until we find ArrayStart
1285        while let Some(token) = operands.pop() {
1286            match token {
1287                Token::ArrayStart => {
1288                    found_start = true;
1289                    break;
1290                }
1291                Token::ArrayEnd => {
1292                    // Skip any additional ArrayEnd tokens (shouldn't happen in well-formed PDFs)
1293                    continue;
1294                }
1295                _ => array.push(token),
1296            }
1297        }
1298
1299        if !found_start {
1300            return Err(ParseError::SyntaxError {
1301                position: self.position,
1302                message: "Expected array".to_string(),
1303            });
1304        }
1305
1306        array.reverse(); // We collected in reverse order
1307        Ok(array)
1308    }
1309
1310    fn pop_dict_or_name(&self, operands: &mut Vec<Token>) -> ParseResult<HashMap<String, String>> {
1311        if let Some(token) = operands.pop() {
1312            match token {
1313                Token::Name(name) => {
1314                    // Name token - this is a reference to properties in the resource dictionary
1315                    // For now, we'll store it as a special entry to indicate it's a resource reference
1316                    let mut props = HashMap::new();
1317                    props.insert("__resource_ref".to_string(), name);
1318                    Ok(props)
1319                }
1320                Token::DictEnd => {
1321                    // Inline dictionary - tokens are on stack in reverse order:
1322                    // Stack: [..., DictStart, Name("key"), Value, DictEnd] <- top
1323                    // After popping DictEnd, we need to pop value-key pairs until DictStart
1324                    let mut props = HashMap::new();
1325
1326                    // Collect key-value pairs (values come before keys on stack)
1327                    while let Some(value_token) = operands.pop() {
1328                        if matches!(value_token, Token::DictStart) {
1329                            break;
1330                        }
1331
1332                        // In PDF dict syntax: /Key Value
1333                        // On stack after tokenization: [DictStart, Name(Key), Value, ...]
1334                        // Popping gives us: Value first, then Key
1335                        let value = match &value_token {
1336                            Token::Name(name) => name.clone(),
1337                            Token::String(s) => String::from_utf8_lossy(s).to_string(),
1338                            Token::Integer(i) => i.to_string(),
1339                            Token::Number(f) => f.to_string(),
1340                            Token::ArrayEnd => {
1341                                // Array value - collect elements until ArrayStart
1342                                let mut array_elements = Vec::new();
1343                                while let Some(arr_token) = operands.pop() {
1344                                    match arr_token {
1345                                        Token::ArrayStart => break,
1346                                        Token::Name(n) => array_elements.push(n),
1347                                        Token::String(s) => array_elements
1348                                            .push(String::from_utf8_lossy(&s).to_string()),
1349                                        Token::Integer(i) => array_elements.push(i.to_string()),
1350                                        Token::Number(f) => array_elements.push(f.to_string()),
1351                                        _ => {} // Skip other token types in array
1352                                    }
1353                                }
1354                                array_elements.reverse();
1355                                format!("[{}]", array_elements.join(", "))
1356                            }
1357                            _ => continue, // Skip unsupported value types
1358                        };
1359
1360                        // Now pop the key (should be a Name)
1361                        if let Some(Token::Name(key)) = operands.pop() {
1362                            props.insert(key, value);
1363                        }
1364                    }
1365
1366                    Ok(props)
1367                }
1368                _ => {
1369                    // Unexpected token type, treat as empty properties
1370                    Ok(HashMap::new())
1371                }
1372            }
1373        } else {
1374            // No operand available
1375            Err(ParseError::SyntaxError {
1376                position: 0,
1377                message: "Expected dictionary or name for marked content properties".to_string(),
1378            })
1379        }
1380    }
1381
1382    fn pop_color_components(&self, operands: &mut Vec<Token>) -> ParseResult<Vec<f32>> {
1383        let mut components = Vec::new();
1384
1385        // Pop all numeric values from the stack
1386        while let Some(token) = operands.last() {
1387            match token {
1388                Token::Number(n) => {
1389                    components.push(*n);
1390                    operands.pop();
1391                }
1392                Token::Integer(i) => {
1393                    components.push(*i as f32);
1394                    operands.pop();
1395                }
1396                _ => break,
1397            }
1398        }
1399
1400        components.reverse();
1401        Ok(components)
1402    }
1403
1404    fn parse_text_array(&self, tokens: Vec<Token>) -> ParseResult<Vec<TextElement>> {
1405        let mut elements = Vec::new();
1406
1407        for token in tokens {
1408            match token {
1409                Token::String(s) | Token::HexString(s) => {
1410                    elements.push(TextElement::Text(s));
1411                }
1412                Token::Number(n) => {
1413                    elements.push(TextElement::Spacing(n));
1414                }
1415                Token::Integer(i) => {
1416                    elements.push(TextElement::Spacing(i as f32));
1417                }
1418                _ => {
1419                    return Err(ParseError::SyntaxError {
1420                        position: self.position,
1421                        message: "Invalid element in text array".to_string(),
1422                    });
1423                }
1424            }
1425        }
1426
1427        Ok(elements)
1428    }
1429
1430    fn parse_dash_array(&self, tokens: Vec<Token>) -> ParseResult<Vec<f32>> {
1431        let mut pattern = Vec::new();
1432
1433        for token in tokens {
1434            match token {
1435                Token::Number(n) => pattern.push(n),
1436                Token::Integer(i) => pattern.push(i as f32),
1437                _ => {
1438                    return Err(ParseError::SyntaxError {
1439                        position: self.position,
1440                        message: "Invalid element in dash array".to_string(),
1441                    });
1442                }
1443            }
1444        }
1445
1446        Ok(pattern)
1447    }
1448
1449    fn parse_inline_image(&mut self) -> ParseResult<ContentOperation> {
1450        // Parse inline image dictionary until we find ID
1451        let mut params = HashMap::new();
1452
1453        while self.position < self.tokens.len() {
1454            // Check if we've reached the ID operator
1455            if let Token::Operator(op) = &self.tokens[self.position] {
1456                if op == "ID" {
1457                    self.position += 1;
1458                    break;
1459                }
1460            }
1461
1462            // Parse key-value pairs for image parameters
1463            // Keys are abbreviated in inline images:
1464            // /W -> Width, /H -> Height, /CS -> ColorSpace, /BPC -> BitsPerComponent
1465            // /F -> Filter, /DP -> DecodeParms, /IM -> ImageMask, /I -> Interpolate
1466            if let Token::Name(key) = &self.tokens[self.position] {
1467                self.position += 1;
1468                if self.position >= self.tokens.len() {
1469                    break;
1470                }
1471
1472                // Parse the value
1473                let value = match &self.tokens[self.position] {
1474                    Token::Integer(n) => Object::Integer(*n as i64),
1475                    Token::Number(n) => Object::Real(*n as f64),
1476                    Token::Name(s) => Object::Name(expand_inline_name(s)),
1477                    Token::String(s) => Object::String(String::from_utf8_lossy(s).to_string()),
1478                    Token::HexString(s) => Object::String(String::from_utf8_lossy(s).to_string()),
1479                    _ => Object::Null,
1480                };
1481
1482                // Expand abbreviated keys to full names
1483                let full_key = expand_inline_key(key);
1484                params.insert(full_key, value);
1485                self.position += 1;
1486            } else {
1487                self.position += 1;
1488            }
1489        }
1490
1491        // Get inline image data from dedicated InlineImageData token
1492        // (the tokenizer reads raw bytes between ID whitespace and EI)
1493        let data = if self.position < self.tokens.len() {
1494            if let Token::InlineImageData(bytes) = &self.tokens[self.position] {
1495                let d = bytes.clone();
1496                self.position += 1;
1497                d
1498            } else {
1499                // Fallback: collect tokens until EI (for backwards compat with edge cases)
1500                self.collect_inline_image_data_from_tokens()?
1501            }
1502        } else {
1503            Vec::new()
1504        };
1505
1506        Ok(ContentOperation::InlineImage { params, data })
1507    }
1508
1509    /// Fallback data collection when InlineImageData token is not present.
1510    /// This handles edge cases where the tokenizer couldn't detect the ID/EI boundary.
1511    fn collect_inline_image_data_from_tokens(&mut self) -> ParseResult<Vec<u8>> {
1512        let mut data = Vec::new();
1513        while self.position < self.tokens.len() {
1514            if let Token::Operator(op) = &self.tokens[self.position] {
1515                if op == "EI" {
1516                    self.position += 1;
1517                    break;
1518                }
1519            }
1520            match &self.tokens[self.position] {
1521                Token::String(bytes) | Token::HexString(bytes) => {
1522                    data.extend_from_slice(bytes);
1523                }
1524                Token::Integer(n) => data.extend_from_slice(n.to_string().as_bytes()),
1525                Token::Number(n) => data.extend_from_slice(n.to_string().as_bytes()),
1526                Token::Name(s) | Token::Operator(s) => data.extend_from_slice(s.as_bytes()),
1527                _ => {}
1528            }
1529            self.position += 1;
1530        }
1531        Ok(data)
1532    }
1533}
1534
1535/// Expand abbreviated inline image key names to full names
1536fn expand_inline_key(key: &str) -> String {
1537    match key {
1538        "W" => "Width".to_string(),
1539        "H" => "Height".to_string(),
1540        "CS" | "ColorSpace" => "ColorSpace".to_string(),
1541        "BPC" | "BitsPerComponent" => "BitsPerComponent".to_string(),
1542        "F" => "Filter".to_string(),
1543        "DP" | "DecodeParms" => "DecodeParms".to_string(),
1544        "IM" => "ImageMask".to_string(),
1545        "I" => "Interpolate".to_string(),
1546        "Intent" => "Intent".to_string(),
1547        "D" => "Decode".to_string(),
1548        _ => key.to_string(),
1549    }
1550}
1551
1552/// Expand abbreviated inline image color space names
1553fn expand_inline_name(name: &str) -> String {
1554    match name {
1555        "G" => "DeviceGray".to_string(),
1556        "RGB" => "DeviceRGB".to_string(),
1557        "CMYK" => "DeviceCMYK".to_string(),
1558        "I" => "Indexed".to_string(),
1559        "AHx" => "ASCIIHexDecode".to_string(),
1560        "A85" => "ASCII85Decode".to_string(),
1561        "LZW" => "LZWDecode".to_string(),
1562        "Fl" => "FlateDecode".to_string(),
1563        "RL" => "RunLengthDecode".to_string(),
1564        "DCT" => "DCTDecode".to_string(),
1565        "CCF" => "CCITTFaxDecode".to_string(),
1566        _ => name.to_string(),
1567    }
1568}
1569
1570#[cfg(test)]
1571mod tests {
1572    use super::*;
1573
1574    #[test]
1575    fn test_tokenize_numbers() {
1576        let input = b"123 -45 3.14159 -0.5 .5";
1577        let mut tokenizer = ContentTokenizer::new(input);
1578
1579        assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Integer(123)));
1580        assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Integer(-45)));
1581        assert_eq!(
1582            tokenizer.next_token().unwrap(),
1583            Some(Token::Number(3.14159))
1584        );
1585        assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(-0.5)));
1586        assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(0.5)));
1587        assert_eq!(tokenizer.next_token().unwrap(), None);
1588    }
1589
1590    #[test]
1591    fn test_tokenize_strings() {
1592        let input = b"(Hello World) (Hello\\nWorld) (Nested (paren))";
1593        let mut tokenizer = ContentTokenizer::new(input);
1594
1595        assert_eq!(
1596            tokenizer.next_token().unwrap(),
1597            Some(Token::String(b"Hello World".to_vec()))
1598        );
1599        assert_eq!(
1600            tokenizer.next_token().unwrap(),
1601            Some(Token::String(b"Hello\nWorld".to_vec()))
1602        );
1603        assert_eq!(
1604            tokenizer.next_token().unwrap(),
1605            Some(Token::String(b"Nested (paren)".to_vec()))
1606        );
1607    }
1608
1609    #[test]
1610    fn test_tokenize_hex_strings() {
1611        let input = b"<48656C6C6F> <48 65 6C 6C 6F>";
1612        let mut tokenizer = ContentTokenizer::new(input);
1613
1614        assert_eq!(
1615            tokenizer.next_token().unwrap(),
1616            Some(Token::HexString(b"Hello".to_vec()))
1617        );
1618        assert_eq!(
1619            tokenizer.next_token().unwrap(),
1620            Some(Token::HexString(b"Hello".to_vec()))
1621        );
1622    }
1623
1624    #[test]
1625    fn test_tokenize_names() {
1626        let input = b"/Name /Name#20with#20spaces /A#42C";
1627        let mut tokenizer = ContentTokenizer::new(input);
1628
1629        assert_eq!(
1630            tokenizer.next_token().unwrap(),
1631            Some(Token::Name("Name".to_string()))
1632        );
1633        assert_eq!(
1634            tokenizer.next_token().unwrap(),
1635            Some(Token::Name("Name with spaces".to_string()))
1636        );
1637        assert_eq!(
1638            tokenizer.next_token().unwrap(),
1639            Some(Token::Name("ABC".to_string()))
1640        );
1641    }
1642
1643    #[test]
1644    fn test_tokenize_operators() {
1645        let input = b"BT Tj ET q Q";
1646        let mut tokenizer = ContentTokenizer::new(input);
1647
1648        assert_eq!(
1649            tokenizer.next_token().unwrap(),
1650            Some(Token::Operator("BT".to_string()))
1651        );
1652        assert_eq!(
1653            tokenizer.next_token().unwrap(),
1654            Some(Token::Operator("Tj".to_string()))
1655        );
1656        assert_eq!(
1657            tokenizer.next_token().unwrap(),
1658            Some(Token::Operator("ET".to_string()))
1659        );
1660        assert_eq!(
1661            tokenizer.next_token().unwrap(),
1662            Some(Token::Operator("q".to_string()))
1663        );
1664        assert_eq!(
1665            tokenizer.next_token().unwrap(),
1666            Some(Token::Operator("Q".to_string()))
1667        );
1668    }
1669
1670    #[test]
1671    fn test_parse_text_operators() {
1672        let content = b"BT /F1 12 Tf 100 200 Td (Hello World) Tj ET";
1673        let operators = ContentParser::parse(content).unwrap();
1674
1675        assert_eq!(operators.len(), 5);
1676        assert_eq!(operators[0], ContentOperation::BeginText);
1677        assert_eq!(
1678            operators[1],
1679            ContentOperation::SetFont("F1".to_string(), 12.0)
1680        );
1681        assert_eq!(operators[2], ContentOperation::MoveText(100.0, 200.0));
1682        assert_eq!(
1683            operators[3],
1684            ContentOperation::ShowText(b"Hello World".to_vec())
1685        );
1686        assert_eq!(operators[4], ContentOperation::EndText);
1687    }
1688
1689    #[test]
1690    fn test_parse_graphics_operators() {
1691        let content = b"q 1 0 0 1 50 50 cm 2 w 0 0 100 100 re S Q";
1692        let operators = ContentParser::parse(content).unwrap();
1693
1694        assert_eq!(operators.len(), 6);
1695        assert_eq!(operators[0], ContentOperation::SaveGraphicsState);
1696        assert_eq!(
1697            operators[1],
1698            ContentOperation::SetTransformMatrix(1.0, 0.0, 0.0, 1.0, 50.0, 50.0)
1699        );
1700        assert_eq!(operators[2], ContentOperation::SetLineWidth(2.0));
1701        assert_eq!(
1702            operators[3],
1703            ContentOperation::Rectangle(0.0, 0.0, 100.0, 100.0)
1704        );
1705        assert_eq!(operators[4], ContentOperation::Stroke);
1706        assert_eq!(operators[5], ContentOperation::RestoreGraphicsState);
1707    }
1708
1709    #[test]
1710    fn test_parse_color_operators() {
1711        let content = b"0.5 g 1 0 0 rg 0 0 0 1 k";
1712        let operators = ContentParser::parse(content).unwrap();
1713
1714        assert_eq!(operators.len(), 3);
1715        assert_eq!(operators[0], ContentOperation::SetNonStrokingGray(0.5));
1716        assert_eq!(
1717            operators[1],
1718            ContentOperation::SetNonStrokingRGB(1.0, 0.0, 0.0)
1719        );
1720        assert_eq!(
1721            operators[2],
1722            ContentOperation::SetNonStrokingCMYK(0.0, 0.0, 0.0, 1.0)
1723        );
1724    }
1725
1726    // Comprehensive tests for all ContentOperation variants
1727    mod comprehensive_tests {
1728        use super::*;
1729
1730        #[test]
1731        fn test_all_text_operators() {
1732            // Test basic text operators that work with current parser
1733            let content = b"BT 5 Tc 10 Tw 120 Tz 15 TL /F1 12 Tf 1 Tr 5 Ts 100 200 Td 50 150 TD T* (Hello) Tj ET";
1734            let operators = ContentParser::parse(content).unwrap();
1735
1736            assert_eq!(operators[0], ContentOperation::BeginText);
1737            assert_eq!(operators[1], ContentOperation::SetCharSpacing(5.0));
1738            assert_eq!(operators[2], ContentOperation::SetWordSpacing(10.0));
1739            assert_eq!(operators[3], ContentOperation::SetHorizontalScaling(120.0));
1740            assert_eq!(operators[4], ContentOperation::SetLeading(15.0));
1741            assert_eq!(
1742                operators[5],
1743                ContentOperation::SetFont("F1".to_string(), 12.0)
1744            );
1745            assert_eq!(operators[6], ContentOperation::SetTextRenderMode(1));
1746            assert_eq!(operators[7], ContentOperation::SetTextRise(5.0));
1747            assert_eq!(operators[8], ContentOperation::MoveText(100.0, 200.0));
1748            assert_eq!(
1749                operators[9],
1750                ContentOperation::MoveTextSetLeading(50.0, 150.0)
1751            );
1752            assert_eq!(operators[10], ContentOperation::NextLine);
1753            assert_eq!(operators[11], ContentOperation::ShowText(b"Hello".to_vec()));
1754            assert_eq!(operators[12], ContentOperation::EndText);
1755        }
1756
1757        #[test]
1758        fn test_all_graphics_state_operators() {
1759            // Test basic graphics state operators without arrays
1760            let content = b"q Q 1 0 0 1 50 50 cm 2 w 1 J 2 j 10 M /GS1 gs 0.5 i /Perceptual ri";
1761            let operators = ContentParser::parse(content).unwrap();
1762
1763            assert_eq!(operators[0], ContentOperation::SaveGraphicsState);
1764            assert_eq!(operators[1], ContentOperation::RestoreGraphicsState);
1765            assert_eq!(
1766                operators[2],
1767                ContentOperation::SetTransformMatrix(1.0, 0.0, 0.0, 1.0, 50.0, 50.0)
1768            );
1769            assert_eq!(operators[3], ContentOperation::SetLineWidth(2.0));
1770            assert_eq!(operators[4], ContentOperation::SetLineCap(1));
1771            assert_eq!(operators[5], ContentOperation::SetLineJoin(2));
1772            assert_eq!(operators[6], ContentOperation::SetMiterLimit(10.0));
1773            assert_eq!(
1774                operators[7],
1775                ContentOperation::SetGraphicsStateParams("GS1".to_string())
1776            );
1777            assert_eq!(operators[8], ContentOperation::SetFlatness(0.5));
1778            assert_eq!(
1779                operators[9],
1780                ContentOperation::SetIntent("Perceptual".to_string())
1781            );
1782        }
1783
1784        #[test]
1785        fn test_all_path_construction_operators() {
1786            let content = b"100 200 m 150 200 l 200 200 250 250 300 200 c 250 180 300 200 v 200 180 300 200 y h 50 50 100 100 re";
1787            let operators = ContentParser::parse(content).unwrap();
1788
1789            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
1790            assert_eq!(operators[1], ContentOperation::LineTo(150.0, 200.0));
1791            assert_eq!(
1792                operators[2],
1793                ContentOperation::CurveTo(200.0, 200.0, 250.0, 250.0, 300.0, 200.0)
1794            );
1795            assert_eq!(
1796                operators[3],
1797                ContentOperation::CurveToV(250.0, 180.0, 300.0, 200.0)
1798            );
1799            assert_eq!(
1800                operators[4],
1801                ContentOperation::CurveToY(200.0, 180.0, 300.0, 200.0)
1802            );
1803            assert_eq!(operators[5], ContentOperation::ClosePath);
1804            assert_eq!(
1805                operators[6],
1806                ContentOperation::Rectangle(50.0, 50.0, 100.0, 100.0)
1807            );
1808        }
1809
1810        #[test]
1811        fn test_all_path_painting_operators() {
1812            let content = b"S s f F f* B B* b b* n W W*";
1813            let operators = ContentParser::parse(content).unwrap();
1814
1815            assert_eq!(operators[0], ContentOperation::Stroke);
1816            assert_eq!(operators[1], ContentOperation::CloseStroke);
1817            assert_eq!(operators[2], ContentOperation::Fill);
1818            assert_eq!(operators[3], ContentOperation::Fill); // F is alias for f
1819            assert_eq!(operators[4], ContentOperation::FillEvenOdd);
1820            assert_eq!(operators[5], ContentOperation::FillStroke);
1821            assert_eq!(operators[6], ContentOperation::FillStrokeEvenOdd);
1822            assert_eq!(operators[7], ContentOperation::CloseFillStroke);
1823            assert_eq!(operators[8], ContentOperation::CloseFillStrokeEvenOdd);
1824            assert_eq!(operators[9], ContentOperation::EndPath);
1825            assert_eq!(operators[10], ContentOperation::Clip);
1826            assert_eq!(operators[11], ContentOperation::ClipEvenOdd);
1827        }
1828
1829        #[test]
1830        fn test_all_color_operators() {
1831            // Test basic color operators that work with current parser
1832            let content = b"/DeviceRGB CS /DeviceGray cs 0.7 G 0.4 g 1 0 0 RG 0 1 0 rg 0 0 0 1 K 0.2 0.3 0.4 0.5 k /Shade1 sh";
1833            let operators = ContentParser::parse(content).unwrap();
1834
1835            assert_eq!(
1836                operators[0],
1837                ContentOperation::SetStrokingColorSpace("DeviceRGB".to_string())
1838            );
1839            assert_eq!(
1840                operators[1],
1841                ContentOperation::SetNonStrokingColorSpace("DeviceGray".to_string())
1842            );
1843            assert_eq!(operators[2], ContentOperation::SetStrokingGray(0.7));
1844            assert_eq!(operators[3], ContentOperation::SetNonStrokingGray(0.4));
1845            assert_eq!(
1846                operators[4],
1847                ContentOperation::SetStrokingRGB(1.0, 0.0, 0.0)
1848            );
1849            assert_eq!(
1850                operators[5],
1851                ContentOperation::SetNonStrokingRGB(0.0, 1.0, 0.0)
1852            );
1853            assert_eq!(
1854                operators[6],
1855                ContentOperation::SetStrokingCMYK(0.0, 0.0, 0.0, 1.0)
1856            );
1857            assert_eq!(
1858                operators[7],
1859                ContentOperation::SetNonStrokingCMYK(0.2, 0.3, 0.4, 0.5)
1860            );
1861            assert_eq!(
1862                operators[8],
1863                ContentOperation::ShadingFill("Shade1".to_string())
1864            );
1865        }
1866
1867        #[test]
1868        fn test_xobject_and_marked_content_operators() {
1869            // Test basic XObject and marked content operators
1870            let content = b"/Image1 Do /MC1 BMC EMC /MP1 MP BX EX";
1871            let operators = ContentParser::parse(content).unwrap();
1872
1873            assert_eq!(
1874                operators[0],
1875                ContentOperation::PaintXObject("Image1".to_string())
1876            );
1877            assert_eq!(
1878                operators[1],
1879                ContentOperation::BeginMarkedContent("MC1".to_string())
1880            );
1881            assert_eq!(operators[2], ContentOperation::EndMarkedContent);
1882            assert_eq!(
1883                operators[3],
1884                ContentOperation::DefineMarkedContentPoint("MP1".to_string())
1885            );
1886            assert_eq!(operators[4], ContentOperation::BeginCompatibility);
1887            assert_eq!(operators[5], ContentOperation::EndCompatibility);
1888        }
1889
1890        #[test]
1891        fn test_complex_content_stream() {
1892            let content = b"q 0.5 0 0 0.5 100 100 cm BT /F1 12 Tf 0 0 Td (Complex) Tj ET Q";
1893            let operators = ContentParser::parse(content).unwrap();
1894
1895            assert_eq!(operators.len(), 8);
1896            assert_eq!(operators[0], ContentOperation::SaveGraphicsState);
1897            assert_eq!(
1898                operators[1],
1899                ContentOperation::SetTransformMatrix(0.5, 0.0, 0.0, 0.5, 100.0, 100.0)
1900            );
1901            assert_eq!(operators[2], ContentOperation::BeginText);
1902            assert_eq!(
1903                operators[3],
1904                ContentOperation::SetFont("F1".to_string(), 12.0)
1905            );
1906            assert_eq!(operators[4], ContentOperation::MoveText(0.0, 0.0));
1907            assert_eq!(
1908                operators[5],
1909                ContentOperation::ShowText(b"Complex".to_vec())
1910            );
1911            assert_eq!(operators[6], ContentOperation::EndText);
1912            assert_eq!(operators[7], ContentOperation::RestoreGraphicsState);
1913        }
1914
1915        #[test]
1916        fn test_tokenizer_whitespace_handling() {
1917            let input = b"  \t\n\r  BT  \t\n  /F1   12.5  \t Tf  \n\r  ET  ";
1918            let mut tokenizer = ContentTokenizer::new(input);
1919
1920            assert_eq!(
1921                tokenizer.next_token().unwrap(),
1922                Some(Token::Operator("BT".to_string()))
1923            );
1924            assert_eq!(
1925                tokenizer.next_token().unwrap(),
1926                Some(Token::Name("F1".to_string()))
1927            );
1928            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(12.5)));
1929            assert_eq!(
1930                tokenizer.next_token().unwrap(),
1931                Some(Token::Operator("Tf".to_string()))
1932            );
1933            assert_eq!(
1934                tokenizer.next_token().unwrap(),
1935                Some(Token::Operator("ET".to_string()))
1936            );
1937            assert_eq!(tokenizer.next_token().unwrap(), None);
1938        }
1939
1940        #[test]
1941        fn test_tokenizer_edge_cases() {
1942            // Test basic number formats that are actually supported
1943            let input = b"0 .5 -.5 +.5 123. .123 1.23 -1.23";
1944            let mut tokenizer = ContentTokenizer::new(input);
1945
1946            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Integer(0)));
1947            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(0.5)));
1948            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(-0.5)));
1949            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(0.5)));
1950            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(123.0)));
1951            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(0.123)));
1952            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(1.23)));
1953            assert_eq!(tokenizer.next_token().unwrap(), Some(Token::Number(-1.23)));
1954        }
1955
1956        #[test]
1957        fn test_string_parsing_edge_cases() {
1958            let input = b"(Simple) (With\\\\backslash) (With\\)paren) (With\\newline) (With\\ttab) (With\\rcarriage) (With\\bbackspace) (With\\fformfeed) (With\\(leftparen) (With\\)rightparen) (With\\377octal) (With\\dddoctal)";
1959            let mut tokenizer = ContentTokenizer::new(input);
1960
1961            assert_eq!(
1962                tokenizer.next_token().unwrap(),
1963                Some(Token::String(b"Simple".to_vec()))
1964            );
1965            assert_eq!(
1966                tokenizer.next_token().unwrap(),
1967                Some(Token::String(b"With\\backslash".to_vec()))
1968            );
1969            assert_eq!(
1970                tokenizer.next_token().unwrap(),
1971                Some(Token::String(b"With)paren".to_vec()))
1972            );
1973            assert_eq!(
1974                tokenizer.next_token().unwrap(),
1975                Some(Token::String(b"With\newline".to_vec()))
1976            );
1977            assert_eq!(
1978                tokenizer.next_token().unwrap(),
1979                Some(Token::String(b"With\ttab".to_vec()))
1980            );
1981            assert_eq!(
1982                tokenizer.next_token().unwrap(),
1983                Some(Token::String(b"With\rcarriage".to_vec()))
1984            );
1985            assert_eq!(
1986                tokenizer.next_token().unwrap(),
1987                Some(Token::String(b"With\x08backspace".to_vec()))
1988            );
1989            assert_eq!(
1990                tokenizer.next_token().unwrap(),
1991                Some(Token::String(b"With\x0Cformfeed".to_vec()))
1992            );
1993            assert_eq!(
1994                tokenizer.next_token().unwrap(),
1995                Some(Token::String(b"With(leftparen".to_vec()))
1996            );
1997            assert_eq!(
1998                tokenizer.next_token().unwrap(),
1999                Some(Token::String(b"With)rightparen".to_vec()))
2000            );
2001        }
2002
2003        #[test]
2004        fn test_hex_string_parsing() {
2005            let input = b"<48656C6C6F> <48 65 6C 6C 6F> <48656C6C6F57> <48656C6C6F5>";
2006            let mut tokenizer = ContentTokenizer::new(input);
2007
2008            assert_eq!(
2009                tokenizer.next_token().unwrap(),
2010                Some(Token::HexString(b"Hello".to_vec()))
2011            );
2012            assert_eq!(
2013                tokenizer.next_token().unwrap(),
2014                Some(Token::HexString(b"Hello".to_vec()))
2015            );
2016            assert_eq!(
2017                tokenizer.next_token().unwrap(),
2018                Some(Token::HexString(b"HelloW".to_vec()))
2019            );
2020            assert_eq!(
2021                tokenizer.next_token().unwrap(),
2022                Some(Token::HexString(b"Hello\x50".to_vec()))
2023            );
2024        }
2025
2026        #[test]
2027        fn test_name_parsing_edge_cases() {
2028            let input = b"/Name /Name#20with#20spaces /Name#23with#23hash /Name#2Fwith#2Fslash /#45mptyName";
2029            let mut tokenizer = ContentTokenizer::new(input);
2030
2031            assert_eq!(
2032                tokenizer.next_token().unwrap(),
2033                Some(Token::Name("Name".to_string()))
2034            );
2035            assert_eq!(
2036                tokenizer.next_token().unwrap(),
2037                Some(Token::Name("Name with spaces".to_string()))
2038            );
2039            assert_eq!(
2040                tokenizer.next_token().unwrap(),
2041                Some(Token::Name("Name#with#hash".to_string()))
2042            );
2043            assert_eq!(
2044                tokenizer.next_token().unwrap(),
2045                Some(Token::Name("Name/with/slash".to_string()))
2046            );
2047            assert_eq!(
2048                tokenizer.next_token().unwrap(),
2049                Some(Token::Name("EmptyName".to_string()))
2050            );
2051        }
2052
2053        #[test]
2054        fn test_operator_parsing_edge_cases() {
2055            let content = b"q q q Q Q Q BT BT ET ET";
2056            let operators = ContentParser::parse(content).unwrap();
2057
2058            assert_eq!(operators.len(), 10);
2059            assert_eq!(operators[0], ContentOperation::SaveGraphicsState);
2060            assert_eq!(operators[1], ContentOperation::SaveGraphicsState);
2061            assert_eq!(operators[2], ContentOperation::SaveGraphicsState);
2062            assert_eq!(operators[3], ContentOperation::RestoreGraphicsState);
2063            assert_eq!(operators[4], ContentOperation::RestoreGraphicsState);
2064            assert_eq!(operators[5], ContentOperation::RestoreGraphicsState);
2065            assert_eq!(operators[6], ContentOperation::BeginText);
2066            assert_eq!(operators[7], ContentOperation::BeginText);
2067            assert_eq!(operators[8], ContentOperation::EndText);
2068            assert_eq!(operators[9], ContentOperation::EndText);
2069        }
2070
2071        #[test]
2072        fn test_error_handling_insufficient_operands() {
2073            let content = b"100 Td"; // Missing y coordinate
2074            let result = ContentParser::parse(content);
2075            assert!(result.is_err());
2076        }
2077
2078        #[test]
2079        fn test_error_handling_invalid_operator() {
2080            let content = b"100 200 INVALID";
2081            let result = ContentParser::parse(content);
2082            assert!(result.is_err());
2083        }
2084
2085        #[test]
2086        fn test_error_handling_malformed_string() {
2087            // Test that the tokenizer handles malformed strings appropriately
2088            let input = b"(Unclosed string";
2089            let mut tokenizer = ContentTokenizer::new(input);
2090            let result = tokenizer.next_token();
2091            // The current implementation may not detect this as an error
2092            // so we'll just test that we get some result
2093            assert!(result.is_ok() || result.is_err());
2094        }
2095
2096        #[test]
2097        fn test_error_handling_malformed_hex_string() {
2098            let input = b"<48656C6C6G>";
2099            let mut tokenizer = ContentTokenizer::new(input);
2100            let result = tokenizer.next_token();
2101            assert!(result.is_err());
2102        }
2103
2104        #[test]
2105        fn test_error_handling_malformed_name() {
2106            let input = b"/Name#GG";
2107            let mut tokenizer = ContentTokenizer::new(input);
2108            let result = tokenizer.next_token();
2109            assert!(result.is_err());
2110        }
2111
2112        #[test]
2113        fn test_empty_content_stream() {
2114            let content = b"";
2115            let operators = ContentParser::parse(content).unwrap();
2116            assert_eq!(operators.len(), 0);
2117        }
2118
2119        #[test]
2120        fn test_whitespace_only_content_stream() {
2121            let content = b"   \t\n\r   ";
2122            let operators = ContentParser::parse(content).unwrap();
2123            assert_eq!(operators.len(), 0);
2124        }
2125
2126        #[test]
2127        fn test_mixed_integer_and_real_operands() {
2128            // Test with simple operands that work with current parser
2129            let content = b"100 200 m 150 200 l";
2130            let operators = ContentParser::parse(content).unwrap();
2131
2132            assert_eq!(operators.len(), 2);
2133            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
2134            assert_eq!(operators[1], ContentOperation::LineTo(150.0, 200.0));
2135        }
2136
2137        #[test]
2138        fn test_negative_operands() {
2139            let content = b"-100 -200 Td -50.5 -75.2 TD";
2140            let operators = ContentParser::parse(content).unwrap();
2141
2142            assert_eq!(operators.len(), 2);
2143            assert_eq!(operators[0], ContentOperation::MoveText(-100.0, -200.0));
2144            assert_eq!(
2145                operators[1],
2146                ContentOperation::MoveTextSetLeading(-50.5, -75.2)
2147            );
2148        }
2149
2150        #[test]
2151        fn test_large_numbers() {
2152            let content = b"999999.999999 -999999.999999 m";
2153            let operators = ContentParser::parse(content).unwrap();
2154
2155            assert_eq!(operators.len(), 1);
2156            assert_eq!(
2157                operators[0],
2158                ContentOperation::MoveTo(999999.999999, -999999.999999)
2159            );
2160        }
2161
2162        #[test]
2163        fn test_scientific_notation() {
2164            // Test with simple decimal numbers since scientific notation isn't implemented
2165            let content = b"123.45 -456.78 m";
2166            let operators = ContentParser::parse(content).unwrap();
2167
2168            assert_eq!(operators.len(), 1);
2169            assert_eq!(operators[0], ContentOperation::MoveTo(123.45, -456.78));
2170        }
2171
2172        #[test]
2173        fn test_show_text_array_complex() {
2174            // Test simple text array without complex syntax
2175            let content = b"(Hello) TJ";
2176            let result = ContentParser::parse(content);
2177            // This should fail since TJ expects array, but test the error handling
2178            assert!(result.is_err());
2179        }
2180
2181        #[test]
2182        fn test_dash_pattern_empty() {
2183            // Test simple dash pattern without array syntax
2184            let content = b"0 d";
2185            let result = ContentParser::parse(content);
2186            // This should fail since dash pattern needs array, but test the error handling
2187            assert!(result.is_err());
2188        }
2189
2190        #[test]
2191        fn test_dash_pattern_complex() {
2192            // Test simple dash pattern without complex array syntax
2193            let content = b"2.5 d";
2194            let result = ContentParser::parse(content);
2195            // This should fail since dash pattern needs array, but test the error handling
2196            assert!(result.is_err());
2197        }
2198
2199        #[test]
2200        fn test_pop_array_removes_array_end() {
2201            // Test that pop_array correctly handles ArrayEnd tokens
2202            let parser = ContentParser::new(b"");
2203
2204            // Test normal array: [1 2 3]
2205            let mut operands = vec![
2206                Token::ArrayStart,
2207                Token::Integer(1),
2208                Token::Integer(2),
2209                Token::Integer(3),
2210                Token::ArrayEnd,
2211            ];
2212            let result = parser.pop_array(&mut operands).unwrap();
2213            assert_eq!(result.len(), 3);
2214            assert!(operands.is_empty());
2215
2216            // Test array without ArrayEnd (backwards compatibility)
2217            let mut operands = vec![Token::ArrayStart, Token::Number(1.5), Token::Number(2.5)];
2218            let result = parser.pop_array(&mut operands).unwrap();
2219            assert_eq!(result.len(), 2);
2220            assert!(operands.is_empty());
2221        }
2222
2223        #[test]
2224        fn test_dash_array_parsing_valid() {
2225            // Test that parser correctly parses valid dash arrays
2226            let parser = ContentParser::new(b"");
2227
2228            // Test with valid numbers only
2229            let valid_tokens = vec![Token::Number(3.0), Token::Integer(2)];
2230            let result = parser.parse_dash_array(valid_tokens).unwrap();
2231            assert_eq!(result, vec![3.0, 2.0]);
2232
2233            // Test empty dash array
2234            let empty_tokens = vec![];
2235            let result = parser.parse_dash_array(empty_tokens).unwrap();
2236            let expected: Vec<f32> = vec![];
2237            assert_eq!(result, expected);
2238        }
2239
2240        #[test]
2241        fn test_text_array_parsing_valid() {
2242            // Test that parser correctly parses valid text arrays
2243            let parser = ContentParser::new(b"");
2244
2245            // Test with valid elements only
2246            let valid_tokens = vec![
2247                Token::String(b"Hello".to_vec()),
2248                Token::Number(-100.0),
2249                Token::String(b"World".to_vec()),
2250            ];
2251            let result = parser.parse_text_array(valid_tokens).unwrap();
2252            assert_eq!(result.len(), 3);
2253        }
2254
2255        #[test]
2256        fn test_inline_image_handling() {
2257            let content = b"BI /W 100 /H 100 /BPC 8 /CS /RGB ID some_image_data EI";
2258            let operators = ContentParser::parse(content).unwrap();
2259
2260            assert_eq!(operators.len(), 1);
2261            match &operators[0] {
2262                ContentOperation::InlineImage { params, data: _ } => {
2263                    // Check parsed parameters
2264                    assert_eq!(params.get("Width"), Some(&Object::Integer(100)));
2265                    assert_eq!(params.get("Height"), Some(&Object::Integer(100)));
2266                    assert_eq!(params.get("BitsPerComponent"), Some(&Object::Integer(8)));
2267                    assert_eq!(
2268                        params.get("ColorSpace"),
2269                        Some(&Object::Name("DeviceRGB".to_string()))
2270                    );
2271                    // Data field is not captured, just verify params
2272                }
2273                _ => panic!("Expected InlineImage operation"),
2274            }
2275        }
2276
2277        #[test]
2278        fn test_inline_image_with_filter() {
2279            let content = b"BI /W 50 /H 50 /CS /G /BPC 1 /F /AHx ID 00FF00FF EI";
2280            let operators = ContentParser::parse(content).unwrap();
2281
2282            assert_eq!(operators.len(), 1);
2283            match &operators[0] {
2284                ContentOperation::InlineImage { params, data: _ } => {
2285                    assert_eq!(params.get("Width"), Some(&Object::Integer(50)));
2286                    assert_eq!(params.get("Height"), Some(&Object::Integer(50)));
2287                    assert_eq!(
2288                        params.get("ColorSpace"),
2289                        Some(&Object::Name("DeviceGray".to_string()))
2290                    );
2291                    assert_eq!(params.get("BitsPerComponent"), Some(&Object::Integer(1)));
2292                    assert_eq!(
2293                        params.get("Filter"),
2294                        Some(&Object::Name("ASCIIHexDecode".to_string()))
2295                    );
2296                }
2297                _ => panic!("Expected InlineImage operation"),
2298            }
2299        }
2300
2301        #[test]
2302        fn test_content_parser_performance() {
2303            let mut content = Vec::new();
2304            for i in 0..1000 {
2305                content.extend_from_slice(format!("{} {} m ", i, i + 1).as_bytes());
2306            }
2307
2308            let start = std::time::Instant::now();
2309            let operators = ContentParser::parse(&content).unwrap();
2310            let duration = start.elapsed();
2311
2312            assert_eq!(operators.len(), 1000);
2313            assert!(duration.as_millis() < 100); // Should parse 1000 operators in under 100ms
2314        }
2315
2316        #[test]
2317        fn test_tokenizer_performance() {
2318            let mut input = Vec::new();
2319            for i in 0..1000 {
2320                input.extend_from_slice(format!("{} {} ", i, i + 1).as_bytes());
2321            }
2322
2323            let start = std::time::Instant::now();
2324            let mut tokenizer = ContentTokenizer::new(&input);
2325            let mut count = 0;
2326            while tokenizer.next_token().unwrap().is_some() {
2327                count += 1;
2328            }
2329            let duration = start.elapsed();
2330
2331            assert_eq!(count, 2000); // 1000 pairs of numbers
2332            assert!(duration.as_millis() < 50); // Should tokenize 2000 tokens in under 50ms
2333        }
2334
2335        #[test]
2336        fn test_memory_usage_large_content() {
2337            let mut content = Vec::new();
2338            for i in 0..10000 {
2339                content.extend_from_slice(
2340                    format!("{} {} {} {} {} {} c ", i, i + 1, i + 2, i + 3, i + 4, i + 5)
2341                        .as_bytes(),
2342                );
2343            }
2344
2345            let operators = ContentParser::parse(&content).unwrap();
2346            assert_eq!(operators.len(), 10000);
2347
2348            // Verify all operations are CurveTo
2349            for op in operators {
2350                matches!(op, ContentOperation::CurveTo(_, _, _, _, _, _));
2351            }
2352        }
2353
2354        #[test]
2355        fn test_concurrent_parsing() {
2356            use std::sync::Arc;
2357            use std::thread;
2358
2359            let content = Arc::new(b"BT /F1 12 Tf 100 200 Td (Hello) Tj ET".to_vec());
2360            let handles: Vec<_> = (0..10)
2361                .map(|_| {
2362                    let content_clone = content.clone();
2363                    thread::spawn(move || ContentParser::parse(&content_clone).unwrap())
2364                })
2365                .collect();
2366
2367            for handle in handles {
2368                let operators = handle.join().unwrap();
2369                assert_eq!(operators.len(), 5);
2370                assert_eq!(operators[0], ContentOperation::BeginText);
2371                assert_eq!(operators[4], ContentOperation::EndText);
2372            }
2373        }
2374
2375        // ========== NEW COMPREHENSIVE TESTS ==========
2376
2377        #[test]
2378        fn test_tokenizer_hex_string_edge_cases() {
2379            let mut tokenizer = ContentTokenizer::new(b"<>");
2380            let token = tokenizer.next_token().unwrap().unwrap();
2381            match token {
2382                Token::HexString(data) => assert!(data.is_empty()),
2383                _ => panic!("Expected empty hex string"),
2384            }
2385
2386            // Odd number of hex digits
2387            let mut tokenizer = ContentTokenizer::new(b"<123>");
2388            let token = tokenizer.next_token().unwrap().unwrap();
2389            match token {
2390                Token::HexString(data) => assert_eq!(data, vec![0x12, 0x30]),
2391                _ => panic!("Expected hex string with odd digits"),
2392            }
2393
2394            // Hex string with whitespace
2395            let mut tokenizer = ContentTokenizer::new(b"<12 34\t56\n78>");
2396            let token = tokenizer.next_token().unwrap().unwrap();
2397            match token {
2398                Token::HexString(data) => assert_eq!(data, vec![0x12, 0x34, 0x56, 0x78]),
2399                _ => panic!("Expected hex string with whitespace"),
2400            }
2401        }
2402
2403        #[test]
2404        fn test_tokenizer_literal_string_escape_sequences() {
2405            // Test all standard escape sequences
2406            let mut tokenizer = ContentTokenizer::new(b"(\\n\\r\\t\\b\\f\\(\\)\\\\)");
2407            let token = tokenizer.next_token().unwrap().unwrap();
2408            match token {
2409                Token::String(data) => {
2410                    assert_eq!(
2411                        data,
2412                        vec![b'\n', b'\r', b'\t', 0x08, 0x0C, b'(', b')', b'\\']
2413                    );
2414                }
2415                _ => panic!("Expected string with escapes"),
2416            }
2417
2418            // Test octal escape sequences
2419            let mut tokenizer = ContentTokenizer::new(b"(\\101\\040\\377)");
2420            let token = tokenizer.next_token().unwrap().unwrap();
2421            match token {
2422                Token::String(data) => assert_eq!(data, vec![b'A', b' ', 255]),
2423                _ => panic!("Expected string with octal escapes"),
2424            }
2425        }
2426
2427        #[test]
2428        fn test_tokenizer_nested_parentheses() {
2429            let mut tokenizer = ContentTokenizer::new(b"(outer (inner) text)");
2430            let token = tokenizer.next_token().unwrap().unwrap();
2431            match token {
2432                Token::String(data) => {
2433                    assert_eq!(data, b"outer (inner) text");
2434                }
2435                _ => panic!("Expected string with nested parentheses"),
2436            }
2437
2438            // Multiple levels of nesting
2439            let mut tokenizer = ContentTokenizer::new(b"(level1 (level2 (level3) back2) back1)");
2440            let token = tokenizer.next_token().unwrap().unwrap();
2441            match token {
2442                Token::String(data) => {
2443                    assert_eq!(data, b"level1 (level2 (level3) back2) back1");
2444                }
2445                _ => panic!("Expected string with deep nesting"),
2446            }
2447        }
2448
2449        #[test]
2450        fn test_tokenizer_name_hex_escapes() {
2451            let mut tokenizer = ContentTokenizer::new(b"/Name#20With#20Spaces");
2452            let token = tokenizer.next_token().unwrap().unwrap();
2453            match token {
2454                Token::Name(name) => assert_eq!(name, "Name With Spaces"),
2455                _ => panic!("Expected name with hex escapes"),
2456            }
2457
2458            // Test various special characters
2459            let mut tokenizer = ContentTokenizer::new(b"/Special#2F#28#29#3C#3E");
2460            let token = tokenizer.next_token().unwrap().unwrap();
2461            match token {
2462                Token::Name(name) => assert_eq!(name, "Special/()<>"),
2463                _ => panic!("Expected name with special character escapes"),
2464            }
2465        }
2466
2467        #[test]
2468        fn test_tokenizer_number_edge_cases() {
2469            // Very large integers
2470            let mut tokenizer = ContentTokenizer::new(b"2147483647");
2471            let token = tokenizer.next_token().unwrap().unwrap();
2472            match token {
2473                Token::Integer(n) => assert_eq!(n, 2147483647),
2474                _ => panic!("Expected large integer"),
2475            }
2476
2477            // Very small numbers
2478            let mut tokenizer = ContentTokenizer::new(b"0.00001");
2479            let token = tokenizer.next_token().unwrap().unwrap();
2480            match token {
2481                Token::Number(n) => assert!((n - 0.00001).abs() < f32::EPSILON),
2482                _ => panic!("Expected small float"),
2483            }
2484
2485            // Numbers starting with dot
2486            let mut tokenizer = ContentTokenizer::new(b".5");
2487            let token = tokenizer.next_token().unwrap().unwrap();
2488            match token {
2489                Token::Number(n) => assert!((n - 0.5).abs() < f32::EPSILON),
2490                _ => panic!("Expected float starting with dot"),
2491            }
2492        }
2493
2494        #[test]
2495        fn test_parser_complex_path_operations() {
2496            let content = b"100 200 m 150 200 l 150 250 l 100 250 l h f";
2497            let operators = ContentParser::parse(content).unwrap();
2498
2499            assert_eq!(operators.len(), 6);
2500            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
2501            assert_eq!(operators[1], ContentOperation::LineTo(150.0, 200.0));
2502            assert_eq!(operators[2], ContentOperation::LineTo(150.0, 250.0));
2503            assert_eq!(operators[3], ContentOperation::LineTo(100.0, 250.0));
2504            assert_eq!(operators[4], ContentOperation::ClosePath);
2505            assert_eq!(operators[5], ContentOperation::Fill);
2506        }
2507
2508        #[test]
2509        fn test_parser_bezier_curves() {
2510            let content = b"100 100 150 50 200 150 c";
2511            let operators = ContentParser::parse(content).unwrap();
2512
2513            assert_eq!(operators.len(), 1);
2514            match &operators[0] {
2515                ContentOperation::CurveTo(x1, y1, x2, y2, x3, y3) => {
2516                    // Values are parsed in reverse order: last 6 values for c operator
2517                    // Stack order: 100 100 150 50 200 150
2518                    // Pop order: x1=100, y1=100, x2=150, y2=50, x3=200, y3=150
2519                    assert!(x1.is_finite() && y1.is_finite());
2520                    assert!(x2.is_finite() && y2.is_finite());
2521                    assert!(x3.is_finite() && y3.is_finite());
2522                    // Verify we have 6 coordinate values
2523                    assert!(*x1 >= 50.0 && *x1 <= 200.0);
2524                    assert!(*y1 >= 50.0 && *y1 <= 200.0);
2525                }
2526                _ => panic!("Expected CurveTo operation"),
2527            }
2528        }
2529
2530        #[test]
2531        fn test_parser_color_operations() {
2532            let content = b"0.5 g 1 0 0 rg 0 1 0 1 k /DeviceRGB cs 0.2 0.4 0.6 sc";
2533            let operators = ContentParser::parse(content).unwrap();
2534
2535            assert_eq!(operators.len(), 5);
2536            match &operators[0] {
2537                ContentOperation::SetNonStrokingGray(gray) => assert_eq!(*gray, 0.5),
2538                _ => panic!("Expected SetNonStrokingGray"),
2539            }
2540            match &operators[1] {
2541                ContentOperation::SetNonStrokingRGB(r, g, b) => {
2542                    assert_eq!((*r, *g, *b), (1.0, 0.0, 0.0));
2543                }
2544                _ => panic!("Expected SetNonStrokingRGB"),
2545            }
2546        }
2547
2548        #[test]
2549        fn test_parser_text_positioning_advanced() {
2550            let content = b"BT 1 0 0 1 100 200 Tm 0 TL 10 TL (Line 1) ' (Line 2) ' ET";
2551            let operators = ContentParser::parse(content).unwrap();
2552
2553            assert_eq!(operators.len(), 7);
2554            assert_eq!(operators[0], ContentOperation::BeginText);
2555            match &operators[1] {
2556                ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
2557                    assert_eq!((*a, *b, *c, *d, *e, *f), (1.0, 0.0, 0.0, 1.0, 100.0, 200.0));
2558                }
2559                _ => panic!("Expected SetTextMatrix"),
2560            }
2561            assert_eq!(operators[6], ContentOperation::EndText);
2562        }
2563
2564        #[test]
2565        fn test_parser_graphics_state_operations() {
2566            let content = b"q 2 0 0 2 100 100 cm 5 w 1 J 2 j 10 M Q";
2567            let operators = ContentParser::parse(content).unwrap();
2568
2569            assert_eq!(operators.len(), 7);
2570            assert_eq!(operators[0], ContentOperation::SaveGraphicsState);
2571            match &operators[1] {
2572                ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
2573                    assert_eq!((*a, *b, *c, *d, *e, *f), (2.0, 0.0, 0.0, 2.0, 100.0, 100.0));
2574                }
2575                _ => panic!("Expected SetTransformMatrix"),
2576            }
2577            assert_eq!(operators[6], ContentOperation::RestoreGraphicsState);
2578        }
2579
2580        #[test]
2581        fn test_parser_xobject_operations() {
2582            let content = b"/Image1 Do /Form2 Do /Pattern3 Do";
2583            let operators = ContentParser::parse(content).unwrap();
2584
2585            assert_eq!(operators.len(), 3);
2586            for (i, expected_name) in ["Image1", "Form2", "Pattern3"].iter().enumerate() {
2587                match &operators[i] {
2588                    ContentOperation::PaintXObject(name) => assert_eq!(name, expected_name),
2589                    _ => panic!("Expected PaintXObject"),
2590                }
2591            }
2592        }
2593
2594        #[test]
2595        fn test_parser_marked_content_operations() {
2596            let content = b"/P BMC (Tagged content) Tj EMC";
2597            let operators = ContentParser::parse(content).unwrap();
2598
2599            assert_eq!(operators.len(), 3);
2600            match &operators[0] {
2601                ContentOperation::BeginMarkedContent(tag) => assert_eq!(tag, "P"),
2602                _ => panic!("Expected BeginMarkedContent"),
2603            }
2604            assert_eq!(operators[2], ContentOperation::EndMarkedContent);
2605        }
2606
2607        #[test]
2608        fn test_parser_error_handling_invalid_operators() {
2609            // Missing operands for move operator
2610            let content = b"m";
2611            let result = ContentParser::parse(content);
2612            assert!(result.is_err());
2613
2614            // Invalid hex string (no closing >)
2615            let content = b"<ABC DEF BT";
2616            let result = ContentParser::parse(content);
2617            assert!(result.is_err());
2618
2619            // Test that we can detect actual parsing errors
2620            let content = b"100 200 300"; // Numbers without operator should parse ok
2621            let result = ContentParser::parse(content);
2622            assert!(result.is_ok()); // This should actually be ok since no operator is attempted
2623        }
2624
2625        #[test]
2626        fn test_parser_whitespace_tolerance() {
2627            let content = b"  \n\t  100   \r\n  200  \t m  \n";
2628            let operators = ContentParser::parse(content).unwrap();
2629
2630            assert_eq!(operators.len(), 1);
2631            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
2632        }
2633
2634        #[test]
2635        fn test_tokenizer_comment_handling() {
2636            let content = b"100 % This is a comment\n200 m % Another comment";
2637            let operators = ContentParser::parse(content).unwrap();
2638
2639            assert_eq!(operators.len(), 1);
2640            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
2641        }
2642
2643        #[test]
2644        fn test_parser_stream_with_binary_data() {
2645            // Test content stream with comment containing binary-like data
2646            let content = b"100 200 m % Comment with \xFF binary\n150 250 l";
2647
2648            let operators = ContentParser::parse(content).unwrap();
2649            assert_eq!(operators.len(), 2);
2650            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
2651            assert_eq!(operators[1], ContentOperation::LineTo(150.0, 250.0));
2652        }
2653
2654        #[test]
2655        fn test_tokenizer_array_parsing() {
2656            // Test simple operations that don't require complex array parsing
2657            let content = b"100 200 m 150 250 l";
2658            let operators = ContentParser::parse(content).unwrap();
2659
2660            assert_eq!(operators.len(), 2);
2661            assert_eq!(operators[0], ContentOperation::MoveTo(100.0, 200.0));
2662            assert_eq!(operators[1], ContentOperation::LineTo(150.0, 250.0));
2663        }
2664
2665        #[test]
2666        fn test_parser_rectangle_operations() {
2667            let content = b"10 20 100 50 re 0 0 200 300 re";
2668            let operators = ContentParser::parse(content).unwrap();
2669
2670            assert_eq!(operators.len(), 2);
2671            match &operators[0] {
2672                ContentOperation::Rectangle(x, y, width, height) => {
2673                    assert_eq!((*x, *y, *width, *height), (10.0, 20.0, 100.0, 50.0));
2674                }
2675                _ => panic!("Expected Rectangle operation"),
2676            }
2677            match &operators[1] {
2678                ContentOperation::Rectangle(x, y, width, height) => {
2679                    assert_eq!((*x, *y, *width, *height), (0.0, 0.0, 200.0, 300.0));
2680                }
2681                _ => panic!("Expected Rectangle operation"),
2682            }
2683        }
2684
2685        #[test]
2686        fn test_parser_clipping_operations() {
2687            let content = b"100 100 50 50 re W n 200 200 75 75 re W* n";
2688            let operators = ContentParser::parse(content).unwrap();
2689
2690            assert_eq!(operators.len(), 6);
2691            assert_eq!(operators[1], ContentOperation::Clip);
2692            assert_eq!(operators[2], ContentOperation::EndPath);
2693            assert_eq!(operators[4], ContentOperation::ClipEvenOdd);
2694            assert_eq!(operators[5], ContentOperation::EndPath);
2695        }
2696
2697        #[test]
2698        fn test_parser_painting_operations() {
2699            let content = b"S s f f* B B* b b*";
2700            let operators = ContentParser::parse(content).unwrap();
2701
2702            assert_eq!(operators.len(), 8);
2703            assert_eq!(operators[0], ContentOperation::Stroke);
2704            assert_eq!(operators[1], ContentOperation::CloseStroke);
2705            assert_eq!(operators[2], ContentOperation::Fill);
2706            assert_eq!(operators[3], ContentOperation::FillEvenOdd);
2707            assert_eq!(operators[4], ContentOperation::FillStroke);
2708            assert_eq!(operators[5], ContentOperation::FillStrokeEvenOdd);
2709            assert_eq!(operators[6], ContentOperation::CloseFillStroke);
2710            assert_eq!(operators[7], ContentOperation::CloseFillStrokeEvenOdd);
2711        }
2712
2713        #[test]
2714        fn test_parser_line_style_operations() {
2715            let content = b"5 w 1 J 2 j 10 M [ 3 2 ] 0 d";
2716            let operators = ContentParser::parse(content).unwrap();
2717
2718            assert_eq!(operators.len(), 5);
2719            assert_eq!(operators[0], ContentOperation::SetLineWidth(5.0));
2720            assert_eq!(operators[1], ContentOperation::SetLineCap(1));
2721            assert_eq!(operators[2], ContentOperation::SetLineJoin(2));
2722            assert_eq!(operators[3], ContentOperation::SetMiterLimit(10.0));
2723            // Dash pattern test would need array support
2724        }
2725
2726        #[test]
2727        fn test_parser_text_state_operations() {
2728            let content = b"12 Tc 3 Tw 100 Tz 1 Tr 2 Ts";
2729            let operators = ContentParser::parse(content).unwrap();
2730
2731            assert_eq!(operators.len(), 5);
2732            assert_eq!(operators[0], ContentOperation::SetCharSpacing(12.0));
2733            assert_eq!(operators[1], ContentOperation::SetWordSpacing(3.0));
2734            assert_eq!(operators[2], ContentOperation::SetHorizontalScaling(100.0));
2735            assert_eq!(operators[3], ContentOperation::SetTextRenderMode(1));
2736            assert_eq!(operators[4], ContentOperation::SetTextRise(2.0));
2737        }
2738
2739        #[test]
2740        fn test_parser_unicode_text() {
2741            let content = b"BT (Hello \xC2\xA9 World \xE2\x9C\x93) Tj ET";
2742            let operators = ContentParser::parse(content).unwrap();
2743
2744            assert_eq!(operators.len(), 3);
2745            assert_eq!(operators[0], ContentOperation::BeginText);
2746            match &operators[1] {
2747                ContentOperation::ShowText(text) => {
2748                    assert!(text.len() > 5); // Should contain Unicode bytes
2749                }
2750                _ => panic!("Expected ShowText operation"),
2751            }
2752            assert_eq!(operators[2], ContentOperation::EndText);
2753        }
2754
2755        #[test]
2756        fn test_parser_stress_test_large_coordinates() {
2757            let content = b"999999.999 -999999.999 999999.999 -999999.999 999999.999 -999999.999 c";
2758            let operators = ContentParser::parse(content).unwrap();
2759
2760            assert_eq!(operators.len(), 1);
2761            match &operators[0] {
2762                ContentOperation::CurveTo(_x1, _y1, _x2, _y2, _x3, _y3) => {
2763                    assert!((*_x1 - 999999.999).abs() < 0.1);
2764                    assert!((*_y1 - (-999999.999)).abs() < 0.1);
2765                    assert!((*_x3 - 999999.999).abs() < 0.1);
2766                }
2767                _ => panic!("Expected CurveTo operation"),
2768            }
2769        }
2770
2771        #[test]
2772        fn test_parser_empty_content_stream() {
2773            let content = b"";
2774            let operators = ContentParser::parse(content).unwrap();
2775            assert!(operators.is_empty());
2776
2777            let content = b"   \n\t\r   ";
2778            let operators = ContentParser::parse(content).unwrap();
2779            assert!(operators.is_empty());
2780        }
2781
2782        #[test]
2783        fn test_tokenizer_error_recovery() {
2784            // Test that parser can handle malformed but recoverable content
2785            let content = b"100 200 m % Comment with\xFFbinary\n150 250 l";
2786            let result = ContentParser::parse(content);
2787            // Should either parse successfully or fail gracefully
2788            assert!(result.is_ok() || result.is_err());
2789        }
2790
2791        #[test]
2792        fn test_parser_optimization_repeated_operations() {
2793            // Test performance with many repeated operations
2794            let mut content = Vec::new();
2795            for i in 0..1000 {
2796                content.extend_from_slice(format!("{} {} m ", i, i * 2).as_bytes());
2797            }
2798
2799            let start = std::time::Instant::now();
2800            let operators = ContentParser::parse(&content).unwrap();
2801            let duration = start.elapsed();
2802
2803            assert_eq!(operators.len(), 1000);
2804            assert!(duration.as_millis() < 200); // Should be fast
2805        }
2806
2807        #[test]
2808        fn test_parser_memory_efficiency_large_strings() {
2809            // Test with large text content
2810            let large_text = "A".repeat(10000);
2811            let content = format!("BT ({}) Tj ET", large_text);
2812            let operators = ContentParser::parse(content.as_bytes()).unwrap();
2813
2814            assert_eq!(operators.len(), 3);
2815            match &operators[1] {
2816                ContentOperation::ShowText(text) => {
2817                    assert_eq!(text.len(), 10000);
2818                }
2819                _ => panic!("Expected ShowText operation"),
2820            }
2821        }
2822    }
2823
2824    #[test]
2825    fn test_content_stream_too_large() {
2826        // Test handling of very large content streams (covering potential size limits)
2827        let mut large_content = Vec::new();
2828
2829        // Create a content stream with many operations
2830        for i in 0..10000 {
2831            large_content.extend_from_slice(format!("{} {} m ", i, i).as_bytes());
2832        }
2833        large_content.extend_from_slice(b"S");
2834
2835        // Should handle large content without panic
2836        let result = ContentParser::parse_content(&large_content);
2837        assert!(result.is_ok());
2838
2839        let operations = result.unwrap();
2840        // Should have many MoveTo operations plus one Stroke
2841        assert!(operations.len() > 10000);
2842    }
2843
2844    #[test]
2845    fn test_invalid_operator_handling() {
2846        // Test parsing with invalid operators
2847        let content = b"100 200 INVALID_OP 300 400 m";
2848        let result = ContentParser::parse_content(content);
2849
2850        // Should either handle gracefully or return error
2851        if let Ok(operations) = result {
2852            // If it succeeds, should have at least the valid MoveTo
2853            assert!(operations
2854                .iter()
2855                .any(|op| matches!(op, ContentOperation::MoveTo(_, _))));
2856        }
2857    }
2858
2859    #[test]
2860    fn test_nested_arrays_malformed() {
2861        // Test malformed nested arrays in TJ operator
2862        let content = b"[[(Hello] [World)]] TJ";
2863        let result = ContentParser::parse_content(content);
2864
2865        // Should handle malformed arrays gracefully
2866        assert!(result.is_ok() || result.is_err());
2867    }
2868
2869    #[test]
2870    fn test_escape_sequences_in_strings() {
2871        // Test various escape sequences in strings
2872        let test_cases = vec![
2873            (b"(\\n\\r\\t)".as_slice(), b"\n\r\t".as_slice()),
2874            (b"(\\\\)".as_slice(), b"\\".as_slice()),
2875            (b"(\\(\\))".as_slice(), b"()".as_slice()),
2876            (b"(\\123)".as_slice(), b"S".as_slice()), // Octal 123 = 83 = 'S'
2877            (b"(\\0)".as_slice(), b"\0".as_slice()),
2878        ];
2879
2880        for (input, expected) in test_cases {
2881            let mut content = Vec::new();
2882            content.extend_from_slice(input);
2883            content.extend_from_slice(b" Tj");
2884
2885            let result = ContentParser::parse_content(&content);
2886            assert!(result.is_ok());
2887
2888            let operations = result.unwrap();
2889            if let ContentOperation::ShowText(text) = &operations[0] {
2890                assert_eq!(text, expected, "Failed for input: {:?}", input);
2891            } else {
2892                panic!("Expected ShowText operation");
2893            }
2894        }
2895    }
2896
2897    #[test]
2898    fn test_content_with_inline_images() {
2899        // Test handling of inline images in content stream
2900        let content = b"BI /W 10 /H 10 /CS /RGB ID \x00\x01\x02\x03 EI";
2901        let result = ContentParser::parse_content(content);
2902
2903        // Should handle inline images (even if not fully implemented)
2904        assert!(result.is_ok() || result.is_err());
2905    }
2906
2907    #[test]
2908    fn test_operator_with_missing_operands() {
2909        // Test operators with insufficient operands
2910        let test_cases = vec![
2911            b"Tj" as &[u8], // ShowText without string
2912            b"m",           // MoveTo without coordinates
2913            b"rg",          // SetRGBColor without values
2914            b"Tf",          // SetFont without name and size
2915        ];
2916
2917        for content in test_cases {
2918            let result = ContentParser::parse_content(content);
2919            // Should handle gracefully (error or skip)
2920            assert!(result.is_ok() || result.is_err());
2921        }
2922    }
2923
2924    // --- Tests for infinite loop fix (curly braces, stray parens, inline images) ---
2925
2926    #[test]
2927    fn test_tokenizer_handles_curly_braces() {
2928        // Curly braces { } are not valid PDF content operators but appear in
2929        // binary inline image data. The tokenizer must skip them without hanging.
2930        let input = b"q { } Q";
2931        let mut tokenizer = ContentTokenizer::new(input);
2932
2933        let mut tokens = Vec::new();
2934        while let Some(token) = tokenizer.next_token().unwrap() {
2935            tokens.push(token);
2936        }
2937
2938        // Should produce tokens for q and Q, skipping { and }
2939        assert!(tokens.contains(&Token::Operator("q".to_string())));
2940        assert!(tokens.contains(&Token::Operator("Q".to_string())));
2941    }
2942
2943    #[test]
2944    fn test_tokenizer_handles_closing_paren() {
2945        // A stray ) outside a string literal should be skipped, not cause a hang
2946        let input = b"q ) Q";
2947        let mut tokenizer = ContentTokenizer::new(input);
2948
2949        let mut tokens = Vec::new();
2950        while let Some(token) = tokenizer.next_token().unwrap() {
2951            tokens.push(token);
2952        }
2953
2954        assert!(tokens.contains(&Token::Operator("q".to_string())));
2955        assert!(tokens.contains(&Token::Operator("Q".to_string())));
2956    }
2957
2958    #[test]
2959    fn test_inline_image_binary_with_curly_braces() {
2960        // Inline image binary data containing { and } bytes must be handled
2961        // correctly — the tokenizer should capture them as raw image data
2962        let content = b"BI /W 2 /H 2 /BPC 8 /CS /G ID \x7B\x7D\x00\xFF EI Q";
2963        let result = ContentParser::parse_content(content);
2964        assert!(
2965            result.is_ok(),
2966            "Parsing inline image with curly braces failed: {:?}",
2967            result.err()
2968        );
2969
2970        let ops = result.unwrap();
2971        // Should have InlineImage + RestoreGraphicsState
2972        let has_inline = ops
2973            .iter()
2974            .any(|op| matches!(op, ContentOperation::InlineImage { .. }));
2975        let has_q = ops
2976            .iter()
2977            .any(|op| matches!(op, ContentOperation::RestoreGraphicsState));
2978        assert!(has_inline, "Expected InlineImage operation");
2979        assert!(has_q, "Expected RestoreGraphicsState after EI");
2980    }
2981
2982    #[test]
2983    fn test_inline_image_binary_with_all_byte_values() {
2984        // Inline image with bytes 0x00-0xFF to ensure no byte causes a hang
2985        let mut content = Vec::new();
2986        content.extend_from_slice(b"BI /W 16 /H 16 /BPC 8 /CS /G ID ");
2987        // Add all 256 byte values as image data
2988        for b in 0u8..=255 {
2989            content.push(b);
2990        }
2991        content.extend_from_slice(b" EI Q");
2992
2993        let result = ContentParser::parse_content(&content);
2994        assert!(
2995            result.is_ok(),
2996            "Parsing inline image with all byte values failed: {:?}",
2997            result.err()
2998        );
2999    }
3000
3001    #[test]
3002    fn test_inline_image_ei_detection() {
3003        // EI must be preceded by whitespace to be recognized as end marker
3004        // "EI" within binary data (not preceded by whitespace) should NOT end the image
3005        let content = b"BI /W 2 /H 1 /BPC 8 /CS /G ID \x45\x49\x00\n EI Q";
3006        //                                               ^E  ^I  (within data)  ^real EI
3007        let result = ContentParser::parse_content(content);
3008        assert!(result.is_ok(), "EI detection failed: {:?}", result.err());
3009
3010        let ops = result.unwrap();
3011        let has_inline = ops
3012            .iter()
3013            .any(|op| matches!(op, ContentOperation::InlineImage { .. }));
3014        assert!(has_inline, "Expected InlineImage operation");
3015    }
3016
3017    #[test]
3018    fn test_tokenizer_no_infinite_loop_on_consecutive_delimiters() {
3019        // Multiple consecutive unhandled delimiters must not cause a hang
3020        let input = b"q {{{}}})))) Q";
3021        let mut tokenizer = ContentTokenizer::new(input);
3022
3023        let mut tokens = Vec::new();
3024        while let Some(token) = tokenizer.next_token().unwrap() {
3025            tokens.push(token);
3026            if tokens.len() > 100 {
3027                panic!("Tokenizer produced too many tokens — possible infinite loop");
3028            }
3029        }
3030
3031        assert!(tokens.contains(&Token::Operator("q".to_string())));
3032        assert!(tokens.contains(&Token::Operator("Q".to_string())));
3033    }
3034
3035    #[test]
3036    fn test_content_parser_inline_image_produces_correct_operation() {
3037        // Full parse of a simple inline image should produce correct params
3038        let content = b"BI /W 4 /H 4 /BPC 8 /CS /G ID \x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F EI";
3039        let result = ContentParser::parse_content(content);
3040        assert!(result.is_ok(), "Parse failed: {:?}", result.err());
3041
3042        let ops = result.unwrap();
3043        assert_eq!(
3044            ops.len(),
3045            1,
3046            "Expected exactly 1 operation, got {}",
3047            ops.len()
3048        );
3049
3050        if let ContentOperation::InlineImage { params, data } = &ops[0] {
3051            assert_eq!(params.get("Width"), Some(&Object::Integer(4)));
3052            assert_eq!(params.get("Height"), Some(&Object::Integer(4)));
3053            assert_eq!(params.get("BitsPerComponent"), Some(&Object::Integer(8)));
3054            assert!(!data.is_empty(), "Image data should not be empty");
3055        } else {
3056            panic!("Expected InlineImage operation, got {:?}", ops[0]);
3057        }
3058    }
3059
3060    #[test]
3061    fn test_octal_escape_overflow_777() {
3062        // \777 = octal 777 = 511 decimal, overflows u8.
3063        // Per ISO 32000-1:2008 §7.3.4.2: "high-order overflow shall be ignored"
3064        // 511 as u8 = 255 (0x1FF truncated to 0xFF)
3065        let mut tokenizer = ContentTokenizer::new(b"(\\777)");
3066        let token = tokenizer.next_token().unwrap().unwrap();
3067        match token {
3068            Token::String(data) => assert_eq!(data, vec![0xFF]),
3069            _ => panic!("Expected string token"),
3070        }
3071    }
3072
3073    #[test]
3074    fn test_octal_escape_overflow_400() {
3075        // \400 = octal 400 = 256 decimal, just overflows u8.
3076        // 256 as u8 = 0
3077        let mut tokenizer = ContentTokenizer::new(b"(\\400)");
3078        let token = tokenizer.next_token().unwrap().unwrap();
3079        match token {
3080            Token::String(data) => assert_eq!(data, vec![0x00]),
3081            _ => panic!("Expected string token"),
3082        }
3083    }
3084
3085    #[test]
3086    fn test_octal_escape_overflow_577() {
3087        // \577 = octal 577 = 383 decimal.
3088        // 383 as u8 = 127 (0x17F truncated to 0x7F)
3089        let mut tokenizer = ContentTokenizer::new(b"(\\577)");
3090        let token = tokenizer.next_token().unwrap().unwrap();
3091        match token {
3092            Token::String(data) => assert_eq!(data, vec![0x7F]),
3093            _ => panic!("Expected string token"),
3094        }
3095    }
3096
3097    #[test]
3098    fn test_octal_escape_max_valid_377() {
3099        // \377 = 255, max valid octal for u8 - should still work correctly
3100        let mut tokenizer = ContentTokenizer::new(b"(\\377)");
3101        let token = tokenizer.next_token().unwrap().unwrap();
3102        match token {
3103            Token::String(data) => assert_eq!(data, vec![0xFF]),
3104            _ => panic!("Expected string token"),
3105        }
3106    }
3107
3108    #[test]
3109    fn test_octal_escape_overflow_mixed_with_valid() {
3110        // Mix of overflow octal and normal text
3111        let mut tokenizer = ContentTokenizer::new(b"(A\\777B\\101C)");
3112        let token = tokenizer.next_token().unwrap().unwrap();
3113        match token {
3114            Token::String(data) => {
3115                assert_eq!(data, vec![b'A', 0xFF, b'B', b'A', b'C']);
3116            }
3117            _ => panic!("Expected string token"),
3118        }
3119    }
3120}