Skip to main content

hayro_syntax/content/
mod.rs

1/*!
2PDF content operators.
3
4This module provides facilities to read and interpret PDF content streams using
5high-level types.
6
7```
8use hayro_syntax::object::Number;
9use hayro_syntax::content::*;
10use hayro_syntax::content::ops::*;
11
12let content_stream = b"1 0 0 -1 0 200 cm
130 1.0 0 rg
140 0 m
15200 0 l
16200 200 l
170 200 l
18h
19f";
20
21let mut iter = TypedIter::new(content_stream);
22assert!(matches!(iter.next(), Some(TypedInstruction::Transform(_))));
23assert!(matches!(iter.next(), Some(TypedInstruction::NonStrokeColorDeviceRgb(_))));
24assert!(matches!(iter.next(), Some(TypedInstruction::MoveTo(_))));
25assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
26assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
27assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
28assert!(matches!(iter.next(), Some(TypedInstruction::ClosePath(_))));
29assert!(matches!(iter.next(), Some(TypedInstruction::FillPathNonZero(_))));
30```
31*/
32
33#[allow(missing_docs)]
34pub mod ops;
35
36use crate::content::ops::TypedInstruction;
37use crate::object;
38use crate::object::dict::InlineImageDict;
39use crate::object::name::{Name, skip_name_like};
40use crate::object::{Array, Null, Number, Object, Stream};
41use crate::reader::Reader;
42use crate::reader::{Readable, ReaderContext, ReaderExt, Skippable};
43use crate::trivia::is_white_space_character;
44use crate::util::find_needle;
45use core::array;
46use core::fmt::{Debug, Formatter};
47use core::ops::Deref;
48use smallvec::SmallVec;
49
// Maximum number of operands a single operator can carry. This is the fixed
// capacity of `Stack`; `Stack::push` returns `None` once it has been reached.
//
// 6 operands are used for example for ctm or cubic curves,
// but anything above should be pretty rare (only for example for
// DeviceN color spaces, or invalid PDF files). So we settle on 10.
const OPERANDS_THRESHOLD: usize = 10;
54
55impl Debug for Operator<'_> {
56    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
57        write!(f, "{}", self.0.as_str())
58    }
59}
60
/// A content stream operator.
///
/// This is a thin wrapper around the [`Name`] built from the operator's bytes.
/// It dereferences to those bytes, and its `Debug` output is the string form
/// of the name.
#[derive(Clone, PartialEq)]
pub struct Operator<'a>(Name<'a>);
64
65impl Deref for Operator<'_> {
66    type Target = [u8];
67
68    fn deref(&self) -> &Self::Target {
69        self.0.as_ref()
70    }
71}
72
73impl Skippable for Operator<'_> {
74    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
75        skip_name_like(r, false).map(|_| ())
76    }
77}
78
79impl<'a> Readable<'a> for Operator<'a> {
80    fn read(r: &mut Reader<'a>, _: &ReaderContext<'a>) -> Option<Self> {
81        let start = r.offset();
82        skip_name_like(r, false)?;
83        let end = r.offset();
84        let data = r.range(start..end)?;
85
86        if data.is_empty() {
87            return None;
88        }
89
90        Some(Self(Name::new(data)?))
91    }
92}
93
/// An iterator over operators in the PDF content streams, providing raw access to the instructions.
#[derive(Clone)]
pub struct UntypedIter<'a> {
    // Cursor over the content stream bytes.
    reader: Reader<'a>,
    // Operands collected for the instruction currently being assembled; it is
    // cleared at the start of every `next` call.
    stack: Stack<'a>,
    // The operator of the most recently returned instruction. It is stored here
    // so that the returned `Instruction` can borrow it.
    operator: Option<Operator<'a>>,
}
101
impl<'a> UntypedIter<'a> {
    /// Create a new untyped iterator.
    pub fn new(data: &'a [u8]) -> Self {
        Self {
            reader: Reader::new(data),
            stack: Stack::new(),
            operator: None,
        }
    }

    /// Create a new empty untyped iterator.
    pub fn empty() -> Self {
        Self {
            reader: Reader::new(&[]),
            stack: Stack::new(),
            operator: None,
        }
    }

    /// Return the next instruction.
    ///
    /// Operands are read and pushed onto the internal stack until an operator is
    /// found; the returned [`Instruction`] borrows both the stack and the
    /// operator from `self`.
    ///
    /// Returns `None` when the end of the data is reached, and also when the
    /// stream cannot be parsed any further (for example if an operand can
    /// neither be read as an object nor as an operator, or if more than
    /// `OPERANDS_THRESHOLD` operands precede an operator).
    ///
    /// For the `BI` operator the inline image is consumed up to and including
    /// its `EI` marker, and the image is pushed as an `Object::Stream` operand
    /// of the returned `BI` instruction.
    // The returned item borrows from `self`, which the `Iterator` trait cannot
    // express, hence the inherent method.
    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> Option<Instruction<'_, 'a>> {
        // Start from a clean slate: operands of the previous instruction are discarded.
        self.stack.clear();
        self.operator = None;

        self.reader.skip_white_spaces_and_comments();

        while !self.reader.at_end() {
            // I believe booleans/null never appear as an operator?
            if matches!(
                self.reader.peek_byte()?,
                b'/' | b'.' | b'+' | b'-' | b'0'..=b'9' | b'[' | b'<' | b'('
            ) {
                // See issue 994. In all sane scenarios, if the next byte is a number
                // it has to be an operand (a number). However, it's possible that
                // the number is followed by a regular character, in which case it
                // should behave more like an operator (even though there exists
                // no operator that starts with a number). In order to preserve
                // similar behavior to Acrobat and Chromium, we try to consume
                // such an operator and then simply skip it.
                if let Some(object) = self.reader.read_without_context::<Object<'_>>() {
                    // `push` fails once the stack is full, which ends iteration.
                    self.stack.push(object)?;
                } else if self.reader.read_without_context::<Operator<'_>>().is_some() {
                    // The bogus "operator" and any operands gathered for it are dropped.
                    self.stack.clear();
                } else {
                    return None;
                }
            } else {
                let operator = match self.reader.read_without_context::<Operator<'_>>() {
                    Some(o) => o,
                    None => {
                        warn!("failed to read operator in content stream");

                        // Make sure subsequent calls also return `None`.
                        self.reader.jump_to_end();
                        return None;
                    }
                };

                // Inline images need special casing...
                if operator.as_ref() == b"BI" {
                    // The ID operator will already be consumed by this.
                    let inline_dict = self.reader.read_without_context::<InlineImageDict<'_>>()?;
                    let dict = inline_dict.get_dict().clone();

                    // One whitespace after "ID".
                    self.reader.read_white_space()?;

                    // Remember where the image data starts so that it can be sliced
                    // out once the terminating "EI" has been located.
                    let stream_data = self.reader.tail()?;
                    let start_offset = self.reader.offset();

                    // Each iteration advances to the next "EI" candidate and decides
                    // whether it really terminates the image data.
                    'outer: while let Some(pos) = find_needle(self.reader.tail()?, b"EI") {
                        self.reader.read_bytes(pos)?;

                        if self.reader.peek_bytes(2) == Some(b"EI") {
                            // If the following character is not a whitespace character, then we are in an ASCII-85 stream.
                            if self
                                .reader
                                .peek_bytes(3)
                                .is_some_and(|b| !is_white_space_character(b[2]))
                            {
                                // Skip "EI" plus the byte after it and keep searching.
                                self.reader.read_bytes(3)?;

                                continue;
                            }

                            let end_offset = self.reader.offset() - start_offset;
                            let image_data = &stream_data[..end_offset];

                            let stream = Stream::new(image_data, dict.clone());

                            // Note that there is a possibility that the encoded stream data
                            // contains the "EI" operator as part of the data, in which case we
                            // cannot confidently know whether we have hit the actual end of the
                            // stream. See also <https://github.com/pdf-association/pdf-issues/issues/543>
                            // PDF 2.0 does have a `/Length` attribute we can read, but since it's relatively
                            // new we don't bother trying to read it.
                            // `tail` is everything after the candidate "EI"; `find_reader`
                            // scans it for a later "EI" or "BI".
                            let tail = &self.reader.tail()?[2..];
                            let mut find_reader = Reader::new(tail);

                            while !find_reader.at_end() {
                                let remaining = find_reader.tail()?;
                                let next_ei = find_needle(remaining, b"EI");
                                let next_bi = find_needle(remaining, b"BI");

                                // Pick whichever marker comes first ("EI" wins ties).
                                let (next_pos, is_ei) = match (next_ei, next_bi) {
                                    (Some(ei), Some(bi)) if ei <= bi => (ei, true),
                                    (Some(_), Some(bi)) => (bi, false),
                                    (Some(ei), None) => (ei, true),
                                    (None, Some(bi)) => (bi, false),
                                    (None, None) => break,
                                };

                                find_reader.read_bytes(next_pos)?;

                                if is_ei {
                                    let analyze_data = &tail[..find_reader.offset()];

                                    // If there is any binary data in-between, we for sure
                                    // have not reached the end.
                                    if analyze_data.iter().any(|c| !c.is_ascii()) {
                                        // Step over the rejected "EI" and try the next candidate.
                                        self.reader.read_bytes(2)?;
                                        continue 'outer;
                                    }

                                    // Otherwise, the only possibility that we reached an
                                    // "EI", even though the previous one was valid, is
                                    // that it's part of a string in the content
                                    // stream that follows the inline image. Therefore,
                                    // it should be valid to interpret `tail` as a content
                                    // stream and there should be at least one text-related
                                    // operator that can be parsed correctly.

                                    let mut iter = TypedIter::new(tail);
                                    let mut found = false;
                                    let mut counter = 0;

                                    while let Some(op) = iter.next() {
                                        // If we have read more than 20 valid operators, it should be
                                        // safe to assume that we are in a content stream, so abort
                                        // early. The only situation where this could reasonably
                                        // be violated is if we have 20 subsequent instances of
                                        // q/Q in the image data, which seems very unlikely.
                                        if counter >= 20 {
                                            found = true;
                                            break;
                                        }

                                        if matches!(
                                            op,
                                            TypedInstruction::NextLineAndShowText(_)
                                                | TypedInstruction::ShowText(_)
                                                | TypedInstruction::ShowTexts(_)
                                                | TypedInstruction::ShowTextWithParameters(_)
                                        ) {
                                            // Now it should be safe to assume that the
                                            // previous `EI` was the correct one.
                                            found = true;
                                            break;
                                        }

                                        counter += 1;
                                    }

                                    if !found {
                                        // Seems like the data in-between is not a valid content
                                        // stream, so we are likely still within the image data.
                                        self.reader.read_bytes(2)?;
                                        continue 'outer;
                                    }
                                } else {
                                    // Possibly another inline image, if so, the previously found "EI"
                                    // is indeed the end of data.
                                    let mut cloned = find_reader.clone();
                                    cloned.read_bytes(2)?;
                                    if cloned
                                        .read_without_context::<InlineImageDict<'_>>()
                                        .is_some()
                                    {
                                        break;
                                    }
                                }

                                // Move past the first byte of the marker just examined so
                                // that the next search makes progress.
                                find_reader.read_byte()?;
                            }

                            // The candidate was accepted: hand the image over as an operand.
                            self.stack.push(Object::Stream(stream))?;

                            // Consume the terminating "EI" itself.
                            self.reader.read_bytes(2)?;
                            self.reader.skip_white_spaces();

                            break;
                        }
                    }
                }

                // Store the operator in `self` so that the instruction can borrow it.
                self.operator = Some(operator);
                return Some(Instruction {
                    operands: &self.stack,
                    operator: self.operator.as_ref().unwrap(),
                });
            }

            self.reader.skip_white_spaces_and_comments();
        }

        None
    }
}
310
/// An iterator over PDF content streams that provide access to the instructions
/// in a typed fashion.
#[derive(Clone)]
pub struct TypedIter<'a> {
    // The raw iterator whose instructions are converted into typed ones.
    untyped: UntypedIter<'a>,
}
317
318impl<'a> TypedIter<'a> {
319    /// Create a new typed iterator.
320    pub fn new(data: &'a [u8]) -> Self {
321        Self {
322            untyped: UntypedIter::new(data),
323        }
324    }
325
326    pub(crate) fn from_untyped(untyped: UntypedIter<'a>) -> Self {
327        Self { untyped }
328    }
329
330    /// Return the next typed instruction.
331    #[allow(clippy::should_implement_trait)]
332    pub fn next(&mut self) -> Option<TypedInstruction<'_, 'a>> {
333        let op = self.untyped.next()?;
334        // TODO: Explore whether dispatching can be made more efficient.
335        match TypedInstruction::dispatch(&op) {
336            Some(op) => Some(op),
337            // In case this returns `None`, the content stream is invalid. In case a path-drawing
338            // operator was used, let's abort completely, otherwise we might end up drawing random stuff.
339            // However, for other operators it could be worth it to just skip it but keep attempting
340            // to read other content operators.
341            None => {
342                if [
343                    &b"m"[..],
344                    &b"l"[..],
345                    &b"c"[..],
346                    &b"v"[..],
347                    &b"y"[..],
348                    &b"h"[..],
349                    &b"re"[..],
350                ]
351                .contains(&op.operator.0.deref())
352                {
353                    None
354                } else {
355                    Some(TypedInstruction::Fallback(op.operator))
356                }
357            }
358        }
359    }
360}
361
/// An instruction (= operator and its operands) in a content stream.
///
/// Both fields are borrowed (lifetime `'b`) rather than owned; `UntypedIter::next`
/// hands out references into the iterator itself.
pub struct Instruction<'b, 'a> {
    /// The stack containing the operands.
    pub operands: &'b Stack<'a>,
    /// The actual operator.
    pub operator: &'b Operator<'a>,
}
369
370impl<'b, 'a> Instruction<'b, 'a> {
371    /// An iterator over the operands of the instruction.
372    pub fn operands(&self) -> OperandIterator<'b, 'a> {
373        OperandIterator::new(self.operands)
374    }
375}
376
/// A stack holding the arguments of an operator.
///
/// The storage is a fixed-size array of `OPERANDS_THRESHOLD` slots, so pushing
/// beyond that capacity fails instead of growing.
pub struct Stack<'a> {
    // TODO: Explore using an object pool to avoid repeatedly
    // allocating/deallocating objects.
    // Backing storage. Only the first `len` slots are live; the remaining ones
    // hold `Object::Null` or leftovers from before the last `clear`.
    data: [Object<'a>; OPERANDS_THRESHOLD],
    // Number of live operands at the front of `data`.
    len: usize,
}
384
385impl<'a> Default for Stack<'a> {
386    fn default() -> Self {
387        Self::new()
388    }
389}
390
391impl<'a> Stack<'a> {
392    /// Create a new, empty stack.
393    pub fn new() -> Self {
394        Self {
395            data: array::from_fn(|_| Object::Null(Null)),
396            len: 0,
397        }
398    }
399
400    fn push(&mut self, operand: Object<'a>) -> Option<()> {
401        if self.len >= OPERANDS_THRESHOLD {
402            return None;
403        }
404
405        self.data[self.len] = operand;
406        self.len += 1;
407        Some(())
408    }
409
410    fn clear(&mut self) {
411        self.len = 0;
412    }
413
414    fn len(&self) -> usize {
415        self.len
416    }
417
418    fn as_slice(&self) -> &[Object<'a>] {
419        &self.data[..self.len]
420    }
421
422    fn get<'b, T>(&'b self, index: usize) -> Option<T>
423    where
424        T: Operand<'b, 'a>,
425    {
426        self.as_slice().get(index).and_then(T::from_object)
427    }
428
429    fn get_all<'b, T>(&'b self) -> Option<SmallVec<[T; OPERANDS_THRESHOLD]>>
430    where
431        T: Operand<'b, 'a>,
432    {
433        let mut operands = SmallVec::new();
434
435        for op in self.as_slice() {
436            let converted = T::from_object(op)?;
437            operands.push(converted);
438        }
439
440        Some(operands)
441    }
442}
443
444impl Debug for Stack<'_> {
445    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
446        f.debug_list().entries(self.as_slice()).finish()
447    }
448}
449
450impl Clone for Stack<'_> {
451    fn clone(&self) -> Self {
452        let mut stack = Self::new();
453        for item in self.as_slice() {
454            stack.push(item.clone()).unwrap();
455        }
456        stack
457    }
458}
459
460impl PartialEq for Stack<'_> {
461    fn eq(&self, other: &Self) -> bool {
462        self.as_slice() == other.as_slice()
463    }
464}
465
/// A type that can be extracted from a single operand on the [`Stack`].
///
/// `'b` is the lifetime of the borrow of the stack slot, `'a` the lifetime of
/// the object itself.
trait Operand<'b, 'a>: Sized {
    /// Converts `object` into `Self`, or returns `None` if the object does not
    /// have the expected type.
    fn from_object(object: &'b Object<'a>) -> Option<Self>;
}
469
470impl<'b, 'a> Operand<'b, 'a> for Number {
471    fn from_object(object: &'b Object<'a>) -> Option<Self> {
472        match object {
473            Object::Number(n) => Some(*n),
474            _ => None,
475        }
476    }
477}
478
479impl<'b, 'a> Operand<'b, 'a> for &'b object::String<'a> {
480    fn from_object(object: &'b Object<'a>) -> Option<Self> {
481        match object {
482            Object::String(s) => Some(s),
483            _ => None,
484        }
485    }
486}
487
488impl<'b, 'a> Operand<'b, 'a> for &'b Name<'a> {
489    fn from_object(object: &'b Object<'a>) -> Option<Self> {
490        match object {
491            Object::Name(n) => Some(n),
492            _ => None,
493        }
494    }
495}
496
497impl<'b, 'a> Operand<'b, 'a> for &'b Array<'a> {
498    fn from_object(object: &'b Object<'a>) -> Option<Self> {
499        match object {
500            Object::Array(a) => Some(a),
501            _ => None,
502        }
503    }
504}
505
506impl<'b, 'a> Operand<'b, 'a> for &'b Stream<'a> {
507    fn from_object(object: &'b Object<'a>) -> Option<Self> {
508        match object {
509            Object::Stream(s) => Some(s),
510            _ => None,
511        }
512    }
513}
514
impl<'b, 'a> Operand<'b, 'a> for &'b Object<'a> {
    /// Accepts any object unchanged, so this conversion never fails.
    fn from_object(object: &'b Object<'a>) -> Option<Self> {
        Some(object)
    }
}
520
/// An iterator over the operands of an operator.
pub struct OperandIterator<'b, 'a> {
    // The stack whose live operands are yielded.
    stack: &'b Stack<'a>,
    // Index of the operand that the next call to `next` yields.
    cur_index: usize,
}
526
527impl<'b, 'a> OperandIterator<'b, 'a> {
528    fn new(stack: &'b Stack<'a>) -> Self {
529        Self {
530            stack,
531            cur_index: 0,
532        }
533    }
534}
535
536impl<'b, 'a> Iterator for OperandIterator<'b, 'a> {
537    type Item = &'b Object<'a>;
538
539    fn next(&mut self) -> Option<Self::Item> {
540        if let Some(item) = self.stack.as_slice().get(self.cur_index) {
541            self.cur_index += 1;
542
543            Some(item)
544        } else {
545            None
546        }
547    }
548}
549
/// A typed operator that can be built from the operands of one instruction.
///
/// Implementations are generated by the `op_impl!` macro in this file.
pub(crate) trait OperatorTrait<'b, 'a>: Sized {
    /// The textual name of the operator; it is included in the warning that is
    /// logged when the operands cannot be converted.
    const OPERATOR: &'static str;

    /// Attempts to build the operator from the operand stack, returning `None`
    /// if the operands cannot be converted.
    fn from_stack(stack: &'b Stack<'a>) -> Option<Self>;
}
555
mod macros {
    // Implements `OperatorTrait` for the operator type `$t`, together with the
    // `From<$t> for TypedInstruction` and `TryFrom<TypedInstruction> for $t`
    // conversions.
    //
    // Arguments:
    // - `$t` (plus optional lifetimes): the operator type, which must also be the
    //   name of the corresponding `TypedInstruction` variant.
    // - `$e`: the textual operator name, stored in `OperatorTrait::OPERATOR`.
    // - `$n`: the number of operands. It is accepted so that call sites document
    //   the arity, but it is not used in the expansion.
    // - `|stack: ty| { body }`: the conversion from the operand stack to
    //   `Option<Self>`. It is written like a closure, and it is run as one.
    macro_rules! op_impl {
        ($t:ident $(<$($l:lifetime),+>)?, $e:expr, $n:expr, |$stack:ident : $stack_ty:ty| $body:block) => {
            impl<'b, 'a> OperatorTrait<'b, 'a> for $t$(<$($l),+>)? {
                const OPERATOR: &'static str = $e;

                // The closure exists only to scope `?`/`return` inside `$body`.
                #[allow(clippy::redundant_closure_call)]
                #[inline(always)]
                fn from_stack($stack: $stack_ty) -> Option<Self> {
                    // Run the body inside a closure, so that a `?` (or `return`) in
                    // `$body` only leaves the closure. If the body were expanded
                    // directly into this function, `?` would return from
                    // `from_stack` itself and the warning below would be skipped.
                    let convert = move || -> Option<Self> { $body };
                    let converted = convert();

                    if converted.is_none() {
                        warn!("failed to convert operands for operator {}", Self::OPERATOR);
                    }

                    converted
                }
            }

            impl<'b, 'a> From<$t$(<$($l),+>)?> for TypedInstruction<'b, 'a> {
                fn from(value: $t$(<$($l),+>)?) -> Self {
                    TypedInstruction::$t(value)
                }
            }

            impl<'b, 'a> TryFrom<TypedInstruction<'b, 'a>> for $t$(<$($l),+>)? {
                type Error = ();

                fn try_from(value: TypedInstruction<'b, 'a>) -> core::result::Result<Self, Self::Error> {
                    match value {
                        TypedInstruction::$t(e) => Ok(e),
                        _ => Err(())
                    }
                }
            }
        };
    }

    // The `shift` offset computed in the macros below will always be 0 in valid
    // PDFs. Its purpose is that in case there are garbage operands in the content
    // stream, we prefer to use the operands that are closer to the operator
    // instead of the values at the bottom of the stack.

    // Operator without operands.
    macro_rules! op0 {
        ($t:ident $(<$($l:lifetime),+>)?, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$($l),+>)?, $e, 0, |_stack: &'b Stack<'a>| {
                Some(Self)
            });
        }
    }

    // Operator with one operand.
    macro_rules! op1 {
        ($t:ident $(<$($l:lifetime),+>)?, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$($l),+>)?, $e, 1, |stack: &'b Stack<'a>| {
                let shift = stack.len().saturating_sub(1);
                Some(Self(stack.get(shift)?))
            });
        }
    }

    // Operator that takes all operands on the stack, converted to a single type.
    macro_rules! op_all {
        ($t:ident $(<$($l:lifetime),+>)?, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$($l),+>)?, $e, u8::MAX as usize, |stack: &'b Stack<'a>| {
                Some(Self(stack.get_all()?))
            });
        }
    }

    // Operator with two operands.
    macro_rules! op2 {
        ($t:ident $(<$($l:lifetime),+>)?, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$($l),+>)?, $e, 2, |stack: &'b Stack<'a>| {
                let shift = stack.len().saturating_sub(2);
                Some(Self(stack.get(shift)?, stack.get(1 + shift)?))
            });
        }
    }

    // Operator with three operands.
    macro_rules! op3 {
        ($t:ident $(<$($l:lifetime),+>)?, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$($l),+>)?, $e, 3, |stack: &'b Stack<'a>| {
                let shift = stack.len().saturating_sub(3);
                Some(Self(
                    stack.get(shift)?,
                    stack.get(1 + shift)?,
                    stack.get(2 + shift)?,
                ))
            });
        }
    }

    // Operator with four operands.
    macro_rules! op4 {
        ($t:ident $(<$($l:lifetime),+>)?, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$($l),+>)?, $e, 4, |stack: &'b Stack<'a>| {
                let shift = stack.len().saturating_sub(4);
                Some(Self(
                    stack.get(shift)?,
                    stack.get(1 + shift)?,
                    stack.get(2 + shift)?,
                    stack.get(3 + shift)?,
                ))
            });
        }
    }

    // Operator with six operands.
    macro_rules! op6 {
        ($t:ident $(<$($l:lifetime),+>)?, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$($l),+>)?, $e, 6, |stack: &'b Stack<'a>| {
                let shift = stack.len().saturating_sub(6);
                Some(Self(
                    stack.get(shift)?,
                    stack.get(1 + shift)?,
                    stack.get(2 + shift)?,
                    stack.get(3 + shift)?,
                    stack.get(4 + shift)?,
                    stack.get(5 + shift)?,
                ))
            });
        }
    }

    pub(crate) use op_all;
    pub(crate) use op_impl;
    pub(crate) use op0;
    pub(crate) use op1;
    pub(crate) use op2;
    pub(crate) use op3;
    pub(crate) use op4;
    pub(crate) use op6;
}