hayro_syntax/content/
mod.rs

1/*!
2PDF content operators.
3
4This module provides facilities to read and interpret PDF content streams using
5high-level types.
6
7```
8use hayro_syntax::object::Number;
9use hayro_syntax::content::*;
10use hayro_syntax::content::ops::*;
11
12let content_stream = b"1 0 0 -1 0 200 cm
130 1.0 0 rg
140 0 m
15200 0 l
16200 200 l
170 200 l
18h
19f";
20
21let mut iter = TypedIter::new(content_stream);
22assert!(matches!(iter.next(), Some(TypedInstruction::Transform(_))));
23assert!(matches!(iter.next(), Some(TypedInstruction::NonStrokeColorDeviceRgb(_))));
24assert!(matches!(iter.next(), Some(TypedInstruction::MoveTo(_))));
25assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
26assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
27assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
28assert!(matches!(iter.next(), Some(TypedInstruction::ClosePath(_))));
29assert!(matches!(iter.next(), Some(TypedInstruction::FillPathNonZero(_))));
30```
31*/
32
33#[allow(missing_docs)]
34pub mod ops;
35
36use crate::content::ops::TypedInstruction;
37use crate::object::Stream;
38use crate::object::dict::InlineImageDict;
39use crate::object::name::{Name, skip_name_like};
40use crate::object::{Object, ObjectLike};
41use crate::reader::{Readable, Reader, ReaderContext, Skippable};
42use log::warn;
43use smallvec::SmallVec;
44use std::fmt::{Debug, Formatter};
45use std::ops::Deref;
46
// Six operands are enough for the common operators that take the most
// arguments (for example setting the CTM or appending a cubic curve). Anything
// above that should be rare (for example colors in DeviceN color spaces).
50const OPERANDS_THRESHOLD: usize = 6;
51
52impl Debug for Operator<'_> {
53    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
54        write!(f, "{}", self.0.as_str())
55    }
56}
57
58/// A content stream operator.
59#[derive(Clone, PartialEq)]
60pub struct Operator<'a>(Name<'a>);
61
62impl Deref for Operator<'_> {
63    type Target = [u8];
64
65    fn deref(&self) -> &Self::Target {
66        self.0.as_ref()
67    }
68}
69
70impl Skippable for Operator<'_> {
71    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
72        skip_name_like(r, false).map(|_| ())
73    }
74}
75
76impl<'a> Readable<'a> for Operator<'a> {
77    fn read(r: &mut Reader<'a>, _: &ReaderContext) -> Option<Self> {
78        let data = {
79            let start = r.offset();
80            skip_name_like(r, false)?;
81            let end = r.offset();
82            let data = r.range(start..end).unwrap();
83
84            if data.is_empty() {
85                return None;
86            }
87
88            data
89        };
90
91        Some(Operator(Name::from_unescaped(data)))
92    }
93}
94
95/// An iterator over operators in the PDF content streams, providing raw access to the instructions.
96#[derive(Clone)]
97pub struct UntypedIter<'a> {
98    reader: Reader<'a>,
99    stack: Stack<'a>,
100}
101
102impl<'a> UntypedIter<'a> {
103    /// Create a new untyped iterator.
104    pub fn new(data: &'a [u8]) -> UntypedIter<'a> {
105        Self {
106            reader: Reader::new(data),
107            stack: Stack::new(),
108        }
109    }
110
111    /// Create a new empty untyped iterator.
112    pub fn empty() -> UntypedIter<'a> {
113        Self {
114            reader: Reader::new(&[]),
115            stack: Stack::new(),
116        }
117    }
118}
119
120impl<'a> Iterator for UntypedIter<'a> {
121    type Item = Instruction<'a>;
122
123    fn next(&mut self) -> Option<Self::Item> {
124        self.stack.clear();
125
126        self.reader.skip_white_spaces_and_comments();
127
128        while !self.reader.at_end() {
129            // I believe booleans/null never appear as an operator?
130            if matches!(
131                self.reader.peek_byte()?,
132                b'/' | b'.' | b'+' | b'-' | b'0'..=b'9' | b'[' | b'<' | b'('
133            ) {
134                self.stack
135                    .push(self.reader.read_without_context::<Object>()?);
136            } else {
137                let operator = match self.reader.read_without_context::<Operator>() {
138                    Some(o) => o,
139                    None => {
140                        warn!("failed to read operator in content stream");
141
142                        self.reader.jump_to_end();
143                        return None;
144                    }
145                };
146
147                // Inline images need special casing...
148                if operator.as_ref() == b"BI" {
149                    // The ID operator will already be consumed by this.
150                    let inline_dict = self.reader.read_without_context::<InlineImageDict>()?;
151                    let dict = inline_dict.get_dict().clone();
152
153                    // One whitespace after "ID".
154                    self.reader.read_white_space()?;
155
156                    let stream_data = self.reader.tail()?;
157                    let start_offset = self.reader.offset();
158
159                    'outer: while let Some(bytes) = self.reader.peek_bytes(2) {
160                        if bytes == b"EI" {
161                            let end_offset = self.reader.offset() - start_offset;
162                            let image_data = &stream_data[..end_offset];
163
164                            let stream = Stream::from_raw(image_data, dict.clone());
165
166                            // Note that there is a possibility that the encoded stream data
167                            // contains the "EI" operator as part of the data, in which case we
168                            // cannot confidently know whether we have hit the actual end of the
169                            // stream. See also <https://github.com/pdf-association/pdf-issues/issues/543>
170                            // PDF 2.0 does have a `/Length` attribute we can read, but since it's relatively
171                            // new we don't bother trying to read it.
172                            let tail = &self.reader.tail()?[2..];
173                            let mut find_reader = Reader::new(tail);
174
175                            while let Some(bytes) = find_reader.peek_bytes(2) {
176                                if bytes == b"EI" {
177                                    let analyze_data = &tail;
178
179                                    // If there is any binary data in-between, we for sure
180                                    // have not reached the end.
181                                    if analyze_data.iter().any(|c| !c.is_ascii()) {
182                                        self.reader.read_bytes(2)?;
183                                        continue 'outer;
184                                    }
185
186                                    // Otherwise, the only possibility that we reached an
187                                    // "EI", even though the previous one was valid, is
188                                    // that it's part of a string in the content
189                                    // stream that follows the inline image. Therefore,
190                                    // it should be valid to interpret `tail` as a content
191                                    // stream and there should be at least one text-related
192                                    // operator that can be parsed correctly.
193
194                                    let iter = TypedIter::new(tail);
195                                    let mut found = false;
196
197                                    for (counter, op) in iter.enumerate() {
198                                        // If we have read more than 20 valid operators, it should be
199                                        // safe to assume that we are in a content stream, so abort
200                                        // early. The only situation where this could reasonably
201                                        // be violated is if we have 20 subsequent instances of
202                                        // q/Q in the image data, which seems very unlikely.
203                                        if counter >= 20 {
204                                            found = true;
205                                            break;
206                                        }
207
208                                        if matches!(
209                                            op,
210                                            TypedInstruction::NextLineAndShowText(_)
211                                                | TypedInstruction::ShowText(_)
212                                                | TypedInstruction::ShowTexts(_)
213                                                | TypedInstruction::ShowTextWithParameters(_)
214                                        ) {
215                                            // Now it should be safe to assume that the
216                                            // previous `EI` was the correct one.
217                                            found = true;
218                                            break;
219                                        }
220                                    }
221
222                                    if !found {
223                                        // Seems like the data in-between is not a valid content
224                                        // stream, so we are likely still within the image data.
225                                        self.reader.read_bytes(2)?;
226                                        continue 'outer;
227                                    }
228                                } else if bytes == b"BI" {
229                                    // Possibly another inline image, if so, the previously found "EI"
230                                    // is indeed the end of data.
231                                    let mut cloned = find_reader.clone();
232                                    cloned.read_bytes(2)?;
233                                    if cloned.read_without_context::<InlineImageDict>().is_some() {
234                                        break;
235                                    }
236                                }
237
238                                find_reader.read_byte()?;
239                            }
240
241                            self.stack.push(Object::Stream(stream));
242
243                            self.reader.read_bytes(2)?;
244                            self.reader.skip_white_spaces();
245
246                            break;
247                        } else {
248                            self.reader.read_byte()?;
249                        }
250                    }
251                }
252
253                return Some(Instruction {
254                    operands: self.stack.clone(),
255                    operator,
256                });
257            }
258
259            self.reader.skip_white_spaces_and_comments();
260        }
261
262        None
263    }
264}
265
266/// An iterator over PDF content streams that provide access to the instructions
267/// in a typed fashion.
268#[derive(Clone)]
269pub struct TypedIter<'a> {
270    untyped: UntypedIter<'a>,
271}
272
273impl<'a> TypedIter<'a> {
274    /// Create a new typed iterator.
275    pub fn new(data: &'a [u8]) -> TypedIter<'a> {
276        Self {
277            untyped: UntypedIter::new(data),
278        }
279    }
280
281    pub(crate) fn from_untyped(untyped: UntypedIter<'a>) -> TypedIter<'a> {
282        Self { untyped }
283    }
284}
285
286impl<'a> Iterator for TypedIter<'a> {
287    type Item = TypedInstruction<'a>;
288
289    fn next(&mut self) -> Option<Self::Item> {
290        self.untyped
291            .next()
292            .and_then(|op| TypedInstruction::dispatch(&op))
293    }
294}
295
296/// An instruction (= operator and its operands) in a content stream.
297pub struct Instruction<'a> {
298    /// The stack containing the operands.
299    pub operands: Stack<'a>,
300    /// The actual operator.
301    pub operator: Operator<'a>,
302}
303
304impl<'a> Instruction<'a> {
305    /// An iterator over the operands of the instruction.
306    pub fn operands(self) -> OperandIterator<'a> {
307        OperandIterator::new(self.operands)
308    }
309}
310
311/// A stack holding the arguments of an operator.
312#[derive(Debug, Clone, PartialEq)]
313pub struct Stack<'a>(SmallVec<[Object<'a>; OPERANDS_THRESHOLD]>);
314
315impl<'a> Default for Stack<'a> {
316    fn default() -> Self {
317        Self::new()
318    }
319}
320
321impl<'a> Stack<'a> {
322    /// Create a new, empty stack.
323    pub fn new() -> Self {
324        Self(SmallVec::new())
325    }
326
327    fn push(&mut self, operand: Object<'a>) {
328        self.0.push(operand);
329    }
330
331    fn clear(&mut self) {
332        self.0.clear();
333    }
334
335    fn len(&self) -> usize {
336        self.0.len()
337    }
338
339    fn get<T>(&self, index: usize) -> Option<T>
340    where
341        T: ObjectLike<'a>,
342    {
343        self.0.get(index).and_then(|e| e.clone().cast::<T>())
344    }
345
346    fn get_all<T>(&self) -> Option<SmallVec<[T; OPERANDS_THRESHOLD]>>
347    where
348        T: ObjectLike<'a>,
349    {
350        let mut operands = SmallVec::new();
351
352        for op in &self.0 {
353            let converted = op.clone().cast::<T>()?;
354            operands.push(converted);
355        }
356
357        Some(operands)
358    }
359}
360
361/// An iterator over the operands of an operator.
362pub struct OperandIterator<'a> {
363    stack: Stack<'a>,
364    cur_index: usize,
365}
366
367impl<'a> OperandIterator<'a> {
368    fn new(stack: Stack<'a>) -> Self {
369        Self {
370            stack,
371            cur_index: 0,
372        }
373    }
374}
375
376impl<'a> Iterator for OperandIterator<'a> {
377    type Item = Object<'a>;
378
379    fn next(&mut self) -> Option<Self::Item> {
380        if let Some(item) = self.stack.get::<Object<'a>>(self.cur_index) {
381            self.cur_index += 1;
382
383            Some(item)
384        } else {
385            None
386        }
387    }
388}
389
390pub(crate) trait OperatorTrait<'a>
391where
392    Self: Sized + Into<TypedInstruction<'a>> + TryFrom<TypedInstruction<'a>>,
393{
394    const OPERATOR: &'static str;
395
396    fn from_stack(stack: &Stack<'a>) -> Option<Self>;
397}
398
mod macros {
    // Shared implementation behind the `opN` macros below.
    //
    // Arguments: the operator type (optionally with a lifetime), the operator
    // string, the expected number of operands and a closure that builds the
    // type from the stack. An expected count of `u8::MAX as usize` disables
    // the operand count check.
    //
    // Besides `OperatorTrait`, this also generates the conversions to and from
    // `TypedInstruction`.
    macro_rules! op_impl {
        ($t:ident $(<$l:lifetime>),*, $e:expr, $n:expr, $body:expr) => {
            impl<'a> OperatorTrait<'a> for $t$(<$l>),* {
                const OPERATOR: &'static str = $e;

                fn from_stack(stack: &Stack<'a>) -> Option<Self> {
                    let expected: usize = $n;

                    // A mismatch is only reported, the conversion is attempted regardless.
                    if expected != u8::MAX as usize && stack.len() != expected {
                        warn!("wrong stack length {} for operator {}, expected {}", stack.len(), Self::OPERATOR, expected);
                    }

                    let converted: Option<Self> = $body(stack);

                    if converted.is_none() {
                        warn!("failed to convert operands for operator {}", Self::OPERATOR);
                    }

                    converted
                }
            }

            impl<'a> From<$t$(<$l>),*> for TypedInstruction<'a> {
                fn from(value: $t$(<$l>),*) -> Self {
                    TypedInstruction::$t(value)
                }
            }

            impl<'a> TryFrom<TypedInstruction<'a>> for $t$(<$l>),* {
                type Error = ();

                fn try_from(value: TypedInstruction<'a>) -> std::result::Result<Self, Self::Error> {
                    match value {
                        TypedInstruction::$t(e) => Ok(e),
                        _ => Err(())
                    }
                }
            }
        };
    }

    // The `shift` variable used below will always be 0 in valid PDFs. Its purpose is
    // that in case there are garbage operands in the content stream, we prefer to use
    // the operands that are closer to the operator instead of the values at the bottom
    // of the stack.

    // An operator without operands.
    macro_rules! op0 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 0, |_| Some(Self));
        }
    }

    // An operator with exactly one operand.
    macro_rules! op1 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 1, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(1);
                Some(Self(stack.get(shift)?))
            });
        }
    }

    // An operator that takes every operand on the stack, however many there are.
    macro_rules! op_all {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, u8::MAX as usize, |stack: &Stack<'a>| {
                Some(Self(stack.get_all()?))
            });
        }
    }

    // An operator with exactly two operands.
    macro_rules! op2 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 2, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(2);
                Some(Self(stack.get(shift)?, stack.get(1 + shift)?))
            });
        }
    }

    // An operator with exactly three operands.
    macro_rules! op3 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 3, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(3);
                Some(Self(
                    stack.get(shift)?,
                    stack.get(1 + shift)?,
                    stack.get(2 + shift)?,
                ))
            });
        }
    }

    // An operator with exactly four operands.
    macro_rules! op4 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 4, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(4);
                Some(Self(
                    stack.get(shift)?,
                    stack.get(1 + shift)?,
                    stack.get(2 + shift)?,
                    stack.get(3 + shift)?,
                ))
            });
        }
    }

    // An operator with exactly six operands.
    macro_rules! op6 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 6, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(6);
                Some(Self(
                    stack.get(shift)?,
                    stack.get(1 + shift)?,
                    stack.get(2 + shift)?,
                    stack.get(3 + shift)?,
                    stack.get(4 + shift)?,
                    stack.get(5 + shift)?,
                ))
            });
        }
    }

    pub(crate) use op_all;
    pub(crate) use op_impl;
    pub(crate) use op0;
    pub(crate) use op1;
    pub(crate) use op2;
    pub(crate) use op3;
    pub(crate) use op4;
    pub(crate) use op6;
}