hayro_syntax/content/
mod.rs

1/*!
2PDF content operators.
3
4This module provides facilities to read and interpret PDF content streams using
5high-level types.
6
7```
8use hayro_syntax::object::Number;
9use hayro_syntax::content::*;
10use hayro_syntax::content::ops::*;
11
12let content_stream = b"1 0 0 -1 0 200 cm
130 1.0 0 rg
140 0 m
15200 0 l
16200 200 l
170 200 l
18h
19f";
20
21let mut iter = TypedIter::new(content_stream);
22assert!(matches!(iter.next(), Some(TypedInstruction::Transform(_))));
23assert!(matches!(iter.next(), Some(TypedInstruction::NonStrokeColorDeviceRgb(_))));
24assert!(matches!(iter.next(), Some(TypedInstruction::MoveTo(_))));
25assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
26assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
27assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
28assert!(matches!(iter.next(), Some(TypedInstruction::ClosePath(_))));
29assert!(matches!(iter.next(), Some(TypedInstruction::FillPathNonZero(_))));
30```
31*/
32
33#[allow(missing_docs)]
34pub mod ops;
35
36use crate::content::ops::TypedInstruction;
37use crate::object::Stream;
38use crate::object::dict::InlineImageDict;
39use crate::object::name::{Name, skip_name_like};
40use crate::object::{Object, ObjectLike};
41use crate::reader::Reader;
42use crate::reader::{Readable, ReaderContext, ReaderExt, Skippable};
43use log::warn;
44use smallvec::SmallVec;
45use std::fmt::{Debug, Formatter};
46use std::ops::Deref;
47
// Number of operands that the operand `Stack` can hold inline in its `SmallVec`.
//
// 6 operands are used for example for ctm or cubic curves,
// but anything above should be pretty rare (only for example for
// DeviceN color spaces)
const OPERANDS_THRESHOLD: usize = 6;
52
53impl Debug for Operator<'_> {
54    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
55        write!(f, "{}", self.0.as_str())
56    }
57}
58
/// A content stream operator.
///
/// Wraps the [`Name`] that holds the operator's bytes; dereferencing an
/// operator yields those bytes as a `[u8]` slice.
#[derive(Clone, PartialEq)]
pub struct Operator<'a>(Name<'a>);
62
impl Deref for Operator<'_> {
    // An operator dereferences to a byte slice.
    type Target = [u8];

    /// Returns the bytes of the wrapped name.
    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
    }
}
70
71impl Skippable for Operator<'_> {
72    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
73        skip_name_like(r, false).map(|_| ())
74    }
75}
76
77impl<'a> Readable<'a> for Operator<'a> {
78    fn read(r: &mut Reader<'a>, _: &ReaderContext<'_>) -> Option<Self> {
79        let data = {
80            let start = r.offset();
81            skip_name_like(r, false)?;
82            let end = r.offset();
83            let data = r.range(start..end).unwrap();
84
85            if data.is_empty() {
86                return None;
87            }
88
89            data
90        };
91
92        Some(Operator(Name::from_unescaped(data)))
93    }
94}
95
96/// An iterator over operators in the PDF content streams, providing raw access to the instructions.
97#[derive(Clone)]
98pub struct UntypedIter<'a> {
99    reader: Reader<'a>,
100    stack: Stack<'a>,
101}
102
103impl<'a> UntypedIter<'a> {
104    /// Create a new untyped iterator.
105    pub fn new(data: &'a [u8]) -> Self {
106        Self {
107            reader: Reader::new(data),
108            stack: Stack::new(),
109        }
110    }
111
112    /// Create a new empty untyped iterator.
113    pub fn empty() -> Self {
114        Self {
115            reader: Reader::new(&[]),
116            stack: Stack::new(),
117        }
118    }
119}
120
121impl<'a> Iterator for UntypedIter<'a> {
122    type Item = Instruction<'a>;
123
124    fn next(&mut self) -> Option<Self::Item> {
125        self.stack.clear();
126
127        self.reader.skip_white_spaces_and_comments();
128
129        while !self.reader.at_end() {
130            // I believe booleans/null never appear as an operator?
131            if matches!(
132                self.reader.peek_byte()?,
133                b'/' | b'.' | b'+' | b'-' | b'0'..=b'9' | b'[' | b'<' | b'('
134            ) {
135                self.stack
136                    .push(self.reader.read_without_context::<Object<'_>>()?);
137            } else {
138                let operator = match self.reader.read_without_context::<Operator<'_>>() {
139                    Some(o) => o,
140                    None => {
141                        warn!("failed to read operator in content stream");
142
143                        self.reader.jump_to_end();
144                        return None;
145                    }
146                };
147
148                // Inline images need special casing...
149                if operator.as_ref() == b"BI" {
150                    // The ID operator will already be consumed by this.
151                    let inline_dict = self.reader.read_without_context::<InlineImageDict<'_>>()?;
152                    let dict = inline_dict.get_dict().clone();
153
154                    // One whitespace after "ID".
155                    self.reader.read_white_space()?;
156
157                    let stream_data = self.reader.tail()?;
158                    let start_offset = self.reader.offset();
159
160                    'outer: while let Some(bytes) = self.reader.peek_bytes(2) {
161                        if bytes == b"EI" {
162                            let end_offset = self.reader.offset() - start_offset;
163                            let image_data = &stream_data[..end_offset];
164
165                            let stream = Stream::new(image_data, dict.clone());
166
167                            // Note that there is a possibility that the encoded stream data
168                            // contains the "EI" operator as part of the data, in which case we
169                            // cannot confidently know whether we have hit the actual end of the
170                            // stream. See also <https://github.com/pdf-association/pdf-issues/issues/543>
171                            // PDF 2.0 does have a `/Length` attribute we can read, but since it's relatively
172                            // new we don't bother trying to read it.
173                            let tail = &self.reader.tail()?[2..];
174                            let mut find_reader = Reader::new(tail);
175
176                            while let Some(bytes) = find_reader.peek_bytes(2) {
177                                if bytes == b"EI" {
178                                    let analyze_data = &tail;
179
180                                    // If there is any binary data in-between, we for sure
181                                    // have not reached the end.
182                                    if analyze_data.iter().any(|c| !c.is_ascii()) {
183                                        self.reader.read_bytes(2)?;
184                                        continue 'outer;
185                                    }
186
187                                    // Otherwise, the only possibility that we reached an
188                                    // "EI", even though the previous one was valid, is
189                                    // that it's part of a string in the content
190                                    // stream that follows the inline image. Therefore,
191                                    // it should be valid to interpret `tail` as a content
192                                    // stream and there should be at least one text-related
193                                    // operator that can be parsed correctly.
194
195                                    let iter = TypedIter::new(tail);
196                                    let mut found = false;
197
198                                    for (counter, op) in iter.enumerate() {
199                                        // If we have read more than 20 valid operators, it should be
200                                        // safe to assume that we are in a content stream, so abort
201                                        // early. The only situation where this could reasonably
202                                        // be violated is if we have 20 subsequent instances of
203                                        // q/Q in the image data, which seems very unlikely.
204                                        if counter >= 20 {
205                                            found = true;
206                                            break;
207                                        }
208
209                                        if matches!(
210                                            op,
211                                            TypedInstruction::NextLineAndShowText(_)
212                                                | TypedInstruction::ShowText(_)
213                                                | TypedInstruction::ShowTexts(_)
214                                                | TypedInstruction::ShowTextWithParameters(_)
215                                        ) {
216                                            // Now it should be safe to assume that the
217                                            // previous `EI` was the correct one.
218                                            found = true;
219                                            break;
220                                        }
221                                    }
222
223                                    if !found {
224                                        // Seems like the data in-between is not a valid content
225                                        // stream, so we are likely still within the image data.
226                                        self.reader.read_bytes(2)?;
227                                        continue 'outer;
228                                    }
229                                } else if bytes == b"BI" {
230                                    // Possibly another inline image, if so, the previously found "EI"
231                                    // is indeed the end of data.
232                                    let mut cloned = find_reader.clone();
233                                    cloned.read_bytes(2)?;
234                                    if cloned
235                                        .read_without_context::<InlineImageDict<'_>>()
236                                        .is_some()
237                                    {
238                                        break;
239                                    }
240                                }
241
242                                find_reader.read_byte()?;
243                            }
244
245                            self.stack.push(Object::Stream(stream));
246
247                            self.reader.read_bytes(2)?;
248                            self.reader.skip_white_spaces();
249
250                            break;
251                        } else {
252                            self.reader.read_byte()?;
253                        }
254                    }
255                }
256
257                return Some(Instruction {
258                    operands: self.stack.clone(),
259                    operator,
260                });
261            }
262
263            self.reader.skip_white_spaces_and_comments();
264        }
265
266        None
267    }
268}
269
270/// An iterator over PDF content streams that provide access to the instructions
271/// in a typed fashion.
272#[derive(Clone)]
273pub struct TypedIter<'a> {
274    untyped: UntypedIter<'a>,
275}
276
277impl<'a> TypedIter<'a> {
278    /// Create a new typed iterator.
279    pub fn new(data: &'a [u8]) -> Self {
280        Self {
281            untyped: UntypedIter::new(data),
282        }
283    }
284
285    pub(crate) fn from_untyped(untyped: UntypedIter<'a>) -> Self {
286        Self { untyped }
287    }
288}
289
290impl<'a> Iterator for TypedIter<'a> {
291    type Item = TypedInstruction<'a>;
292
293    fn next(&mut self) -> Option<Self::Item> {
294        let op = &self.untyped.next()?;
295        match TypedInstruction::dispatch(op) {
296            Some(op) => Some(op),
297            // In case this returns `None`, the content stream is invalid. In case a path-drawing
298            // operator was used, let's abort completely, otherwise we might end up drawing random stuff.
299            // However, for other operators it could be worth it to just skip it but keep attempting
300            // to read other content operators.
301            None => {
302                if [
303                    &b"m"[..],
304                    &b"l"[..],
305                    &b"c"[..],
306                    &b"v"[..],
307                    &b"y"[..],
308                    &b"h"[..],
309                    &b"re"[..],
310                ]
311                .contains(&op.operator.0.deref())
312                {
313                    None
314                } else {
315                    Some(TypedInstruction::Fallback(op.operator.clone()))
316                }
317            }
318        }
319    }
320}
321
322/// An instruction (= operator and its operands) in a content stream.
323pub struct Instruction<'a> {
324    /// The stack containing the operands.
325    pub operands: Stack<'a>,
326    /// The actual operator.
327    pub operator: Operator<'a>,
328}
329
330impl<'a> Instruction<'a> {
331    /// An iterator over the operands of the instruction.
332    pub fn operands(self) -> OperandIterator<'a> {
333        OperandIterator::new(self.operands)
334    }
335}
336
337/// A stack holding the arguments of an operator.
338#[derive(Debug, Clone, PartialEq)]
339pub struct Stack<'a>(SmallVec<[Object<'a>; OPERANDS_THRESHOLD]>);
340
341impl<'a> Default for Stack<'a> {
342    fn default() -> Self {
343        Self::new()
344    }
345}
346
347impl<'a> Stack<'a> {
348    /// Create a new, empty stack.
349    pub fn new() -> Self {
350        Self(SmallVec::new())
351    }
352
353    fn push(&mut self, operand: Object<'a>) {
354        self.0.push(operand);
355    }
356
357    fn clear(&mut self) {
358        self.0.clear();
359    }
360
361    fn len(&self) -> usize {
362        self.0.len()
363    }
364
365    fn get<T>(&self, index: usize) -> Option<T>
366    where
367        T: ObjectLike<'a>,
368    {
369        self.0.get(index).and_then(|e| e.clone().cast::<T>())
370    }
371
372    fn get_all<T>(&self) -> Option<SmallVec<[T; OPERANDS_THRESHOLD]>>
373    where
374        T: ObjectLike<'a>,
375    {
376        let mut operands = SmallVec::new();
377
378        for op in &self.0 {
379            let converted = op.clone().cast::<T>()?;
380            operands.push(converted);
381        }
382
383        Some(operands)
384    }
385}
386
387/// An iterator over the operands of an operator.
388pub struct OperandIterator<'a> {
389    stack: Stack<'a>,
390    cur_index: usize,
391}
392
393impl<'a> OperandIterator<'a> {
394    fn new(stack: Stack<'a>) -> Self {
395        Self {
396            stack,
397            cur_index: 0,
398        }
399    }
400}
401
402impl<'a> Iterator for OperandIterator<'a> {
403    type Item = Object<'a>;
404
405    fn next(&mut self) -> Option<Self::Item> {
406        if let Some(item) = self.stack.get::<Object<'a>>(self.cur_index) {
407            self.cur_index += 1;
408
409            Some(item)
410        } else {
411            None
412        }
413    }
414}
415
/// Common interface of the typed operator structs.
///
/// Implementors must be convertible into a [`TypedInstruction`] and
/// fallibly convertible back from one.
pub(crate) trait OperatorTrait<'a>
where
    Self: Sized + Into<TypedInstruction<'a>> + TryFrom<TypedInstruction<'a>>,
{
    /// The operator string associated with the implementing type
    /// (presumably as it is spelled in a content stream; it is also used
    /// in log messages).
    const OPERATOR: &'static str;

    /// Attempt to build the operator from the operands on `stack`,
    /// returning `None` if that is not possible.
    fn from_stack(stack: &Stack<'a>) -> Option<Self>;
}
424
// Helper macros that generate the `OperatorTrait`, `From` and `TryFrom`
// implementations for operator types.
mod macros {
    // Core macro used by all the `opN` macros below.
    //
    // Arguments: the operator type (optionally with a lifetime), the operator
    // string, the expected number of operands and a closure that builds the
    // type from a `&Stack`. Passing `u8::MAX as usize` as the operand count
    // disables the stack length check.
    macro_rules! op_impl {
        ($t:ident $(<$l:lifetime>),*, $e:expr, $n:expr, $body:expr) => {
            impl<'a> OperatorTrait<'a> for $t$(<$l>),* {
                const OPERATOR: &'static str = $e;

                fn from_stack(stack: &Stack<'a>) -> Option<Self> {
                    // A wrong operand count is only logged; conversion is
                    // still attempted afterwards.
                    if $n != u8::MAX as usize {
                        if stack.len() != $n {
                            warn!("wrong stack length {} for operator {}, expected {}", stack.len(), Self::OPERATOR, $n);
                        }
                    }

                    $body(stack).or_else(|| {
                        warn!("failed to convert operands for operator {}", Self::OPERATOR);

                        None
                    })
                }
            }

            // Wrap the operator in the `TypedInstruction` variant of the same name.
            impl<'a> From<$t$(<$l>),*> for TypedInstruction<'a> {
                fn from(value: $t$(<$l>),*) -> Self {
                    TypedInstruction::$t(value)
                }
            }

            // Extract the operator again; any other variant is an error.
            impl<'a> TryFrom<TypedInstruction<'a>> for $t$(<$l>),* {
                type Error = ();

                fn try_from(value: TypedInstruction<'a>) -> std::result::Result<Self, Self::Error> {
                    match value {
                        TypedInstruction::$t(e) => Ok(e),
                        _ => Err(())
                    }
                }
            }
        };
    }

    // The `shift` parameter will always be 0 in valid PDFs. The purpose of the parameter is
    // so that in case there are garbage operands in the content stream, we prefer to use
    // the operands that are closer to the operator instead of the values at the bottom
    // of the stack.

    // Operator without operands.
    macro_rules! op0 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 0, |_| Some(Self));
        }
    }

    // Operator with one operand, taken from the top of the stack.
    macro_rules! op1 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 1, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(1);
                Some(Self(stack.get(0 + shift)?))
            });
        }
    }

    // Operator that takes every operand on the stack, whatever their number
    // (the stack length check is disabled via `u8::MAX`).
    macro_rules! op_all {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, u8::MAX as usize, |stack: &Stack<'a>|
            Some(Self(stack.get_all()?)));
        }
    }

    // Operator with two operands, taken from the top of the stack.
    macro_rules! op2 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 2, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(2);
                Some(Self(stack.get(0 + shift)?, stack.get(1 + shift)?))
            });
        }
    }

    // Operator with three operands, taken from the top of the stack.
    macro_rules! op3 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 3, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(3);
                Some(Self(stack.get(0 + shift)?, stack.get(1 + shift)?,
                stack.get(2 + shift)?))
            });
        }
    }

    // Operator with four operands, taken from the top of the stack.
    macro_rules! op4 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 4, |stack: &Stack<'a>| {
               let shift = stack.len().saturating_sub(4);
            Some(Self(stack.get(0 + shift)?, stack.get(1 + shift)?,
            stack.get(2 + shift)?, stack.get(3 + shift)?))
            });
        }
    }

    // Operator with six operands, taken from the top of the stack.
    macro_rules! op6 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 6, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(6);
            Some(Self(stack.get(0 + shift)?, stack.get(1 + shift)?,
            stack.get(2 + shift)?, stack.get(3 + shift)?,
            stack.get(4 + shift)?, stack.get(5 + shift)?))
            });
        }
    }

    // Re-export the macros so that they can be used elsewhere in the crate.
    pub(crate) use op_all;
    pub(crate) use op_impl;
    pub(crate) use op0;
    pub(crate) use op1;
    pub(crate) use op2;
    pub(crate) use op3;
    pub(crate) use op4;
    pub(crate) use op6;
}