hayro_syntax/content/
mod.rs

1/*!
2PDF content operators.
3
4This module provides facilities to read and interpret PDF content streams using
5high-level types.
6
7```
8use hayro_syntax::object::Number;
9use hayro_syntax::content::*;
10use hayro_syntax::content::ops::*;
11
12let content_stream = b"1 0 0 -1 0 200 cm
130 1.0 0 rg
140 0 m
15200 0 l
16200 200 l
170 200 l
18h
19f";
20
21let mut iter = TypedIter::new(content_stream);
22assert!(matches!(iter.next(), Some(TypedInstruction::Transform(_))));
23assert!(matches!(iter.next(), Some(TypedInstruction::NonStrokeColorDeviceRgb(_))));
24assert!(matches!(iter.next(), Some(TypedInstruction::MoveTo(_))));
25assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
26assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
27assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
28assert!(matches!(iter.next(), Some(TypedInstruction::ClosePath(_))));
29assert!(matches!(iter.next(), Some(TypedInstruction::FillPathNonZero(_))));
30```
31*/
32
33#[allow(missing_docs)]
34pub mod ops;
35
36use crate::content::ops::TypedInstruction;
37use crate::object::Stream;
38use crate::object::dict::InlineImageDict;
39use crate::object::name::{Name, skip_name_like};
40use crate::object::{Object, ObjectLike};
41use crate::reader::{Readable, Reader, ReaderContext, Skippable};
42use log::warn;
43use smallvec::SmallVec;
44use std::fmt::{Debug, Formatter};
45use std::ops::Deref;
46
// Inline capacity of the operand stack. Six operands are enough for the
// common worst cases (for example the ctm or cubic curves); anything
// longer should be rare (for example colors in DeviceN color spaces).
const OPERANDS_THRESHOLD: usize = 6;
51
52impl Debug for Operator<'_> {
53    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
54        write!(f, "{}", self.0.as_str())
55    }
56}
57
58/// A content stream operator.
59#[derive(Clone, PartialEq)]
60pub struct Operator<'a>(Name<'a>);
61
62impl Deref for Operator<'_> {
63    type Target = [u8];
64
65    fn deref(&self) -> &Self::Target {
66        self.0.as_ref()
67    }
68}
69
70impl Skippable for Operator<'_> {
71    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
72        skip_name_like(r, false).map(|_| ())
73    }
74}
75
76impl<'a> Readable<'a> for Operator<'a> {
77    fn read(r: &mut Reader<'a>, _: ReaderContext) -> Option<Self> {
78        let data = {
79            let start = r.offset();
80            skip_name_like(r, false)?;
81            let end = r.offset();
82            let data = r.range(start..end).unwrap();
83
84            if data.is_empty() {
85                return None;
86            }
87
88            data
89        };
90
91        Some(Operator(Name::from_unescaped(data)))
92    }
93}
94
95/// An iterator over operators in the PDF content streams, providing raw access to the instructions.
96#[derive(Clone)]
97pub struct UntypedIter<'a> {
98    reader: Reader<'a>,
99    stack: Stack<'a>,
100}
101
102impl<'a> UntypedIter<'a> {
103    /// Create a new untyped iterator.
104    pub fn new(data: &'a [u8]) -> UntypedIter<'a> {
105        Self {
106            reader: Reader::new(data),
107            stack: Stack::new(),
108        }
109    }
110
111    /// Create a new empty untyped iterator.
112    pub fn empty() -> UntypedIter<'a> {
113        Self {
114            reader: Reader::new(&[]),
115            stack: Stack::new(),
116        }
117    }
118}
119
120impl<'a> Iterator for UntypedIter<'a> {
121    type Item = Instruction<'a>;
122
123    fn next(&mut self) -> Option<Self::Item> {
124        self.stack.clear();
125
126        self.reader.skip_white_spaces_and_comments();
127
128        while !self.reader.at_end() {
129            // I believe booleans/null never appear as an operator?
130            if matches!(
131                self.reader.peek_byte()?,
132                b'/' | b'.' | b'+' | b'-' | b'0'..=b'9' | b'[' | b'<' | b'('
133            ) {
134                self.stack
135                    .push(self.reader.read_without_context::<Object>()?);
136            } else {
137                let operator = match self.reader.read_without_context::<Operator>() {
138                    Some(o) => o,
139                    None => {
140                        warn!("failed to read operator in content stream");
141
142                        self.reader.jump_to_end();
143                        return None;
144                    }
145                };
146
147                // Inline images need special casing...
148                if operator.as_ref() == b"BI" {
149                    // The ID operator will already be consumed by this.
150                    let inline_dict = self.reader.read_without_context::<InlineImageDict>()?;
151                    let dict = inline_dict.get_dict().clone();
152
153                    // One whitespace after "ID".
154                    self.reader.read_white_space()?;
155
156                    let stream_data = self.reader.tail()?;
157                    let start_offset = self.reader.offset();
158
159                    'outer: while let Some(bytes) = self.reader.peek_bytes(2) {
160                        if bytes == b"EI" {
161                            let end_offset = self.reader.offset() - start_offset;
162                            let image_data = &stream_data[..end_offset];
163
164                            let stream = Stream::from_raw(image_data, dict.clone());
165
166                            // Note that there is a possibility that the encoded stream data
167                            // contains the "EI" operator as part of the data, in which case we
168                            // cannot confidently know whether we have hit the actual end of the
169                            // stream. See also <https://github.com/pdf-association/pdf-issues/issues/543>
170                            // PDF 2.0 does have a `/Length` attribute we can read, but since it's relatively
171                            // new we don't bother trying to read it.
172                            let mut find_reader = Reader::new(&self.reader.tail()?[2..]);
173
174                            while let Some(bytes) = find_reader.peek_bytes(2) {
175                                if bytes == b"EI" {
176                                    // We found another "EI" without a corresponding "BI", so the
177                                    // EI we found above is not the end of data.
178                                    self.reader.read_bytes(2)?;
179                                    continue 'outer;
180                                } else if bytes == b"BI" {
181                                    // Possibly another inline image, if so, the previously found "EI"
182                                    // is indeed the end of data.
183                                    let mut cloned = find_reader.clone();
184                                    cloned.read_bytes(2)?;
185                                    if cloned.read_without_context::<InlineImageDict>().is_some() {
186                                        break;
187                                    }
188                                }
189
190                                find_reader.read_byte()?;
191                            }
192
193                            self.stack.push(Object::Stream(stream));
194
195                            self.reader.read_bytes(2)?;
196                            self.reader.skip_white_spaces();
197
198                            break;
199                        } else {
200                            self.reader.read_byte()?;
201                        }
202                    }
203                }
204
205                return Some(Instruction {
206                    operands: self.stack.clone(),
207                    operator,
208                });
209            }
210
211            self.reader.skip_white_spaces_and_comments();
212        }
213
214        None
215    }
216}
217
218/// An iterator over PDF content streams that provide access to the instructions
219/// in a typed fashion.
220#[derive(Clone)]
221pub struct TypedIter<'a> {
222    untyped: UntypedIter<'a>,
223}
224
225impl<'a> TypedIter<'a> {
226    /// Create a new typed iterator.
227    pub fn new(data: &'a [u8]) -> TypedIter<'a> {
228        Self {
229            untyped: UntypedIter::new(data),
230        }
231    }
232
233    pub(crate) fn from_untyped(untyped: UntypedIter<'a>) -> TypedIter<'a> {
234        Self { untyped }
235    }
236}
237
238impl<'a> Iterator for TypedIter<'a> {
239    type Item = TypedInstruction<'a>;
240
241    fn next(&mut self) -> Option<Self::Item> {
242        self.untyped
243            .next()
244            .and_then(|op| TypedInstruction::dispatch(&op))
245    }
246}
247
248/// An instruction (= operator and its operands) in a content stream.
249pub struct Instruction<'a> {
250    /// The stack containing the operands.
251    pub operands: Stack<'a>,
252    /// The actual operator.
253    pub operator: Operator<'a>,
254}
255
256impl<'a> Instruction<'a> {
257    /// An iterator over the operands of the instruction.
258    pub fn operands(self) -> OperandIterator<'a> {
259        OperandIterator::new(self.operands)
260    }
261}
262
263/// A stack holding the arguments of an operator.
264#[derive(Debug, Clone, PartialEq)]
265pub struct Stack<'a>(SmallVec<[Object<'a>; OPERANDS_THRESHOLD]>);
266
267impl<'a> Default for Stack<'a> {
268    fn default() -> Self {
269        Self::new()
270    }
271}
272
273impl<'a> Stack<'a> {
274    /// Create a new, empty stack.
275    pub fn new() -> Self {
276        Self(SmallVec::new())
277    }
278
279    fn push(&mut self, operand: Object<'a>) {
280        self.0.push(operand);
281    }
282
283    fn clear(&mut self) {
284        self.0.clear();
285    }
286
287    fn len(&self) -> usize {
288        self.0.len()
289    }
290
291    fn get<T>(&self, index: usize) -> Option<T>
292    where
293        T: ObjectLike<'a>,
294    {
295        self.0.get(index).and_then(|e| e.clone().cast::<T>())
296    }
297
298    fn get_all<T>(&self) -> Option<SmallVec<[T; OPERANDS_THRESHOLD]>>
299    where
300        T: ObjectLike<'a>,
301    {
302        let mut operands = SmallVec::new();
303
304        for op in &self.0 {
305            let converted = op.clone().cast::<T>()?;
306            operands.push(converted);
307        }
308
309        Some(operands)
310    }
311}
312
313/// An iterator over the operands of an operator.
314pub struct OperandIterator<'a> {
315    stack: Stack<'a>,
316    cur_index: usize,
317}
318
319impl<'a> OperandIterator<'a> {
320    fn new(stack: Stack<'a>) -> Self {
321        Self {
322            stack,
323            cur_index: 0,
324        }
325    }
326}
327
328impl<'a> Iterator for OperandIterator<'a> {
329    type Item = Object<'a>;
330
331    fn next(&mut self) -> Option<Self::Item> {
332        if let Some(item) = self.stack.get::<Object<'a>>(self.cur_index) {
333            self.cur_index += 1;
334
335            Some(item)
336        } else {
337            None
338        }
339    }
340}
341
342pub(crate) trait OperatorTrait<'a>
343where
344    Self: Sized + Into<TypedInstruction<'a>> + TryFrom<TypedInstruction<'a>>,
345{
346    const OPERATOR: &'static str;
347
348    fn from_stack(stack: &Stack<'a>) -> Option<Self>;
349}
350
mod macros {
    // Generates, for the operator type `$t`:
    // - the `OperatorTrait` impl with operator string `$e`, expected operand
    //   count `$n` and conversion closure `$body`,
    // - `From<$t> for TypedInstruction`,
    // - `TryFrom<TypedInstruction> for $t`.
    macro_rules! op_impl {
        ($t:ident $(<$l:lifetime>),*, $e:expr, $n:expr, $body:expr) => {
            impl<'a> OperatorTrait<'a> for $t$(<$l>),* {
                const OPERATOR: &'static str = $e;

                fn from_stack(stack: &Stack<'a>) -> Option<Self> {
                    // An expected count of `u8::MAX` disables the length check
                    // (used by `op_all`). A mismatch is only logged; the
                    // conversion is attempted regardless.
                    if $n != u8::MAX as usize && stack.len() != $n {
                        warn!("wrong stack length {} for operator {}, expected {}", stack.len(), Self::OPERATOR, $n);
                    }

                    let converted = $body(stack);

                    if converted.is_none() {
                        warn!("failed to convert operands for operator {}", Self::OPERATOR);
                    }

                    converted
                }
            }

            impl<'a> From<$t$(<$l>),*> for TypedInstruction<'a> {
                fn from(value: $t$(<$l>),*) -> Self {
                    TypedInstruction::$t(value)
                }
            }

            impl<'a> TryFrom<TypedInstruction<'a>> for $t$(<$l>),* {
                type Error = ();

                fn try_from(value: TypedInstruction<'a>) -> std::result::Result<Self, Self::Error> {
                    if let TypedInstruction::$t(inner) = value {
                        Ok(inner)
                    } else {
                        Err(())
                    }
                }
            }
        };
    }

    // Operator without operands (unit struct).
    macro_rules! op0 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 0, |_| Some(Self));
        }
    }

    // Operator with exactly one operand.
    macro_rules! op1 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 1, |stack: &Stack<'a>|
            Some(Self(stack.get(0)?)));
        }
    }

    // Operator with exactly two operands.
    macro_rules! op2 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 2, |stack: &Stack<'a>|
            Some(Self(stack.get(0)?, stack.get(1)?)));
        }
    }

    // Operator with exactly three operands.
    macro_rules! op3 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 3, |stack: &Stack<'a>|
            Some(Self(stack.get(0)?, stack.get(1)?, stack.get(2)?)));
        }
    }

    // Operator with exactly four operands.
    macro_rules! op4 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 4, |stack: &Stack<'a>|
            Some(Self(stack.get(0)?, stack.get(1)?, stack.get(2)?, stack.get(3)?)));
        }
    }

    // Operator with exactly six operands.
    macro_rules! op6 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 6, |stack: &Stack<'a>|
            Some(Self(
                stack.get(0)?, stack.get(1)?, stack.get(2)?,
                stack.get(3)?, stack.get(4)?, stack.get(5)?,
            )));
        }
    }

    // Operator that takes every operand on the stack, whatever their number.
    macro_rules! op_all {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, u8::MAX as usize, |stack: &Stack<'a>|
            Some(Self(stack.get_all()?)));
        }
    }

    pub(crate) use op_impl;
    pub(crate) use op0;
    pub(crate) use op1;
    pub(crate) use op2;
    pub(crate) use op3;
    pub(crate) use op4;
    pub(crate) use op6;
    pub(crate) use op_all;
}