hayro_syntax/content/
mod.rs

1//! PDF content operators.
2//!
3//! This module provides facilities to read and interpret PDF content streams using
4//! high-level types.
5
6#[allow(missing_docs)]
7pub mod ops;
8
9use crate::content::ops::TypedOperation;
10use crate::object::dict::InlineImageDict;
11use crate::object::name::{Name, skip_name_like};
12use crate::object::stream::Stream;
13use crate::object::{Object, ObjectLike};
14use crate::reader::{Readable, Reader, Skippable};
15use crate::xref::XRef;
16use log::warn;
17use smallvec::SmallVec;
18use std::fmt::{Debug, Formatter};
19use std::ops::Deref;
20
/// Number of operands that are stored inline in an operand stack.
///
/// Six is enough for the common worst cases, such as a transformation matrix
/// or a cubic curve. Longer operand lists (for example for DeviceN color
/// spaces) are expected to be rare.
const OPERANDS_THRESHOLD: usize = 6;
25
26impl Debug for Operator<'_> {
27    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
28        write!(f, "{}", self.0.as_str())
29    }
30}
31
32/// A content stream operator.
33pub struct Operator<'a>(Name<'a>);
34
35impl Deref for Operator<'_> {
36    type Target = [u8];
37
38    fn deref(&self) -> &Self::Target {
39        self.0.as_ref()
40    }
41}
42
43impl Skippable for Operator<'_> {
44    fn skip<const PLAIN: bool>(r: &mut Reader<'_>) -> Option<()> {
45        skip_name_like(r, false).map(|_| ())
46    }
47}
48
49impl<'a> Readable<'a> for Operator<'a> {
50    fn read<const PLAIN: bool>(r: &mut Reader<'a>, _: &'a XRef) -> Option<Self> {
51        let data = {
52            let start = r.offset();
53            skip_name_like(r, false)?;
54            let end = r.offset();
55            let data = r.range(start..end).unwrap();
56
57            if data.is_empty() {
58                return None;
59            }
60
61            data
62        };
63
64        Some(Operator(Name::from_unescaped(data)))
65    }
66}
67
68/// An iterator over PDF content streams that provides access to the operators
69/// in a raw manner by only exposing the operator name and its arguments on the stack.
70#[derive(Clone)]
71pub struct UntypedIter<'a> {
72    reader: Reader<'a>,
73    stack: Stack<'a>,
74}
75
76impl<'a> UntypedIter<'a> {
77    /// Create a new untyped iterator.
78    pub fn new(data: &'a [u8]) -> UntypedIter<'a> {
79        Self {
80            reader: Reader::new(data),
81            stack: Stack::new(),
82        }
83    }
84
85    /// Create a new empty untyped iterator.
86    pub fn empty() -> UntypedIter<'a> {
87        Self {
88            reader: Reader::new(&[]),
89            stack: Stack::new(),
90        }
91    }
92}
93
94impl<'a> Iterator for UntypedIter<'a> {
95    type Item = Operation<'a>;
96
97    fn next(&mut self) -> Option<Self::Item> {
98        self.stack.clear();
99
100        self.reader.skip_white_spaces_and_comments();
101
102        while !self.reader.at_end() {
103            // I believe booleans/null never appear as an operator?
104            if matches!(
105                self.reader.peek_byte()?,
106                b'/' | b'.' | b'+' | b'-' | b'0'..=b'9' | b'[' | b'<' | b'('
107            ) {
108                self.stack.push(self.reader.read_without_xref::<Object>()?);
109            } else {
110                let operator = match self.reader.read_without_xref::<Operator>() {
111                    Some(o) => o,
112                    None => {
113                        warn!("failed to read operator in content stream");
114
115                        self.reader.jump_to_end();
116                        return None;
117                    }
118                };
119
120                // Inline images need special casing...
121                if operator.as_ref() == b"BI" {
122                    // The ID operator will already be consumed by this.
123                    let inline_dict = self.reader.read_without_xref::<InlineImageDict>()?;
124                    let dict = inline_dict.get_dict().clone();
125
126                    // One whitespace after "ID".
127                    self.reader.read_byte()?;
128
129                    let stream_data = self.reader.tail()?;
130                    let start_offset = self.reader.offset();
131
132                    while let Some(bytes) = self.reader.peek_bytes(2) {
133                        if bytes == b"EI" {
134                            let end_offset = self.reader.offset() - start_offset;
135                            let image_data = &stream_data[..end_offset];
136
137                            let stream = Stream::from_raw(image_data, dict.clone());
138
139                            // Note that there is a possibility that the encoded stream data
140                            // contains the "EI" operator as part of the data, in which case we
141                            // cannot confidently know whether we have hit the actual end of the
142                            // stream. See also <https://github.com/pdf-association/pdf-issues/issues/543>
143                            // PDF 2.0 does have a `/Length` attribute we can read, but since it's relatively
144                            // new we don't bother trying to read it.
145                            // Because of this, we instead try to decode the data we currently have,
146                            // and if it doesn't work we assume that the `EI` is not the one we are
147                            // looking for and we keep searching.
148                            if stream.decoded().is_none() {
149                                self.reader.read_bytes(2);
150                                continue;
151                            }
152
153                            self.stack.push(Object::Stream(stream));
154
155                            self.reader.read_bytes(2)?;
156                            self.reader.skip_white_spaces();
157
158                            break;
159                        } else {
160                            self.reader.read_byte()?;
161                        }
162                    }
163                }
164
165                return Some(Operation {
166                    operands: self.stack.clone(),
167                    operator,
168                });
169            }
170
171            self.reader.skip_white_spaces_and_comments();
172        }
173
174        None
175    }
176}
177
178/// An iterator over PDF content streams that provide access to the operators
179/// in a typed fashion.
180#[derive(Clone)]
181pub struct TypedIter<'a> {
182    untyped: UntypedIter<'a>,
183}
184
185impl<'a> TypedIter<'a> {
186    /// Create a new typed iterator.
187    pub fn new(untyped: UntypedIter<'a>) -> TypedIter<'a> {
188        Self { untyped }
189    }
190}
191
192impl<'a> Iterator for TypedIter<'a> {
193    type Item = TypedOperation<'a>;
194
195    fn next(&mut self) -> Option<Self::Item> {
196        self.untyped
197            .next()
198            .and_then(|op| TypedOperation::dispatch(&op))
199    }
200}
201
202/// An operation in a content stream.
203pub struct Operation<'a> {
204    /// The operands of the operator.
205    pub operands: Stack<'a>,
206    /// The actual operator.
207    pub operator: Operator<'a>,
208}
209
210impl<'a> Operation<'a> {
211    /// An iterator over the operands of the operation.
212    pub fn operands(self) -> OperandIterator<'a> {
213        OperandIterator::new(self.operands)
214    }
215}
216
217/// A stack holding the values for an operation.
218#[derive(Debug, Clone, PartialEq)]
219pub struct Stack<'a>(SmallVec<[Object<'a>; OPERANDS_THRESHOLD]>);
220
221impl<'a> Stack<'a> {
222    /// Create a new, empty stack.
223    pub fn new() -> Self {
224        Self(SmallVec::new())
225    }
226
227    fn push(&mut self, operand: Object<'a>) {
228        self.0.push(operand);
229    }
230
231    fn clear(&mut self) {
232        self.0.clear();
233    }
234
235    fn len(&self) -> usize {
236        self.0.len()
237    }
238
239    fn get<T>(&self, index: usize) -> Option<T>
240    where
241        T: ObjectLike<'a>,
242    {
243        self.0.get(index).and_then(|e| e.clone().cast::<T>())
244    }
245
246    fn get_all<T>(&self) -> Option<SmallVec<[T; OPERANDS_THRESHOLD]>>
247    where
248        T: ObjectLike<'a>,
249    {
250        let mut operands = SmallVec::new();
251
252        for op in &self.0 {
253            let converted = op.clone().cast::<T>()?;
254            operands.push(converted);
255        }
256
257        Some(operands)
258    }
259}
260
261/// An iterator over the operands of an operations.
262pub struct OperandIterator<'a> {
263    stack: Stack<'a>,
264    cur_index: usize,
265}
266
267impl<'a> OperandIterator<'a> {
268    fn new(stack: Stack<'a>) -> Self {
269        Self {
270            stack,
271            cur_index: 0,
272        }
273    }
274}
275
276impl<'a> Iterator for OperandIterator<'a> {
277    type Item = Object<'a>;
278
279    fn next(&mut self) -> Option<Self::Item> {
280        if let Some(item) = self.stack.get::<Object<'a>>(self.cur_index) {
281            self.cur_index += 1;
282            Some(item)
283        } else {
284            None
285        }
286    }
287}
288
289pub(crate) trait OperatorTrait<'a>
290where
291    Self: Sized + Into<TypedOperation<'a>> + TryFrom<TypedOperation<'a>>,
292{
293    const OPERATOR: &'static str;
294
295    fn from_stack(stack: &Stack<'a>) -> Option<Self>;
296}
297
mod macros {
    /// Implements `OperatorTrait` as well as the conversions to and from
    /// `TypedOperation` for the operator type `$t`.
    ///
    /// * `$e`: the string stored in `OperatorTrait::OPERATOR`.
    /// * `$n`: the exact number of operands that must be on the stack, or
    ///   `u8::MAX as usize` to skip the length check.
    /// * `$body`: a callable that receives the stack and returns `Option<Self>`.
    macro_rules! op_impl {
        ($t:ident $(<$l:lifetime>),*, $e:expr, $n:expr, $body:expr) => {
            impl<'a> OperatorTrait<'a> for $t$(<$l>),* {
                const OPERATOR: &'static str = $e;

                fn from_stack(stack: &Stack<'a>) -> Option<Self> {
                    // `u8::MAX` acts as a sentinel for "any number of operands".
                    if $n != u8::MAX as usize && stack.len() != $n {
                        warn!("wrong stack length {} for operator {}, expected {}", stack.len(), Self::OPERATOR, $n);

                        return None;
                    }

                    let converted = $body(stack);

                    if converted.is_none() {
                        warn!("failed to convert operands for operator {}", Self::OPERATOR);
                    }

                    converted
                }
            }

            impl<'a> From<$t$(<$l>),*> for TypedOperation<'a> {
                fn from(value: $t$(<$l>),*) -> Self {
                    TypedOperation::$t(value)
                }
            }

            impl<'a> TryFrom<TypedOperation<'a>> for $t$(<$l>),* {
                type Error = ();

                fn try_from(value: TypedOperation<'a>) -> std::result::Result<Self, Self::Error> {
                    match value {
                        TypedOperation::$t(e) => Ok(e),
                        _ => Err(()),
                    }
                }
            }
        };
    }

    /// Operator without operands; `$t` must be a unit struct.
    macro_rules! op0 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 0, |_| Some(Self));
        };
    }

    /// Operator with exactly one operand, stored in a one-field tuple struct.
    macro_rules! op1 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!(
                $t$(<$l>),*,
                $e,
                1,
                |stack: &Stack<'a>| Some(Self(stack.get(0)?))
            );
        };
    }

    /// Operator that accepts any number of operands and stores all of them
    /// in its single field.
    macro_rules! op_all {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!(
                $t$(<$l>),*,
                $e,
                u8::MAX as usize,
                |stack: &Stack<'a>| Some(Self(stack.get_all()?))
            );
        };
    }

    /// Operator with exactly two operands.
    macro_rules! op2 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!(
                $t$(<$l>),*,
                $e,
                2,
                |stack: &Stack<'a>| Some(Self(stack.get(0)?, stack.get(1)?))
            );
        };
    }

    /// Operator with exactly three operands.
    macro_rules! op3 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!(
                $t$(<$l>),*,
                $e,
                3,
                |stack: &Stack<'a>| Some(Self(stack.get(0)?, stack.get(1)?, stack.get(2)?))
            );
        };
    }

    /// Operator with exactly four operands.
    macro_rules! op4 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!(
                $t$(<$l>),*,
                $e,
                4,
                |stack: &Stack<'a>| Some(Self(
                    stack.get(0)?,
                    stack.get(1)?,
                    stack.get(2)?,
                    stack.get(3)?
                ))
            );
        };
    }

    /// Operator with exactly six operands.
    macro_rules! op6 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!(
                $t$(<$l>),*,
                $e,
                6,
                |stack: &Stack<'a>| Some(Self(
                    stack.get(0)?,
                    stack.get(1)?,
                    stack.get(2)?,
                    stack.get(3)?,
                    stack.get(4)?,
                    stack.get(5)?
                ))
            );
        };
    }

    // Make the macros usable from the rest of the crate.
    pub(crate) use op_all;
    pub(crate) use op_impl;
    pub(crate) use op0;
    pub(crate) use op1;
    pub(crate) use op2;
    pub(crate) use op3;
    pub(crate) use op4;
    pub(crate) use op6;
}