pdf-syntax 0.5.0

/*!
PDF content operators.

This module provides facilities to read and interpret PDF content streams using
high-level types.

```
use pdf_syntax::object::Number;
use pdf_syntax::content::*;
use pdf_syntax::content::ops::*;

let content_stream = b"1 0 0 -1 0 200 cm
0 1.0 0 rg
0 0 m
200 0 l
200 200 l
0 200 l
h
f";

let mut iter = TypedIter::new(content_stream);
assert!(matches!(iter.next(), Some(TypedInstruction::Transform(_))));
assert!(matches!(iter.next(), Some(TypedInstruction::NonStrokeColorDeviceRgb(_))));
assert!(matches!(iter.next(), Some(TypedInstruction::MoveTo(_))));
assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
assert!(matches!(iter.next(), Some(TypedInstruction::LineTo(_))));
assert!(matches!(iter.next(), Some(TypedInstruction::ClosePath(_))));
assert!(matches!(iter.next(), Some(TypedInstruction::FillPathNonZero(_))));
```
*/

#[allow(missing_docs)]
pub mod ops;

use crate::content::ops::TypedInstruction;
use crate::object::Stream;
use crate::object::dict::InlineImageDict;
use crate::object::dict::keys::{
    ASCII85_DECODE_ABBREVIATION, BITS_PER_COMPONENT, BPC, COLORSPACE, CS, F, FILTER, H, HEIGHT, IM,
    IMAGE_MASK, W, WIDTH,
};
use crate::object::name::{Name, skip_name_like};
use crate::object::{Array, Number, Object, ObjectLike};
use crate::reader::Reader;
use crate::reader::{Readable, ReaderContext, ReaderExt, Skippable};
use core::fmt::{Debug, Formatter};
use core::ops::Deref;
use log::warn;
use smallvec::SmallVec;

// Inline capacity of the `SmallVec`s that hold the operands of an instruction.
// 6 operands are used, for example, for the ctm or for cubic curves; anything
// above that should be pretty rare (for example, only for DeviceN color
// spaces).
const OPERANDS_THRESHOLD: usize = 6;

/// For unfiltered raw inline images, compute the exact byte count of the image data:
///   `H × ceil(W × BPC × num_components / 8)`
///
/// Returns `None` when
/// - the image has any filter (`/F` or `/Filter`, given as a name or as an array),
///   because the formula only applies to raw data,
/// - the width or height entry is missing,
/// - the color space is missing or unrecognised (unless the image is an image mask), or
/// - the computation would overflow `usize` (absurd dimensions in a malformed file).
fn compute_raw_inline_image_size(dict: &crate::object::dict::Dict<'_>) -> Option<usize> {
    // If there is any filter, the data is compressed — can't use raw size formula.
    let has_filter = dict.get::<Name>(F).is_some()
        || dict.get::<Name>(FILTER).is_some()
        || dict.get::<Array>(F).is_some()
        || dict.get::<Array>(FILTER).is_some();
    if has_filter {
        return None;
    }

    // /IM true → 1-component bilevel image, always 1 bpc.
    let is_image_mask =
        dict.get::<bool>(IM).unwrap_or(false) || dict.get::<bool>(IMAGE_MASK).unwrap_or(false);

    // Float-to-integer `as` casts saturate: negative/NaN values become 0 and huge
    // values become `usize::MAX`. The latter is caught by the checked arithmetic below.
    let w = dict
        .get::<Number>(W)
        .or_else(|| dict.get::<Number>(WIDTH))?
        .as_f64() as usize;
    let h = dict
        .get::<Number>(H)
        .or_else(|| dict.get::<Number>(HEIGHT))?
        .as_f64() as usize;

    let (bpc, components): (usize, usize) = if is_image_mask {
        (1, 1)
    } else {
        // Bits per component defaults to 8 when the entry is absent.
        let bpc = dict
            .get::<Number>(BPC)
            .or_else(|| dict.get::<Number>(BITS_PER_COMPONENT))
            .map(|n| n.as_f64() as usize)
            .unwrap_or(8);
        // A missing color space is treated like an unknown one: no size can be computed.
        let cs_name = dict
            .get::<Name>(CS)
            .or_else(|| dict.get::<Name>(COLORSPACE))?;
        let cs_bytes: &[u8] = cs_name.as_ref();
        let components = match cs_bytes {
            b"G" | b"DeviceGray" | b"I" | b"Indexed" => 1,
            b"RGB" | b"DeviceRGB" => 3,
            b"CMYK" | b"DeviceCMYK" => 4,
            _ => return None, // Unknown color space — can't compute size
        };
        (bpc, components)
    };

    // Row stride in bytes (rounded up to byte boundary), total size = h * stride.
    // The dictionary values come from an untrusted file, so every multiplication is
    // checked: an overflow must not panic or wrap around to a bogus small size.
    let bits_per_row = w.checked_mul(bpc)?.checked_mul(components)?;
    let stride = bits_per_row.div_ceil(8);
    h.checked_mul(stride)
}

/// Locates the end of an ASCII85-encoded inline image.
///
/// When the outermost filter of the inline image (`/F` or `/Filter`, either a name
/// or the first entry of an array) is `/A85` or `/ASCII85Decode`, the encoded data
/// ends with `~>`, followed by optional whitespace and then `EI`.
///
/// On success returns `(image_data_end, advance)`:
///   - `image_data_end`: exclusive end of the raw image bytes (includes `~>`)
///   - `advance`: number of bytes from the start of `stream_data` to the position
///     just past `EI`
///
/// Returns `None` if the outermost filter is not ASCII85 or no such terminator
/// exists in `stream_data`.
fn find_a85_inline_image_end(
    stream_data: &[u8],
    dict: &crate::object::dict::Dict<'_>,
) -> Option<(usize, usize)> {
    // Bytes accepted as separators between `~>` and `EI`, and right after `EI`.
    let is_separator = |byte: u8| matches!(byte, b' ' | b'\t' | b'\n' | b'\r' | 0x0C);

    // Name entries win over array entries, and the abbreviated key is consulted
    // before the full one within each group.
    let outermost_filter: Option<Vec<u8>> = dict
        .get::<Name>(F)
        .or_else(|| dict.get::<Name>(FILTER))
        .or_else(|| {
            dict.get::<Array>(F)
                .and_then(|filters| filters.iter::<Name>().next())
        })
        .or_else(|| {
            dict.get::<Array>(FILTER)
                .and_then(|filters| filters.iter::<Name>().next())
        })
        .map(|name| name.as_ref().to_vec());

    match outermost_filter.as_deref() {
        Some(ASCII85_DECODE_ABBREVIATION) | Some(b"ASCII85Decode") => {}
        _ => return None,
    }

    // Walk over every two-byte window looking for the `~>` end-of-data marker.
    for (pos, pair) in stream_data.windows(2).enumerate() {
        if pair != b"~>" {
            continue;
        }

        let data_end = pos + 2;
        let gap = stream_data[data_end..]
            .iter()
            .take_while(|&&byte| is_separator(byte))
            .count();
        let ei_pos = data_end + gap;

        // `~>` not followed by `EI` — malformed, keep scanning further.
        if !stream_data[ei_pos..].starts_with(b"EI") {
            continue;
        }

        // `EI` only counts if it is followed by a separator or the end of the data.
        let past_ei = ei_pos + 2;
        let delimited = match stream_data.get(past_ei) {
            Some(&byte) => is_separator(byte),
            None => true,
        };
        if delimited {
            return Some((data_end, past_ei));
        }
    }

    None
}

impl Debug for Operator {
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        write!(f, "{}", self.0.as_str())
    }
}

/// A content stream operator.
///
/// This is a thin wrapper around the [`Name`] that stores the operator's bytes;
/// it dereferences to those bytes.
#[derive(Clone, PartialEq)]
pub struct Operator(Name);

impl Deref for Operator {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
    }
}

impl Skippable for Operator {
    /// Skips over an operator using the name-like skipping rules; the second
    /// argument is ignored.
    fn skip(r: &mut Reader<'_>, _: bool) -> Option<()> {
        skip_name_like(r, false)?;
        Some(())
    }
}

impl Readable<'_> for Operator {
    /// Reads an operator from the current reader position.
    ///
    /// Returns `None` if skipping fails, if the consumed range cannot be
    /// retrieved, or if no bytes were consumed at all.
    fn read(r: &mut Reader<'_>, _: &ReaderContext<'_>) -> Option<Self> {
        let begin = r.offset();
        skip_name_like(r, false)?;
        let finish = r.offset();

        let bytes = r.range(begin..finish)?;

        // An operator must consist of at least one byte.
        (!bytes.is_empty()).then(|| Self(Name::new(bytes)))
    }
}

/// An iterator over operators in the PDF content streams, providing raw access to the instructions.
///
/// Each item is an [`Instruction`], i.e. an operator together with the operands
/// that preceded it.
#[derive(Clone)]
pub struct UntypedIter<'a> {
    // Reader positioned at the next unread byte of the content stream.
    reader: Reader<'a>,
    // Operands collected so far for the instruction currently being read.
    stack: Stack<'a>,
}

impl<'a> UntypedIter<'a> {
    /// Create a new untyped iterator over the given content stream data.
    pub fn new(data: &'a [u8]) -> Self {
        let reader = Reader::new(data);
        let stack = Stack::new();

        Self { reader, stack }
    }

    /// Create a new untyped iterator over no data at all.
    pub fn empty() -> Self {
        Self::new(&[])
    }
}

impl<'a> Iterator for UntypedIter<'a> {
    type Item = Instruction<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        self.stack.clear();

        self.reader.skip_white_spaces_and_comments();

        while !self.reader.at_end() {
            // I believe booleans/null never appear as an operator?
            if matches!(
                self.reader.peek_byte()?,
                b'/' | b'.' | b'+' | b'-' | b'0'..=b'9' | b'[' | b'<' | b'('
            ) {
                // See issue 994. In all sane scenarios, if the next byte is a number
                // it has to be an operand (a number). However, it's possible that
                // the number is followed by a regular character, in which case it
                // should behave more like an operator (even though there exists
                // no operator that starts with a number). In order to preserve
                // similar behavior to Acrobat and Chromium, we try to consume
                // such an operator and then simply skip it.
                if let Some(object) = self.reader.read_without_context::<Object<'_>>() {
                    self.stack.push(object);
                } else if self.reader.read_without_context::<Operator>().is_some() {
                    self.stack.clear();
                } else {
                    return None;
                }
            } else {
                let operator = match self.reader.read_without_context::<Operator>() {
                    Some(o) => o,
                    None => {
                        warn!("failed to read operator in content stream");

                        self.reader.jump_to_end();
                        return None;
                    }
                };

                // Inline images need special casing...
                if operator.as_ref() == b"BI" {
                    // The ID operator will already be consumed by this.
                    let inline_dict = self.reader.read_without_context::<InlineImageDict<'_>>()?;
                    let dict = inline_dict.get_dict().clone();

                    // One whitespace after "ID".
                    self.reader.read_white_space()?;

                    let stream_data = self.reader.tail()?;
                    let start_offset = self.reader.offset();

                    // Fast path for ASCII85-encoded inline images: scan for `~>` EI
                    // instead of using the heuristic EI scanner, which fails for A85
                    // data because all bytes are printable ASCII (no binary sentinel).
                    if let Some((image_end, advance)) =
                        find_a85_inline_image_end(stream_data, &dict)
                    {
                        let image_data = &stream_data[..image_end];
                        let stream = Stream::new(image_data, dict.clone());
                        self.stack.push(Object::Stream(stream));
                        self.reader.read_bytes(advance)?;
                        self.reader.skip_white_spaces();

                        return Some(Instruction {
                            operands: core::mem::take(&mut self.stack),
                            operator,
                        });
                    }

                    // Fast path for unfiltered (raw binary) inline images: the image
                    // data size is exactly H × ceil(W × BPC × components / 8) bytes,
                    // so we can seek past the data directly and look for EI there.
                    // This avoids the heuristic EI scanner, which incorrectly skips
                    // valid EI markers when subsequent images contain binary data.
                    if let Some(raw_size) = compute_raw_inline_image_size(&dict)
                        && stream_data.len() >= raw_size
                    {
                        let image_data = &stream_data[..raw_size];
                        let stream = Stream::new(image_data, dict.clone());
                        self.stack.push(Object::Stream(stream));
                        // Skip past the raw data, then skip any whitespace before EI.
                        self.reader.read_bytes(raw_size)?;
                        self.reader.skip_white_spaces();
                        // Consume the EI operator (2 bytes).
                        let _ = self.reader.read_bytes(2);
                        self.reader.skip_white_spaces();

                        return Some(Instruction {
                            operands: core::mem::take(&mut self.stack),
                            operator,
                        });
                    }

                    'outer: while let Some(bytes) = self.reader.peek_bytes(2) {
                        if bytes == b"EI" {
                            let end_offset = self.reader.offset() - start_offset;
                            let image_data = &stream_data[..end_offset];

                            let stream = Stream::new(image_data, dict.clone());

                            // Note that there is a possibility that the encoded stream data
                            // contains the "EI" operator as part of the data, in which case we
                            // cannot confidently know whether we have hit the actual end of the
                            // stream. See also <https://github.com/pdf-association/pdf-issues/issues/543>
                            // PDF 2.0 does have a `/Length` attribute we can read, but since it's relatively
                            // new we don't bother trying to read it.
                            let tail = &self.reader.tail()?[2..];
                            let mut find_reader = Reader::new(tail);

                            while let Some(bytes) = find_reader.peek_bytes(2) {
                                if bytes == b"EI" {
                                    let analyze_data = &tail;

                                    // If there is any binary data in-between, we for sure
                                    // have not reached the end.
                                    if analyze_data.iter().any(|c| !c.is_ascii()) {
                                        self.reader.read_bytes(2)?;
                                        continue 'outer;
                                    }

                                    // Otherwise, the only possibility that we reached an
                                    // "EI", even though the previous one was valid, is
                                    // that it's part of a string in the content
                                    // stream that follows the inline image. Therefore,
                                    // it should be valid to interpret `tail` as a content
                                    // stream and there should be at least one text-related
                                    // operator that can be parsed correctly.

                                    let iter = TypedIter::new(tail);
                                    let mut found = false;

                                    for (counter, op) in iter.enumerate() {
                                        // If we have read more than 20 valid operators, it should be
                                        // safe to assume that we are in a content stream, so abort
                                        // early. The only situation where this could reasonably
                                        // be violated is if we have 20 subsequent instances of
                                        // q/Q in the image data, which seems very unlikely.
                                        if counter >= 20 {
                                            found = true;
                                            break;
                                        }

                                        if matches!(
                                            op,
                                            TypedInstruction::NextLineAndShowText(_)
                                                | TypedInstruction::ShowText(_)
                                                | TypedInstruction::ShowTexts(_)
                                                | TypedInstruction::ShowTextWithParameters(_)
                                        ) {
                                            // Now it should be safe to assume that the
                                            // previous `EI` was the correct one.
                                            found = true;
                                            break;
                                        }
                                    }

                                    if !found {
                                        // Seems like the data in-between is not a valid content
                                        // stream, so we are likely still within the image data.
                                        self.reader.read_bytes(2)?;
                                        continue 'outer;
                                    }
                                } else if bytes == b"BI" {
                                    // Possibly another inline image, if so, the previously found "EI"
                                    // is indeed the end of data.
                                    let mut cloned = find_reader.clone();
                                    cloned.read_bytes(2)?;
                                    if cloned
                                        .read_without_context::<InlineImageDict<'_>>()
                                        .is_some()
                                    {
                                        break;
                                    }
                                }

                                find_reader.read_byte()?;
                            }

                            self.stack.push(Object::Stream(stream));

                            self.reader.read_bytes(2)?;
                            self.reader.skip_white_spaces();

                            break;
                        } else {
                            self.reader.read_byte()?;
                        }
                    }
                }

                return Some(Instruction {
                    operands: core::mem::take(&mut self.stack),
                    operator,
                });
            }

            self.reader.skip_white_spaces_and_comments();
        }

        None
    }
}

/// An iterator over PDF content streams that provide access to the instructions
/// in a typed fashion.
///
/// It wraps an [`UntypedIter`] and converts each raw instruction into a
/// [`TypedInstruction`].
#[derive(Clone)]
pub struct TypedIter<'a> {
    // Source of the raw instructions that get converted.
    untyped: UntypedIter<'a>,
}

impl<'a> TypedIter<'a> {
    /// Create a new typed iterator over the given content stream data.
    pub fn new(data: &'a [u8]) -> Self {
        Self::from_untyped(UntypedIter::new(data))
    }

    /// Wrap an already existing untyped iterator.
    pub(crate) fn from_untyped(untyped: UntypedIter<'a>) -> Self {
        Self { untyped }
    }
}

impl<'a> Iterator for TypedIter<'a> {
    type Item = TypedInstruction<'a>;

    /// Yields the next instruction in typed form.
    ///
    /// An instruction that cannot be converted ends the iteration if its operator is
    /// a path-construction operator; otherwise it is reported as
    /// `TypedInstruction::Fallback`.
    fn next(&mut self) -> Option<Self::Item> {
        let instruction = self.untyped.next()?;

        if let Some(typed) = TypedInstruction::dispatch(&instruction) {
            return Some(typed);
        }

        // Reaching this point means the content stream is invalid. For a path-drawing
        // operator we abort completely, since continuing might end up drawing random
        // stuff. For any other operator it could be worth it to just skip it and keep
        // attempting to read the remaining content operators.
        let name: &[u8] = instruction.operator.0.deref();
        let is_path_operator = matches!(name, b"m" | b"l" | b"c" | b"v" | b"y" | b"h" | b"re");

        if is_path_operator {
            return None;
        }

        Some(TypedInstruction::Fallback(instruction.operator.clone()))
    }
}

/// An instruction (= operator and its operands) in a content stream.
///
/// Instructions are produced by [`UntypedIter`].
pub struct Instruction<'a> {
    /// The stack containing the operands.
    pub operands: Stack<'a>,
    /// The actual operator.
    pub operator: Operator,
}

impl<'a> Instruction<'a> {
    /// Consumes the instruction and returns an iterator over its operands.
    pub fn operands(self) -> OperandIterator<'a> {
        let Self { operands, .. } = self;
        OperandIterator::new(operands)
    }
}

/// A stack holding the arguments of an operator.
///
/// Up to `OPERANDS_THRESHOLD` operands are stored inline by the underlying
/// `SmallVec`.
#[derive(Debug, Clone, PartialEq)]
pub struct Stack<'a>(SmallVec<[Object<'a>; OPERANDS_THRESHOLD]>);

impl<'a> Default for Stack<'a> {
    fn default() -> Self {
        Self::new()
    }
}

impl<'a> Stack<'a> {
    /// Create a new stack that holds no operands.
    pub fn new() -> Self {
        Self(SmallVec::new())
    }

    /// Appends an operand to the top of the stack.
    fn push(&mut self, operand: Object<'a>) {
        let Self(items) = self;
        items.push(operand);
    }

    /// Removes all operands.
    fn clear(&mut self) {
        let Self(items) = self;
        items.clear();
    }

    /// The number of operands currently on the stack.
    fn len(&self) -> usize {
        let Self(items) = self;
        items.len()
    }

    /// Returns a copy of the operand at `index`, cast to `T`.
    ///
    /// Yields `None` if the index is out of bounds or the cast fails.
    fn get<T>(&self, index: usize) -> Option<T>
    where
        T: ObjectLike<'a>,
    {
        let operand = self.0.get(index)?;
        operand.clone().cast::<T>()
    }

    /// Returns copies of all operands, each cast to `T`.
    ///
    /// Yields `None` as soon as a single operand cannot be cast.
    fn get_all<T>(&self) -> Option<SmallVec<[T; OPERANDS_THRESHOLD]>>
    where
        T: ObjectLike<'a>,
    {
        self.0
            .iter()
            .map(|operand| operand.clone().cast::<T>())
            .collect()
    }
}

/// An iterator over the operands of an operator.
///
/// Operands are yielded in stack order, starting at index 0.
pub struct OperandIterator<'a> {
    // The operands being iterated over.
    stack: Stack<'a>,
    // Index of the operand that is yielded next.
    cur_index: usize,
}

impl<'a> OperandIterator<'a> {
    /// Creates an iterator positioned at the first operand of `stack`.
    fn new(stack: Stack<'a>) -> Self {
        Self { cur_index: 0, stack }
    }
}

impl<'a> Iterator for OperandIterator<'a> {
    type Item = Object<'a>;

    /// Yields the operand at the current position and moves on to the next one.
    fn next(&mut self) -> Option<Self::Item> {
        // The position only advances when an operand was actually found.
        let operand = self.stack.get::<Object<'a>>(self.cur_index)?;
        self.cur_index += 1;

        Some(operand)
    }
}

/// Implemented by the typed operator structs.
///
/// Every implementor converts into a [`TypedInstruction`] and can be attempted
/// to be extracted from one again.
pub(crate) trait OperatorTrait<'a>
where
    Self: Sized + Into<TypedInstruction<'a>> + TryFrom<TypedInstruction<'a>>,
{
    /// The operator string this type is associated with.
    const OPERATOR: &'static str;

    /// Builds the typed operator from the operands on `stack`, or returns `None`
    /// if that is not possible.
    fn from_stack(stack: &Stack<'a>) -> Option<Self>;
}

// Helper macros that generate the `OperatorTrait`, `From` and `TryFrom`
// implementations for the typed operator structs.
mod macros {
    // Core macro used by all `opN` macros below.
    //
    // Arguments: the operator type (optionally with a lifetime), the operator
    // string, the expected number of operands and a closure that builds the type
    // from the stack. An expected count of `u8::MAX` disables the operand count
    // check. A wrong operand count is only logged; the conversion is attempted
    // anyway.
    macro_rules! op_impl {
        ($t:ident $(<$l:lifetime>),*, $e:expr, $n:expr, $body:expr) => {
            impl<'a> OperatorTrait<'a> for $t$(<$l>),* {
                const OPERATOR: &'static str = $e;

                fn from_stack(stack: &Stack<'a>) -> Option<Self> {
                    if $n != u8::MAX as usize {
                        if stack.len() != $n {
                            warn!("wrong stack length {} for operator {}, expected {}", stack.len(), Self::OPERATOR, $n);
                        }
                    }

                    $body(stack).or_else(|| {
                        warn!("failed to convert operands for operator {}", Self::OPERATOR);

                        None
                    })
                }
            }

            impl<'a> From<$t$(<$l>),*> for TypedInstruction<'a> {
                fn from(value: $t$(<$l>),*) -> Self {
                    TypedInstruction::$t(value)
                }
            }

            impl<'a> TryFrom<TypedInstruction<'a>> for $t$(<$l>),* {
                type Error = ();

                fn try_from(value: TypedInstruction<'a>) -> core::result::Result<Self, Self::Error> {
                    match value {
                        TypedInstruction::$t(e) => Ok(e),
                        _ => Err(())
                    }
                }
            }
        };
    }

    // The `shift` parameter will always be 0 in valid PDFs. The purpose of the parameter is
    // so that in case there are garbage operands in the content stream, we prefer to use
    // the operands that are closer to the operator instead of the values at the bottom
    // of the stack.

    // Operator without operands.
    macro_rules! op0 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 0, |_| Some(Self));
        }
    }

    // Operator with one operand (the topmost stack entry).
    macro_rules! op1 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 1, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(1);
                Some(Self(stack.get(0 + shift)?))
            });
        }
    }

    // Operator that takes all operands on the stack, however many there are.
    macro_rules! op_all {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, u8::MAX as usize, |stack: &Stack<'a>|
            Some(Self(stack.get_all()?)));
        }
    }

    // Operator with two operands (the two topmost stack entries).
    macro_rules! op2 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 2, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(2);
                Some(Self(stack.get(0 + shift)?, stack.get(1 + shift)?))
            });
        }
    }

    // Operator with three operands (the three topmost stack entries).
    macro_rules! op3 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 3, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(3);
                Some(Self(stack.get(0 + shift)?, stack.get(1 + shift)?,
                stack.get(2 + shift)?))
            });
        }
    }

    // Operator with four operands (the four topmost stack entries).
    macro_rules! op4 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 4, |stack: &Stack<'a>| {
               let shift = stack.len().saturating_sub(4);
            Some(Self(stack.get(0 + shift)?, stack.get(1 + shift)?,
            stack.get(2 + shift)?, stack.get(3 + shift)?))
            });
        }
    }

    // Operator with six operands (the six topmost stack entries).
    macro_rules! op6 {
        ($t:ident $(<$l:lifetime>),*, $e:expr) => {
            crate::content::macros::op_impl!($t$(<$l>),*, $e, 6, |stack: &Stack<'a>| {
                let shift = stack.len().saturating_sub(6);
            Some(Self(stack.get(0 + shift)?, stack.get(1 + shift)?,
            stack.get(2 + shift)?, stack.get(3 + shift)?,
            stack.get(4 + shift)?, stack.get(5 + shift)?))
            });
        }
    }

    // Re-export the macros so that they can be used elsewhere in the crate.
    pub(crate) use op_all;
    pub(crate) use op_impl;
    pub(crate) use op0;
    pub(crate) use op1;
    pub(crate) use op2;
    pub(crate) use op3;
    pub(crate) use op4;
    pub(crate) use op6;
}