use crate::error::{Error, ParseError, Result, XrefError};
use crate::object::{Dictionary, Object, ObjectId, Stream, StringFormat};
use crate::xref::{Xref, XrefEntry};
/// A read position over a borrowed byte buffer.
///
/// `pos` is not validated against `buf.len()`; the accessor methods treat a
/// position at or past the end of `buf` as end of input.
pub(crate) struct Cursor<'a> {
/// The bytes being read.
pub(crate) buf: &'a [u8],
/// Index into `buf` of the next byte to be read.
pub(crate) pos: usize,
}
impl<'a> Cursor<'a> {
    /// Creates a cursor positioned at the first byte of `buf`.
    pub fn new(buf: &'a [u8]) -> Self {
        Self { buf, pos: 0 }
    }

    /// Creates a cursor over `buf` positioned at `pos`.
    ///
    /// `pos` is stored as given, even when it lies past the end of `buf`.
    pub fn at(buf: &'a [u8], pos: usize) -> Self {
        Self { buf, pos }
    }

    /// Returns the unread bytes (empty when the cursor is at or past the end).
    #[inline]
    pub fn rest(&self) -> &'a [u8] {
        &self.buf[self.pos.min(self.buf.len())..]
    }

    /// Returns `true` when there are no unread bytes.
    #[inline]
    pub fn at_eof(&self) -> bool {
        self.pos >= self.buf.len()
    }

    /// Returns the next byte without consuming it, or `None` at end of input.
    #[inline]
    pub fn peek(&self) -> Option<u8> {
        self.buf.get(self.pos).copied()
    }

    /// Consumes and returns the next byte, or returns `None` (without moving)
    /// at end of input.
    #[inline]
    pub fn bump(&mut self) -> Option<u8> {
        let b = self.peek()?;
        self.pos += 1;
        Some(b)
    }

    /// Moves the cursor forward by `n` bytes, stopping at the end of the buffer.
    #[inline]
    pub fn advance(&mut self, n: usize) {
        // Saturate so an oversized `n` clamps to the end instead of overflowing.
        self.pos = self.pos.saturating_add(n).min(self.buf.len());
    }

    /// Returns `true` if the unread bytes begin with `prefix`.
    pub fn starts_with(&self, prefix: &[u8]) -> bool {
        self.rest().starts_with(prefix)
    }

    /// Consumes `prefix` if the unread bytes begin with it.
    ///
    /// Returns whether the prefix was consumed; the cursor does not move on `false`.
    pub fn eat(&mut self, prefix: &[u8]) -> bool {
        let matched = self.starts_with(prefix);
        if matched {
            self.advance(prefix.len());
        }
        matched
    }

    /// Skips any run of whitespace bytes (per [`is_ws`]) and `%` comments.
    ///
    /// A comment runs from `%` up to and including the first `\n` or `\r`
    /// (or to end of input).
    pub fn skip_ws_and_comments(&mut self) {
        loop {
            match self.peek() {
                Some(b) if is_ws(b) => self.advance(1),
                Some(b'%') => {
                    // Consume the comment, including its terminating EOL byte.
                    while let Some(b) = self.bump() {
                        if b == b'\n' || b == b'\r' {
                            break;
                        }
                    }
                }
                _ => break,
            }
        }
    }
}
/// Returns `true` for the six bytes this parser treats as whitespace:
/// NUL, horizontal tab, line feed, form feed, carriage return and space.
#[inline]
pub(crate) fn is_ws(b: u8) -> bool {
    match b {
        0x00 | 0x09 | 0x0A | 0x0C | 0x0D | 0x20 => true,
        _ => false,
    }
}
/// Returns `true` if `b` is one of the delimiter bytes
/// `(`, `)`, `<`, `>`, `[`, `]`, `{`, `}`, `/` or `%`.
#[inline]
pub(crate) fn is_delim(b: u8) -> bool {
    match b {
        b'(' | b')' => true,
        b'<' | b'>' => true,
        b'[' | b']' => true,
        b'{' | b'}' => true,
        b'/' | b'%' => true,
        _ => false,
    }
}
/// Returns `true` if `b` terminates a token, i.e. it is either a delimiter
/// ([`is_delim`]) or whitespace ([`is_ws`]).
#[inline]
pub(crate) fn is_token_end(b: u8) -> bool {
    is_delim(b) || is_ws(b)
}
/// Locates the `%PDF-` marker and extracts the version text that follows it.
///
/// The marker is searched for within the first 4096 bytes of `buf`. The version
/// is every byte after the marker up to the first whitespace byte (or the end of
/// the buffer) and must be 1 to 8 bytes of valid UTF-8.
///
/// Returns `(version, marker_offset)` where `marker_offset` is the index of the
/// `%` of `%PDF-` in `buf`.
///
/// # Errors
///
/// Returns `ParseError::InvalidFileHeader` when the marker is missing or the
/// version text is empty, too long, or not UTF-8.
pub fn parse_header(buf: &[u8]) -> Result<(String, usize)> {
    const SEARCH_LIMIT: usize = 4096;
    const MARKER: &[u8] = b"%PDF-";
    const MAX_VERSION_LEN: usize = 8;

    let head = &buf[..SEARCH_LIMIT.min(buf.len())];
    let Some(marker_at) = head.windows(MARKER.len()).position(|w| w == MARKER) else {
        return Err(ParseError::InvalidFileHeader.into());
    };

    let tail = &buf[marker_at + MARKER.len()..];
    let version_len = tail.iter().position(|&b| is_ws(b)).unwrap_or(tail.len());
    if !(1..=MAX_VERSION_LEN).contains(&version_len) {
        return Err(ParseError::InvalidFileHeader.into());
    }

    match std::str::from_utf8(&tail[..version_len]) {
        Ok(version) => Ok((version.to_string(), marker_at)),
        Err(_) => Err(ParseError::InvalidFileHeader.into()),
    }
}
pub fn parse_startxref(buf: &[u8]) -> Result<usize> {
const WINDOW: usize = 4096;
let from = buf.len().saturating_sub(WINDOW);
let needle = b"startxref";
let tail = &buf[from..];
let last = tail
.windows(needle.len())
.enumerate()
.filter(|(_, w)| *w == needle)
.map(|(i, _)| i)
.last()
.ok_or(ParseError::MissingStartXref)?;
let mut c = Cursor::at(buf, from + last + needle.len());
c.skip_ws_and_comments();
let n = read_integer(&mut c)?;
if n < 0 || n as usize > buf.len() {
return Err(XrefError::Start.into());
}
Ok(n as usize)
}
/// Reads an optionally signed decimal integer at the cursor.
///
/// Accepts one leading `+` or `-` followed by one or more ASCII digits and
/// leaves the cursor just past the last digit.
///
/// # Errors
///
/// Returns `ParseError::Unexpected` (with `offset` set to where the token
/// started) when no digit is present or the value does not fit in `i64`.
/// The cursor is not rewound on error.
pub fn read_integer(c: &mut Cursor<'_>) -> Result<i64> {
    let start = c.pos;
    if matches!(c.peek(), Some(b'+') | Some(b'-')) {
        c.bump();
    }
    let digits_start = c.pos;
    while matches!(c.peek(), Some(b) if b.is_ascii_digit()) {
        c.bump();
    }
    if c.pos == digits_start {
        return Err(ParseError::Unexpected {
            offset: start,
            expected: "integer",
        }
        .into());
    }
    // Parse sign and digits together: parsing the magnitude first and negating
    // afterwards cannot represent `i64::MIN`.
    let token = std::str::from_utf8(&c.buf[start..c.pos]).map_err(|_| ParseError::Unexpected {
        offset: start,
        expected: "integer-utf8",
    })?;
    let value: i64 = token.parse().map_err(|_| ParseError::Unexpected {
        offset: start,
        expected: "integer-parse",
    })?;
    Ok(value)
}
/// Reads a numeric token at the cursor and returns it as an [`Object`].
///
/// The token is an optional `+`/`-` sign followed by ASCII digits containing at
/// most one `.`. A token with a `.` yields `Object::Real` (parsed as `f32`);
/// otherwise it yields `Object::Integer` (parsed as `i64`).
///
/// # Errors
///
/// Returns `ParseError::Unexpected` (with `offset` set to where the token
/// started) if the token has no digit or fails to parse. The cursor is not
/// rewound on error.
pub fn read_number(c: &mut Cursor<'_>) -> Result<Object> {
    let token_start = c.pos;
    if let Some(b'+' | b'-') = c.peek() {
        c.bump();
    }

    let mut has_digit = false;
    let mut has_dot = false;
    loop {
        match c.peek() {
            Some(b'0'..=b'9') => has_digit = true,
            Some(b'.') if !has_dot => has_dot = true,
            _ => break,
        }
        c.bump();
    }

    if !has_digit {
        return Err(ParseError::Unexpected {
            offset: token_start,
            expected: "number",
        }
        .into());
    }

    let text = match std::str::from_utf8(&c.buf[token_start..c.pos]) {
        Ok(text) => text,
        Err(_) => {
            return Err(ParseError::Unexpected {
                offset: token_start,
                expected: "number-utf8",
            }
            .into())
        }
    };

    if has_dot {
        match text.parse::<f32>() {
            Ok(value) => Ok(Object::Real(value)),
            Err(_) => Err(ParseError::Unexpected {
                offset: token_start,
                expected: "real-parse",
            }
            .into()),
        }
    } else {
        match text.parse::<i64>() {
            Ok(value) => Ok(Object::Integer(value)),
            Err(_) => Err(ParseError::Unexpected {
                offset: token_start,
                expected: "int-parse",
            }
            .into()),
        }
    }
}
/// Reads a name token (`/...`) at the cursor and returns its decoded bytes,
/// without the leading `/`.
///
/// Bytes are taken up to (not including) the first token-ending byte or end of
/// input. A `#` is followed by two hex digits that are decoded into one byte.
///
/// # Errors
///
/// Returns `ParseError::Unexpected` if the cursor is not at `/`, or if a `#` is
/// not followed by two hex digits.
pub fn read_name(c: &mut Cursor<'_>) -> Result<Vec<u8>> {
    let lead = c.bump();
    if lead != Some(b'/') {
        return Err(ParseError::Unexpected {
            offset: c.pos,
            expected: "name '/'",
        }
        .into());
    }

    let mut name = Vec::with_capacity(16);
    loop {
        let byte = match c.peek() {
            Some(b) if !is_token_end(b) => b,
            _ => break,
        };
        c.bump();

        if byte != b'#' {
            name.push(byte);
            continue;
        }

        // `#xx` escape: take both digit bytes first, then validate them.
        let Some(hi) = c.bump() else {
            return Err(ParseError::Unexpected {
                offset: c.pos,
                expected: "name-hex-hi",
            }
            .into());
        };
        let Some(lo) = c.bump() else {
            return Err(ParseError::Unexpected {
                offset: c.pos,
                expected: "name-hex-lo",
            }
            .into());
        };
        let Some(hi_nibble) = hex_value(hi) else {
            return Err(ParseError::Unexpected {
                offset: c.pos,
                expected: "name-hex-hi-value",
            }
            .into());
        };
        let Some(lo_nibble) = hex_value(lo) else {
            return Err(ParseError::Unexpected {
                offset: c.pos,
                expected: "name-hex-lo-value",
            }
            .into());
        };
        name.push((hi_nibble << 4) | lo_nibble);
    }
    Ok(name)
}
/// Returns the value (0–15) of an ASCII hexadecimal digit in either case, or
/// `None` if `b` is not a hex digit.
#[inline]
fn hex_value(b: u8) -> Option<u8> {
    // `to_digit(16)` accepts exactly 0-9, a-f and A-F; the result is below 16.
    char::from(b).to_digit(16).map(|digit| digit as u8)
}
/// Reads a parenthesised literal string at the cursor and returns its decoded
/// bytes, without the outer parentheses.
///
/// Balanced unescaped parentheses are kept in the output. Backslash escapes:
///
/// * `\n`, `\r`, `\t`, `\b`, `\f` produce the corresponding control byte;
/// * `\(`, `\)`, `\\` produce the literal character;
/// * a backslash followed by an end-of-line (`\n`, `\r` or `\r\n`) is a line
///   continuation and produces nothing;
/// * `\d`, `\dd`, `\ddd` with octal digits produce one byte (only the low eight
///   bits are kept);
/// * for any other byte the backslash is dropped and the byte is kept.
///
/// # Errors
///
/// Returns `ParseError::Unexpected` if the cursor is not at `(`, or if input
/// ends before the closing `)` or in the middle of an escape.
pub fn read_literal_string(c: &mut Cursor<'_>) -> Result<Vec<u8>> {
    if c.bump() != Some(b'(') {
        return Err(ParseError::Unexpected {
            offset: c.pos,
            expected: "literal-string '('",
        }
        .into());
    }
    let mut out = Vec::with_capacity(32);
    let mut depth = 1usize;
    while let Some(b) = c.bump() {
        match b {
            b'(' => {
                depth += 1;
                out.push(b);
            }
            b')' => {
                depth -= 1;
                if depth == 0 {
                    return Ok(out);
                }
                out.push(b);
            }
            b'\\' => {
                let next = c.bump().ok_or(ParseError::Unexpected {
                    offset: c.pos,
                    expected: "string-escape",
                })?;
                match next {
                    b'n' => out.push(b'\n'),
                    b'r' => out.push(b'\r'),
                    b't' => out.push(b'\t'),
                    b'b' => out.push(8),
                    b'f' => out.push(12),
                    b'(' | b')' | b'\\' => out.push(next),
                    b'\n' => {}
                    b'\r' => {
                        if c.peek() == Some(b'\n') {
                            c.bump();
                        }
                    }
                    // Only 0-7 may start an octal escape; `\8` and `\9` fall
                    // through to the "drop the backslash" arm below.
                    first @ b'0'..=b'7' => {
                        let mut value = u16::from(first - b'0');
                        for _ in 0..2 {
                            match c.peek() {
                                Some(digit @ b'0'..=b'7') => {
                                    c.bump();
                                    value = value * 8 + u16::from(digit - b'0');
                                }
                                _ => break,
                            }
                        }
                        // Three octal digits can exceed 255; keep the low byte.
                        out.push((value & 0xff) as u8);
                    }
                    other => out.push(other),
                }
            }
            _ => out.push(b),
        }
    }
    Err(ParseError::Unexpected {
        offset: c.pos,
        expected: "literal-string-close",
    }
    .into())
}
/// Reads a hexadecimal string (`<...>`) at the cursor and returns its decoded
/// bytes.
///
/// Whitespace between digits is ignored. If the number of digits is odd, the
/// final digit is used as the high nibble of a last byte whose low nibble is 0.
///
/// # Errors
///
/// Returns `ParseError::Unexpected` if the cursor is not at `<`, if a byte that
/// is neither whitespace nor a hex digit appears, or if input ends before `>`.
pub fn read_hex_string(c: &mut Cursor<'_>) -> Result<Vec<u8>> {
    if c.bump() != Some(b'<') {
        return Err(ParseError::Unexpected {
            offset: c.pos,
            expected: "hex-string '<'",
        }
        .into());
    }

    let mut bytes = Vec::with_capacity(32);
    // High nibble waiting for its partner, if any.
    let mut pending: Option<u8> = None;
    loop {
        let Some(b) = c.bump() else {
            return Err(ParseError::Unexpected {
                offset: c.pos,
                expected: "hex-string-close",
            }
            .into());
        };
        match b {
            b'>' => {
                if let Some(high) = pending {
                    bytes.push(high << 4);
                }
                return Ok(bytes);
            }
            ws if is_ws(ws) => {}
            digit => {
                let nibble = match hex_value(digit) {
                    Some(nibble) => nibble,
                    None => {
                        return Err(ParseError::Unexpected {
                            offset: c.pos,
                            expected: "hex-string-digit",
                        }
                        .into())
                    }
                };
                match pending.take() {
                    Some(high) => bytes.push((high << 4) | nibble),
                    None => pending = Some(nibble),
                }
            }
        }
    }
}
/// Reads one object at the cursor after skipping whitespace and comments.
///
/// Dispatch is on the first byte: `/` name, `(` literal string, `[` array,
/// `<<` dictionary or stream, `<` hex string, `t`/`f`/`n` for the keywords
/// `true`/`false`/`null`, and a digit, sign or `.` for a number or reference.
///
/// # Errors
///
/// Returns `ParseError::Unexpected` when the first byte starts none of the
/// above, a keyword is misspelled, or input is exhausted; otherwise propagates
/// the error of the delegated reader.
pub fn read_object(c: &mut Cursor<'_>) -> Result<Object> {
    /// Consumes `word` and yields `value`, or reports `word` as the expectation.
    fn keyword(c: &mut Cursor<'_>, word: &'static str, value: Object) -> Result<Object> {
        let at = c.pos;
        if c.eat(word.as_bytes()) {
            Ok(value)
        } else {
            Err(ParseError::Unexpected {
                offset: at,
                expected: word,
            }
            .into())
        }
    }

    c.skip_ws_and_comments();
    let here = c.pos;
    let Some(lead) = c.peek() else {
        return Err(ParseError::Unexpected {
            offset: here,
            expected: "object (eof)",
        }
        .into());
    };

    match lead {
        b'/' => read_name(c).map(Object::Name),
        b'(' => {
            let bytes = read_literal_string(c)?;
            Ok(Object::String(bytes, StringFormat::Literal))
        }
        b'[' => read_array(c),
        // `<<` opens a dictionary; a single `<` opens a hex string.
        b'<' if c.buf.get(c.pos + 1) == Some(&b'<') => read_dictionary_or_stream(c),
        b'<' => {
            let bytes = read_hex_string(c)?;
            Ok(Object::String(bytes, StringFormat::Hexadecimal))
        }
        b't' => keyword(c, "true", Object::Boolean(true)),
        b'f' => keyword(c, "false", Object::Boolean(false)),
        b'n' => keyword(c, "null", Object::Null),
        b'0'..=b'9' | b'+' | b'-' | b'.' => try_read_reference_or_number(c),
        _ => Err(ParseError::Unexpected {
            offset: here,
            expected: "object-start",
        }
        .into()),
    }
}
fn try_read_reference_or_number(c: &mut Cursor<'_>) -> Result<Object> {
let save = c.pos;
let first = read_number(c)?;
let after_first = c.pos;
c.skip_ws_and_comments();
let after_first_ws = c.pos;
if matches!(c.peek(), Some(b) if b.is_ascii_digit() || b == b'+' || b == b'-') {
let second = read_number(&mut Cursor::at(c.buf, c.pos));
if let Ok(Object::Integer(gen)) = second {
let mut probe = Cursor::at(c.buf, c.pos);
let _ = read_number(&mut probe);
let after_second = probe.pos;
probe.skip_ws_and_comments();
if probe.starts_with(b"R")
&& probe
.buf
.get(probe.pos + 1)
.map(|&b| is_token_end(b))
.unwrap_or(true)
{
let num = match first {
Object::Integer(n) if n >= 0 => n as u32,
_ => {
c.pos = after_first;
return Ok(first);
}
};
let gen = if (0..=u16::MAX as i64).contains(&gen) {
gen as u16
} else {
c.pos = after_first;
return Ok(first);
};
c.pos = after_second;
c.skip_ws_and_comments();
c.bump(); return Ok(Object::Reference((num, gen)));
}
}
}
c.pos = after_first_ws.min(after_first);
c.pos = after_first;
let _ = save;
Ok(first)
}
/// Reads an array (`[ ... ]`) at the cursor and returns it as `Object::Array`.
///
/// Elements are read with [`read_object`]; whitespace and comments between
/// them are skipped.
///
/// # Errors
///
/// Returns `ParseError::Unexpected` if the cursor is not at `[` or input ends
/// before `]`; propagates any element error.
pub fn read_array(c: &mut Cursor<'_>) -> Result<Object> {
    if c.bump() != Some(b'[') {
        return Err(ParseError::Unexpected {
            offset: c.pos,
            expected: "array '['",
        }
        .into());
    }

    let mut elements = Vec::new();
    loop {
        c.skip_ws_and_comments();
        if c.at_eof() {
            return Err(ParseError::Unexpected {
                offset: c.pos,
                expected: "array ']' (eof)",
            }
            .into());
        }
        if c.eat(b"]") {
            return Ok(Object::Array(elements));
        }
        elements.push(read_object(c)?);
    }
}
/// Reads a dictionary at the cursor and, if the `stream` keyword follows it,
/// returns a stream header instead.
///
/// For a stream, one end-of-line (`\n`, `\r\n` or a lone `\r`) after the
/// keyword is consumed, and the returned `Stream` has empty `content` with
/// `start_position` set to the cursor position at that point. The stream data
/// itself is not read here.
///
/// # Errors
///
/// Propagates any error from [`read_dictionary`].
pub fn read_dictionary_or_stream(c: &mut Cursor<'_>) -> Result<Object> {
    let dict = read_dictionary(c)?;
    c.skip_ws_and_comments();
    if !c.eat(b"stream") {
        return Ok(Object::Dictionary(dict));
    }

    // Consume at most one EOL marker after the keyword.
    if !c.eat(b"\r\n") && !c.eat(b"\n") {
        let _ = c.eat(b"\r");
    }

    Ok(Object::Stream(Stream {
        dict,
        content: Vec::new(),
        start_position: Some(c.pos),
    }))
}
/// Reads a dictionary (`<< /Key value ... >>`) at the cursor.
///
/// Each key is read with [`read_name`] and each value with [`read_object`];
/// entries are stored via `Dictionary::set` in the order they appear.
///
/// # Errors
///
/// Returns `ParseError::Unexpected` if the cursor is not at `<<`; propagates
/// any key or value error (including running out of input before `>>`).
pub fn read_dictionary(c: &mut Cursor<'_>) -> Result<Dictionary> {
    if !c.eat(b"<<") {
        return Err(ParseError::Unexpected {
            offset: c.pos,
            expected: "dictionary '<<'",
        }
        .into());
    }

    let mut entries = Dictionary::new();
    loop {
        c.skip_ws_and_comments();
        if c.eat(b">>") {
            break;
        }
        let key = read_name(c)?;
        c.skip_ws_and_comments();
        let value = read_object(c)?;
        entries.set(key, value);
    }
    Ok(entries)
}
pub fn read_indirect_object_at(buf: &[u8], offset: usize) -> Result<(ObjectId, Object)> {
let mut c = Cursor::at(buf, offset);
c.skip_ws_and_comments();
let num = read_integer(&mut c)?;
if num < 0 {
return Err(ParseError::Unexpected {
offset,
expected: "non-negative object number",
}
.into());
}
c.skip_ws_and_comments();
let gen = read_integer(&mut c)?;
if !(0..=u16::MAX as i64).contains(&gen) {
return Err(ParseError::Unexpected {
offset,
expected: "u16 generation",
}
.into());
}
c.skip_ws_and_comments();
if !c.eat(b"obj") {
return Err(ParseError::Unexpected {
offset: c.pos,
expected: "'obj' keyword",
}
.into());
}
c.skip_ws_and_comments();
let body = read_object(&mut c)?;
Ok(((num as u32, gen as u16), body))
}
/// Parses the cross-reference section located at `offset` in `buf`.
///
/// If the data at `offset` (after whitespace and comments) starts with the
/// `xref` keyword, a classical table followed by a `trailer` dictionary is
/// parsed; otherwise the data is parsed as a cross-reference stream object.
///
/// Returns the cross-reference table together with the trailer dictionary (for
/// a stream, the stream's own dictionary).
///
/// # Errors
///
/// Returns `XrefError::Start` if `offset` is past the end of `buf`, and
/// `ParseError::Unexpected` if the `trailer` keyword is missing after a
/// classical table; otherwise propagates errors from the delegated parsers.
pub fn parse_xref_and_trailer(buf: &[u8], offset: usize) -> Result<(Xref, Dictionary)> {
    if offset > buf.len() {
        return Err(XrefError::Start.into());
    }

    let mut cur = Cursor::at(buf, offset);
    cur.skip_ws_and_comments();
    if !cur.eat(b"xref") {
        return parse_xref_stream(buf, offset);
    }

    let table = parse_classical_xref(&mut cur)?;
    cur.skip_ws_and_comments();
    if cur.eat(b"trailer") {
        cur.skip_ws_and_comments();
        read_dictionary(&mut cur).map(|trailer| (table, trailer))
    } else {
        Err(ParseError::Unexpected {
            offset: cur.pos,
            expected: "'trailer' keyword",
        }
        .into())
    }
}
/// Parses a cross-reference stream object located at `offset` in `buf`.
///
/// Steps:
/// 1. Read the indirect object and require a stream whose dictionary has
///    `/Type /XRef`.
/// 2. Read `/W` (three field widths, each 0–8 bytes), `/Size`, the optional
///    `/Index` pairs (default `[0 Size]`) and `/Length`.
/// 3. Slice `Length` bytes starting at the stream's `start_position` and run
///    them through `crate::filter::apply_chain` when `/Filter` names filters
///    (`/DecodeParms` is passed along to it).
/// 4. Decode fixed-width big-endian rows into entries. A zero-width type field
///    means type 1. Rows with an unknown type are skipped; decoding of a
///    subsection stops when the data runs out.
///
/// Field values that cannot be represented are handled without truncation: a
/// type-1 offset or type-2 container/index that does not fit in `u32` causes
/// that entry to be skipped, and a generation above `u16::MAX` is clamped.
///
/// Returns the table (with `size` set from `/Size`) and a copy of the stream
/// dictionary.
///
/// # Errors
///
/// Returns `Error::Type` if the object is not a stream, and
/// `ParseError::Unexpected`/`ParseError::Other` for a missing `/Type /XRef`,
/// malformed `/W`, out-of-range counts or `/Length`, or a stream that extends
/// past the end of `buf`. Dictionary lookup, type conversion and filter errors
/// are propagated.
fn parse_xref_stream(buf: &[u8], offset: usize) -> Result<(Xref, Dictionary)> {
    let (_id, body) = read_indirect_object_at(buf, offset)?;
    let stream = match body {
        Object::Stream(s) => s,
        other => {
            return Err(Error::Type {
                expected: "Stream (xref-stream)",
                found: other.type_name(),
            })
        }
    };
    let dict = stream.dict.clone();
    if !dict.has_type(b"XRef") {
        return Err(ParseError::Unexpected {
            offset,
            expected: "/Type /XRef",
        }
        .into());
    }

    // Field widths.
    let w_arr = dict.get(b"W")?.as_array()?;
    if w_arr.len() < 3 {
        return Err(ParseError::Other("xref-stream W has <3 fields".into()).into());
    }
    let w1 = read_xref_width(w_arr[0].as_i64()?)?;
    let w2 = read_xref_width(w_arr[1].as_i64()?)?;
    let w3 = read_xref_width(w_arr[2].as_i64()?)?;
    let row_len = w1
        .checked_add(w2)
        .and_then(|s| s.checked_add(w3))
        .ok_or_else(|| ParseError::Other("xref-stream W row overflow".into()))?;
    if row_len == 0 {
        return Err(ParseError::Other("xref-stream W sum is zero".into()).into());
    }

    // Subsections: (first object number, entry count).
    let size = read_xref_count(dict.get(b"Size")?.as_i64()?)?;
    let mut index_pairs: Vec<(u32, u32)> = Vec::new();
    if let Some(idx_obj) = dict.get_optional(b"Index") {
        let arr = idx_obj.as_array()?;
        let mut i = 0;
        // A trailing unpaired element is ignored.
        while i + 1 < arr.len() {
            let first = read_xref_count(arr[i].as_i64()?)?;
            let count = read_xref_count(arr[i + 1].as_i64()?)?;
            index_pairs.push((first, count));
            i += 2;
        }
    } else {
        index_pairs.push((0, size));
    }

    // Raw stream bytes.
    let length = usize::try_from(dict.get(b"Length")?.as_i64()?)
        .map_err(|_| ParseError::Other("xref-stream /Length out of range".into()))?;
    let start = stream
        .start_position
        .ok_or_else(|| ParseError::Other("xref-stream missing start_position".into()))?;
    let end = start
        .checked_add(length)
        .ok_or_else(|| ParseError::Other("xref-stream start+length overflow".into()))?;
    if end > buf.len() {
        return Err(ParseError::Other("xref-stream extends past EOF".into()).into());
    }
    let raw = &buf[start..end];

    // Decode through the declared filter chain, if any.
    let filters: Vec<String> = match dict.get_optional(b"Filter") {
        Some(Object::Name(n)) => vec![String::from_utf8_lossy(n).into_owned()],
        Some(Object::Array(arr)) => arr
            .iter()
            .filter_map(|o| match o {
                Object::Name(n) => Some(String::from_utf8_lossy(n).into_owned()),
                _ => None,
            })
            .collect(),
        _ => Vec::new(),
    };
    let decoded = if filters.is_empty() {
        raw.to_vec()
    } else {
        let parms = dict.get_optional(b"DecodeParms");
        crate::filter::apply_chain(raw, &filters, parms)?
    };

    let mut xref = Xref::new();
    let mut consumed = 0usize;
    for (first, count) in index_pairs {
        for i in 0..count {
            let row_end = match consumed.checked_add(row_len) {
                Some(e) if e <= decoded.len() => e,
                _ => break,
            };
            let row = &decoded[consumed..row_end];
            consumed = row_end;

            let kind = if w1 == 0 { 1 } else { read_be_uint(&row[..w1]) };
            let field2 = read_be_uint(&row[w1..w1 + w2]);
            let field3 = read_be_uint(&row[w1 + w2..]);
            let entry = match kind {
                0 => XrefEntry::Free,
                1 => {
                    // Fields may be up to 8 bytes wide; an offset that does not
                    // fit must not be wrapped into a different, wrong offset.
                    let Ok(offset) = u32::try_from(field2) else {
                        continue;
                    };
                    XrefEntry::Normal {
                        offset,
                        generation: field3.min(u64::from(u16::MAX)) as u16,
                    }
                }
                2 => {
                    let (Ok(container), Ok(index)) =
                        (u32::try_from(field2), u32::try_from(field3))
                    else {
                        continue;
                    };
                    XrefEntry::Compressed { container, index }
                }
                _ => continue,
            };
            let Some(obj_num) = first.checked_add(i) else {
                break;
            };
            xref.insert(obj_num, entry);
        }
    }
    xref.size = size;
    Ok((xref, dict))
}
/// Interprets `bytes` as a big-endian unsigned integer.
///
/// An empty slice yields 0. With more than eight bytes the leading bytes are
/// shifted out, so only the last eight contribute.
#[inline]
fn read_be_uint(bytes: &[u8]) -> u64 {
    bytes
        .iter()
        .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte))
}
/// Validates one `/W` field width of a cross-reference stream.
///
/// Accepts 0 through 8 (bytes) and returns it as `usize`; anything else is a
/// `ParseError::Other`.
fn read_xref_width(v: i64) -> Result<usize> {
    match v {
        0..=8 => Ok(v as usize),
        _ => Err(ParseError::Other("xref-stream W width out of range".into()).into()),
    }
}
/// Converts a cross-reference stream count or object number to `u32`.
///
/// Values that are negative or above `u32::MAX` produce a `ParseError::Other`.
fn read_xref_count(v: i64) -> Result<u32> {
    match u32::try_from(v) {
        Ok(count) => Ok(count),
        Err(_) => Err(ParseError::Other("xref-stream count out of range".into()).into()),
    }
}
#[allow(dead_code)]
fn apply_xref_predictor(
data: &[u8],
dict: &Dictionary,
row_len: usize,
) -> Result<Vec<u8>> {
let params = match dict.get_optional(b"DecodeParms") {
Some(Object::Dictionary(d)) => Some(d.clone()),
_ => None,
};
let predictor = params
.as_ref()
.and_then(|d| d.get_optional(b"Predictor").and_then(|o| o.as_i64().ok()))
.unwrap_or(1);
let columns = params
.as_ref()
.and_then(|d| d.get_optional(b"Columns").and_then(|o| o.as_i64().ok()))
.map(|n| n as usize)
.unwrap_or(row_len);
if predictor == 1 {
return Ok(data.to_vec());
}
if predictor < 10 {
return Ok(data.to_vec());
}
let stride = columns + 1;
let mut out: Vec<u8> = Vec::with_capacity(data.len());
let mut prev_row: Vec<u8> = vec![0; columns];
let mut chunks = data.chunks(stride);
while let Some(row) = chunks.next() {
if row.len() < stride {
break;
}
let filter = row[0];
let data_row = &row[1..];
let mut decoded_row: Vec<u8> = Vec::with_capacity(columns);
match filter {
0 => decoded_row.extend_from_slice(data_row), 1 => {
for (i, &b) in data_row.iter().enumerate() {
let left = if i == 0 { 0 } else { decoded_row[i - 1] };
decoded_row.push(b.wrapping_add(left));
}
}
2 => {
for (i, &b) in data_row.iter().enumerate() {
decoded_row.push(b.wrapping_add(prev_row[i]));
}
}
3 => {
for (i, &b) in data_row.iter().enumerate() {
let left = if i == 0 { 0u16 } else { decoded_row[i - 1] as u16 };
let up = prev_row[i] as u16;
let avg = ((left + up) / 2) as u8;
decoded_row.push(b.wrapping_add(avg));
}
}
4 => {
for (i, &b) in data_row.iter().enumerate() {
let left = if i == 0 { 0i16 } else { decoded_row[i - 1] as i16 };
let up = prev_row[i] as i16;
let up_left = if i == 0 { 0i16 } else { prev_row[i - 1] as i16 };
let p = left + up - up_left;
let pa = (p - left).abs();
let pb = (p - up).abs();
let pc = (p - up_left).abs();
let predictor = if pa <= pb && pa <= pc {
left
} else if pb <= pc {
up
} else {
up_left
};
decoded_row.push(b.wrapping_add(predictor as u8));
}
}
_ => decoded_row.extend_from_slice(data_row),
}
out.extend_from_slice(&decoded_row);
prev_row = decoded_row;
}
let _ = row_len;
Ok(out)
}
fn parse_classical_xref(c: &mut Cursor<'_>) -> Result<Xref> {
let mut xref = Xref::new();
loop {
c.skip_ws_and_comments();
if c.starts_with(b"trailer") {
break;
}
let first = read_integer(c)?;
c.skip_ws_and_comments();
let count = read_integer(c)?;
if first < 0 || count < 0 {
return Err(XrefError::Start.into());
}
if matches!(c.peek(), Some(b'\r') | Some(b'\n')) {
if c.peek() == Some(b'\r') {
c.bump();
}
if c.peek() == Some(b'\n') {
c.bump();
}
}
for i in 0..count {
if c.pos + 18 > c.buf.len() {
return Err(XrefError::Start.into());
}
let line = &c.buf[c.pos..c.pos + 18];
let off_str = std::str::from_utf8(&line[..10]).map_err(|_| XrefError::Start)?;
let gen_str = std::str::from_utf8(&line[11..16]).map_err(|_| XrefError::Start)?;
let kind = line[17];
let offset: u64 = off_str
.trim()
.parse()
.map_err(|_| XrefError::Start)?;
let generation: u16 = gen_str
.trim()
.parse::<u32>()
.map(|n| n.min(u16::MAX as u32) as u16)
.unwrap_or(0);
let entry = match kind {
b'n' => Some(XrefEntry::Normal {
offset: offset as u32,
generation,
}),
b'f' => Some(XrefEntry::Free),
_ => None,
};
if let Some(e) = entry {
let object_number = (first as u32).saturating_add(i as u32);
xref.insert(object_number, e);
}
c.advance(18);
let mut saw_eol = false;
for _ in 0..4 {
match c.peek() {
Some(b' ') | Some(b'\t') => {
c.bump();
}
Some(b'\r') => {
c.bump();
if c.peek() == Some(b'\n') {
c.bump();
}
saw_eol = true;
break;
}
Some(b'\n') => {
c.bump();
saw_eol = true;
break;
}
_ => break,
}
}
let _ = saw_eol;
}
let after_last = (first as u32).saturating_add(count as u32);
if after_last > xref.size {
xref.size = after_last;
}
}
Ok(xref)
}