1use std::sync::Arc;
2
3use zpdf_core::{Error, ObjectId, ParseLimits, PdfDict, PdfObject, PdfStream, Result};
4
5use crate::lexer::Lexer;
6
/// Parser for indirect PDF objects located at known byte offsets in a buffer.
///
/// The parser borrows both the input bytes and the parse limits; it owns no
/// state of its own, so every parse call starts from the offset it is given.
pub struct ObjectParser<'a> {
    /// Raw input bytes. Offsets passed to the parse methods index into this slice,
    /// and stream payloads are copied out of it.
    data: &'a [u8],
    /// Limits handed to the lexer; `max_stream_bytes` is also used here to bound
    /// stream payload sizes.
    limits: &'a ParseLimits,
}
11
12impl<'a> ObjectParser<'a> {
13 pub fn new(data: &'a [u8], limits: &'a ParseLimits) -> Self {
14 Self { data, limits }
15 }
16
17 pub fn parse_indirect_at(&self, offset: usize) -> Result<PdfObject> {
20 self.parse_indirect_with_id(offset).map(|(_, obj)| obj)
21 }
22
23 pub fn parse_indirect_with_id(&self, offset: usize) -> Result<(ObjectId, PdfObject)> {
29 let mut lex = Lexer::new(self.data, offset, self.limits);
30
31 let num_tok = lex.next_token()?;
32 let gen_tok = lex.next_token()?;
33 let id = match (&num_tok, &gen_tok) {
34 (PdfObject::Integer(n), PdfObject::Integer(g)) => {
35 match (u32::try_from(*n), u16::try_from(*g)) {
36 (Ok(n), Ok(g)) => ObjectId(n, g),
37 _ => {
38 return Err(Error::InvalidObject(
39 offset as u64,
40 format!("object header out of range: {n} {g} obj"),
41 ))
42 }
43 }
44 }
45 _ => {
46 return Err(Error::InvalidObject(
47 offset as u64,
48 "object header is not '<int> <int> obj'".into(),
49 ))
50 }
51 };
52
53 lex.skip_whitespace_and_comments();
54 self.expect_keyword(&mut lex, b"obj")?;
55
56 let obj = lex.next_token()?;
57 let obj = lex.maybe_resolve_ref(obj)?;
61
62 lex.skip_whitespace_and_comments();
64 if let PdfObject::Dict(dict) = &obj {
65 if self.starts_with_at(lex.pos(), b"stream") {
66 let stream = self.read_stream(dict.clone(), lex.pos())?;
67 return Ok((id, PdfObject::Stream(stream)));
68 }
69 }
70
71 Ok((id, obj))
72 }
73
74 fn expect_keyword(&self, lex: &mut Lexer, keyword: &[u8]) -> Result<()> {
75 let pos = lex.pos();
76 if self.data[pos..].starts_with(keyword) {
77 lex.set_pos(pos + keyword.len());
78 Ok(())
79 } else {
80 Err(Error::InvalidObject(
81 pos as u64,
82 format!(
83 "expected '{}', got '{}'",
84 String::from_utf8_lossy(keyword),
85 String::from_utf8_lossy(
86 &self.data[pos..self.data.len().min(pos + keyword.len())]
87 )
88 ),
89 ))
90 }
91 }
92
93 fn starts_with_at(&self, pos: usize, prefix: &[u8]) -> bool {
94 self.data.get(pos..).is_some_and(|s| s.starts_with(prefix))
95 }
96
97 fn read_stream(&self, dict: PdfDict, keyword_pos: usize) -> Result<PdfStream> {
98 let mut pos = keyword_pos + b"stream".len();
99
100 if self.data.get(pos) == Some(&b'\r') {
102 pos += 1;
103 }
104 if self.data.get(pos) == Some(&b'\n') {
105 pos += 1;
106 }
107
108 let declared = match dict.get("Length") {
115 Some(PdfObject::Integer(n)) if *n >= 0 => Some(*n as usize),
116 _ => None,
117 };
118
119 let end = match declared {
120 Some(len)
121 if pos
122 .checked_add(len)
123 .is_some_and(|e| self.endstream_follows(e)) =>
124 {
125 pos + len
126 }
127 _ => self.scan_for_endstream(pos)?,
128 };
129
130 let length = (end - pos) as u64;
131 if length > self.limits.max_stream_bytes {
132 return Err(Error::StreamSizeLimit(self.limits.max_stream_bytes));
133 }
134
135 let stream_data = self.data[pos..end].to_vec();
136 Ok(PdfStream {
137 dict,
138 data: Arc::from(stream_data),
139 })
140 }
141
142 fn endstream_follows(&self, at: usize) -> bool {
145 let mut p = at;
146 while let Some(&b) = self.data.get(p) {
147 if matches!(b, b' ' | b'\t' | b'\r' | b'\n' | b'\x00' | b'\x0c') {
148 p += 1;
149 } else {
150 break;
151 }
152 }
153 self.data
154 .get(p..)
155 .is_some_and(|s| s.starts_with(b"endstream"))
156 }
157
158 fn scan_for_endstream(&self, pos: usize) -> Result<usize> {
163 let cap = (self.limits.max_stream_bytes as usize).saturating_add(b"endstream".len() + 2);
164 let search_end = pos.saturating_add(cap).min(self.data.len());
165 let hay = self
166 .data
167 .get(pos..search_end)
168 .ok_or(Error::UnexpectedEof(pos as u64))?;
169 let rel = hay
170 .windows(b"endstream".len())
171 .position(|w| w == b"endstream")
172 .ok_or_else(|| {
173 Error::InvalidObject(pos as u64, "stream: no endstream within size limit".into())
174 })?;
175 let mut end = pos + rel;
176 if end > pos && self.data[end - 1] == b'\n' {
178 end -= 1;
179 if end > pos && self.data[end - 1] == b'\r' {
180 end -= 1;
181 }
182 } else if end > pos && self.data[end - 1] == b'\r' {
183 end -= 1;
184 }
185 Ok(end)
186 }
187}
188
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses the indirect object at offset 0 of `data` under `limits`.
    fn parse_at_zero(data: &[u8], limits: &ParseLimits) -> Result<PdfObject> {
        ObjectParser::new(data, limits).parse_indirect_at(0)
    }

    /// Joins an object header, a payload and a trailer into one input buffer.
    fn assemble(head: &[u8], body: &[u8], tail: &[u8]) -> Vec<u8> {
        [head, body, tail].concat()
    }

    /// Parses `data` with default limits and returns the stream payload,
    /// panicking if the object is not a stream.
    fn stream_data(data: &[u8]) -> Vec<u8> {
        let limits = ParseLimits::default();
        match parse_at_zero(data, &limits).unwrap() {
            PdfObject::Stream(s) => s.data.to_vec(),
            other => panic!("expected Stream, got {other:?}"),
        }
    }

    #[test]
    fn parse_simple_indirect() {
        let limits = ParseLimits::default();
        let parsed = parse_at_zero(
            b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
            &limits,
        )
        .unwrap();
        match parsed {
            PdfObject::Dict(d) => {
                assert_eq!(d.get_name("Type").unwrap(), "Catalog");
            }
            other => panic!("expected Dict, got {other:?}"),
        }
    }

    #[test]
    fn parse_stream_object() {
        let content = b"BT /F1 12 Tf (Hello) Tj ET";
        let head = format!("5 0 obj\n<< /Length {} >>\nstream\n", content.len());
        let data = assemble(head.as_bytes(), content, b"\nendstream\nendobj\n");

        let limits = ParseLimits::default();
        match parse_at_zero(&data, &limits).unwrap() {
            PdfObject::Stream(s) => {
                assert_eq!(s.data.as_ref(), content);
                assert_eq!(s.dict.get_i64("Length").unwrap(), content.len() as i64);
            }
            other => panic!("expected Stream, got {other:?}"),
        }
    }

    #[test]
    fn reject_oversized_stream_length() {
        // The payload is 20 bytes, above the 16-byte limit configured here.
        let limits = ParseLimits {
            max_stream_bytes: 16,
            ..Default::default()
        };
        let body = b"0123456789ABCDEFGHIJ";
        let head = format!("5 0 obj\n<< /Length {} >>\nstream\n", body.len());
        let data = assemble(head.as_bytes(), body, b"\nendstream\nendobj\n");

        let err = parse_at_zero(&data, &limits).unwrap_err();
        assert!(matches!(err, Error::StreamSizeLimit(16)), "got {err:?}");
    }

    #[test]
    fn indirect_length_recovers_via_endstream_scan() {
        // `/Length` is an indirect reference, so it cannot be used directly.
        let data = assemble(
            b"5 0 obj\n<< /Length 99 0 R >>\nstream\n",
            b"Hello, world!",
            b"\nendstream\nendobj\n",
        );
        assert_eq!(stream_data(&data), b"Hello, world!");
    }

    #[test]
    fn missing_length_recovers_via_endstream_scan() {
        let data = assemble(
            b"5 0 obj\n<< /Type /Whatever >>\nstream\n",
            b"payload bytes",
            b"\nendstream\nendobj\n",
        );
        assert_eq!(stream_data(&data), b"payload bytes");
    }

    #[test]
    fn wrong_length_recovers_via_endstream_scan() {
        // Declared length (3) is shorter than the real payload (5 bytes).
        let data = assemble(
            b"5 0 obj\n<< /Length 3 >>\nstream\n",
            b"Hello",
            b"\nendstream\nendobj\n",
        );
        assert_eq!(stream_data(&data), b"Hello");
    }

    #[test]
    fn negative_length_recovers_via_endstream_scan() {
        let data = assemble(
            b"5 0 obj\n<< /Length -1 >>\nstream\n",
            b"abc",
            b"\nendstream\nendobj\n",
        );
        assert_eq!(stream_data(&data), b"abc");
    }

    #[test]
    fn correct_length_trusted_even_if_data_contains_endstream_bytes() {
        // The payload itself contains the bytes `endstream`; a correct `/Length`
        // must win over a keyword scan.
        let body: &[u8] = b"AAendstreamBB";
        let head = format!("5 0 obj\n<< /Length {} >>\nstream\n", body.len());
        let data = assemble(head.as_bytes(), body, b"\nendstream\nendobj\n");
        assert_eq!(stream_data(&data), body);
    }

    #[test]
    fn crlf_before_endstream_is_stripped_on_scan() {
        let data = assemble(
            b"5 0 obj\n<< >>\nstream\n",
            b"data",
            b"\r\nendstream\nendobj\n",
        );
        assert_eq!(stream_data(&data), b"data");
    }

    #[test]
    fn parse_indirect_with_id_returns_header_id() {
        let limits = ParseLimits::default();
        let parser = ObjectParser::new(b"7 2 obj\n<< /Type /Catalog >>\nendobj\n", &limits);
        let (id, obj) = parser.parse_indirect_with_id(0).unwrap();
        assert_eq!(id, ObjectId(7, 2));
        assert!(obj.as_dict().is_ok());
    }

    #[test]
    fn parse_indirect_with_id_rejects_non_integer_header() {
        let limits = ParseLimits::default();
        let parser = ObjectParser::new(b"/Name 0 obj\n42\nendobj\n", &limits);
        assert!(parser.parse_indirect_with_id(0).is_err());
    }

    #[test]
    fn top_level_ref_body_parses_as_ref() {
        let limits = ParseLimits::default();
        let obj = parse_at_zero(b"4 0 obj\n5 0 R\nendobj\n", &limits).unwrap();
        assert_eq!(obj, PdfObject::Ref(ObjectId(5, 0)));
    }

    #[test]
    fn deeply_nested_value_in_indirect_object_errors() {
        // 20 nested arrays against a depth limit of 4.
        let limits = ParseLimits {
            max_object_depth: 4,
            ..Default::default()
        };
        let n = 20usize;
        let inner = format!("{}1{}", "[".repeat(n), "]".repeat(n));
        let data = format!("1 0 obj\n{inner}\nendobj\n").into_bytes();

        let err = parse_at_zero(&data, &limits).unwrap_err();
        assert!(matches!(err, Error::RecursionLimit(4)), "got {err:?}");
    }
}