1use std::collections::HashMap;
12
13use rpdfium_core::error::{ParseError, PdfError};
14use rpdfium_core::fx_system::DEFAULT_MAX_ENDSTREAM_SCAN_DISTANCE;
15use rpdfium_core::{Name, ParsingMode};
16
17use crate::object::{Object, ObjectId, StreamData};
18use crate::tokenizer::{Token, Tokenizer, is_whitespace};
19
20pub fn parse_indirect_object(
24 source: &[u8],
25 offset: u64,
26 mode: ParsingMode,
27) -> Result<(ObjectId, Object), PdfError> {
28 let mut tok = Tokenizer::new_at(source, offset as usize);
29
30 let number = match tok.next_token() {
32 Some(Ok(Token::Integer(n))) if n >= 0 => n as u32,
33 _ => {
34 return Err(PdfError::Parse(ParseError::InvalidObjectHeader { offset }));
35 }
36 };
37
38 let generation = match tok.next_token() {
40 Some(Ok(Token::Integer(g))) if g >= 0 && g <= u16::MAX as i64 => g as u16,
41 _ => {
42 return Err(PdfError::Parse(ParseError::InvalidObjectHeader { offset }));
43 }
44 };
45
46 match tok.next_token() {
48 Some(Ok(Token::Keyword(ref kw))) if kw == b"obj" => {}
49 _ => {
50 return Err(PdfError::Parse(ParseError::InvalidObjectHeader { offset }));
51 }
52 }
53
54 let id = ObjectId::new(number, generation);
55 let obj = parse_direct_object_from_tokenizer(&mut tok, source, mode)?;
56
57 let obj = try_parse_stream(obj, &mut tok, source, mode)?;
59
60 tok.skip_whitespace_and_comments();
62 match tok.next_token() {
63 Some(Ok(Token::Keyword(ref kw))) if kw == b"endobj" => {}
64 _ => {
65 if mode == ParsingMode::Strict {
66 return Err(PdfError::Parse(ParseError::MissingEndobj {
67 offset: tok.position() as u64,
68 }));
69 }
70 tracing::warn!(
71 object_id = %id,
72 offset = tok.position(),
73 "missing endobj keyword"
74 );
75 }
76 }
77
78 Ok((id, obj))
79}
80
81pub fn parse_object(source: &[u8], offset: u64, mode: ParsingMode) -> Result<Object, PdfError> {
83 let mut tok = Tokenizer::new_at(source, offset as usize);
84 parse_direct_object_from_tokenizer(&mut tok, source, mode)
85}
86
87fn parse_direct_object_from_tokenizer(
89 tok: &mut Tokenizer<'_>,
90 source: &[u8],
91 mode: ParsingMode,
92) -> Result<Object, PdfError> {
93 let token = tok
94 .next_token()
95 .ok_or(PdfError::Parse(ParseError::UnexpectedEof {
96 offset: tok.position() as u64,
97 }))??;
98
99 match token {
100 Token::Null => Ok(Object::Null),
101 Token::Boolean(b) => Ok(Object::Boolean(b)),
102 Token::Integer(n) => Ok(Object::Integer(n)),
103 Token::Real(f) => Ok(Object::Real(f)),
104 Token::String(s) => Ok(Object::String(s)),
105 Token::Name(n) => Ok(Object::Name(n)),
106 Token::Ref(id) => Ok(Object::Reference(id)),
107 Token::ArrayStart => parse_array(tok, source, mode),
108 Token::DictStart => parse_dictionary(tok, source, mode),
109 Token::Keyword(ref kw) => {
110 if mode == ParsingMode::Lenient {
113 tracing::warn!(
114 keyword = %String::from_utf8_lossy(kw),
115 offset = tok.position(),
116 "unexpected keyword where object expected, treating as null"
117 );
118 Ok(Object::Null)
119 } else {
120 Err(PdfError::Parse(ParseError::UnexpectedToken {
121 offset: tok.position() as u64,
122 expected: "object".into(),
123 found: format!("keyword '{}'", String::from_utf8_lossy(kw)),
124 }))
125 }
126 }
127 other => Err(PdfError::Parse(ParseError::UnexpectedToken {
128 offset: tok.position() as u64,
129 expected: "object value".into(),
130 found: format!("{:?}", other),
131 })),
132 }
133}
134
135fn parse_array(
137 tok: &mut Tokenizer<'_>,
138 source: &[u8],
139 mode: ParsingMode,
140) -> Result<Object, PdfError> {
141 let mut elements = Vec::new();
142
143 loop {
144 tok.skip_whitespace_and_comments();
145
146 let saved = tok.position();
148 match tok.next_token() {
149 Some(Ok(Token::ArrayEnd)) => {
150 return Ok(Object::Array(elements));
151 }
152 None => {
153 return Err(PdfError::Parse(ParseError::UnexpectedEof {
154 offset: tok.position() as u64,
155 }));
156 }
157 _ => {
158 tok.set_position(saved);
160 }
161 }
162
163 let element = parse_direct_object_from_tokenizer(tok, source, mode)?;
164 elements.push(element);
165 }
166}
167
168fn parse_dictionary(
170 tok: &mut Tokenizer<'_>,
171 source: &[u8],
172 mode: ParsingMode,
173) -> Result<Object, PdfError> {
174 let mut map = HashMap::new();
175
176 loop {
177 tok.skip_whitespace_and_comments();
178
179 let saved = tok.position();
181 match tok.next_token() {
182 Some(Ok(Token::DictEnd)) => {
183 return Ok(Object::Dictionary(map));
184 }
185 None => {
186 return Err(PdfError::Parse(ParseError::UnexpectedEof {
187 offset: tok.position() as u64,
188 }));
189 }
190 Some(Ok(Token::Name(key))) => {
191 let value = parse_direct_object_from_tokenizer(tok, source, mode)?;
193 if map.contains_key(&key) {
194 if mode == ParsingMode::Strict {
195 return Err(PdfError::Parse(ParseError::DuplicateKey {
196 key: String::from_utf8_lossy(key.as_bytes()).into_owned(),
197 }));
198 }
199 tracing::warn!(
200 key = %String::from_utf8_lossy(key.as_bytes()),
201 "duplicate dictionary key, using last value"
202 );
203 }
204 map.insert(key, value);
205 }
206 Some(Ok(other)) => {
207 if mode == ParsingMode::Lenient {
208 tracing::warn!(
209 token = ?other,
210 "expected name key in dictionary, skipping"
211 );
212 continue;
214 }
215 return Err(PdfError::Parse(ParseError::UnexpectedToken {
216 offset: saved as u64,
217 expected: "name key".into(),
218 found: format!("{:?}", other),
219 }));
220 }
221 Some(Err(e)) => return Err(e),
222 }
223 }
224}
225
/// If `obj` is a dictionary immediately followed by the `stream` keyword,
/// upgrades it to a stream object; otherwise returns `obj` unchanged.
fn try_parse_stream(
    obj: Object,
    tok: &mut Tokenizer<'_>,
    source: &[u8],
    mode: ParsingMode,
) -> Result<Object, PdfError> {
    // Only dictionaries can head a stream; everything else passes through.
    let dict = match obj {
        Object::Dictionary(d) => d,
        other => return Ok(other),
    };

    // Remember where we were so we can rewind if no `stream` keyword follows.
    let saved = tok.position();
    tok.skip_whitespace_and_comments();

    match tok.next_token() {
        Some(Ok(Token::Keyword(ref kw))) if kw == b"stream" => {
            // Consume the EOL after the `stream` keyword so it is not
            // counted as data. This accepts "\r\n", "\n", and a lone "\r"
            // (the last is nonstandard but tolerated here).
            let mut pos = tok.position();
            if pos < source.len() && source[pos] == b'\r' {
                pos += 1;
            }
            if pos < source.len() && source[pos] == b'\n' {
                pos += 1;
            }

            let stream_start = pos as u64;
            let length = resolve_stream_length(source, &dict, stream_start, mode)?;

            // Data is kept as a raw (offset, length) window into `source`;
            // decoding is deferred to whoever consumes the StreamData.
            let data = StreamData::Raw {
                offset: stream_start,
                length,
            };

            // Jump past the data and expect the closing `endstream`.
            let end_pos = (stream_start + length) as usize;
            tok.set_position(end_pos);
            tok.skip_whitespace_and_comments();

            match tok.next_token() {
                Some(Ok(Token::Keyword(ref kw))) if kw == b"endstream" => {}
                _ => {
                    // Strict mode rejects a missing marker; lenient mode
                    // trusts the resolved length and only logs.
                    if mode == ParsingMode::Strict {
                        return Err(PdfError::Parse(ParseError::MissingEndstream {
                            offset: tok.position() as u64,
                        }));
                    }
                    tracing::warn!(offset = tok.position(), "missing endstream keyword");
                }
            }

            Ok(Object::Stream { dict, data })
        }
        _ => {
            // Not a stream: rewind and hand back the plain dictionary.
            tok.set_position(saved);
            Ok(Object::Dictionary(dict))
        }
    }
}
289
290fn resolve_stream_length(
296 source: &[u8],
297 dict: &HashMap<Name, Object>,
298 stream_start: u64,
299 mode: ParsingMode,
300) -> Result<u64, PdfError> {
301 if let Some(Object::Integer(len)) = dict.get(&Name::length()) {
303 if *len >= 0 {
304 let len = *len as u64;
305 if verify_endstream(source, stream_start + len) {
307 return Ok(len);
308 }
309 tracing::warn!(
310 expected_length = len,
311 stream_start = stream_start,
312 "endstream not found at expected offset"
313 );
314 }
315 }
316
317 scan_for_endstream(
319 source,
320 stream_start,
321 DEFAULT_MAX_ENDSTREAM_SCAN_DISTANCE,
322 mode,
323 )
324}
325
326fn verify_endstream(source: &[u8], expected_end: u64) -> bool {
328 let pos = expected_end as usize;
329 let marker = b"endstream";
330
331 let mut check_pos = pos;
333 while check_pos < source.len() && is_whitespace(source[check_pos]) {
334 check_pos += 1;
335 }
336
337 if check_pos + marker.len() <= source.len() {
338 &source[check_pos..check_pos + marker.len()] == marker
339 } else {
340 false
341 }
342}
343
344fn scan_for_endstream(
346 source: &[u8],
347 stream_start: u64,
348 max_distance: u64,
349 mode: ParsingMode,
350) -> Result<u64, PdfError> {
351 let start = stream_start as usize;
352 let limit = source
353 .len()
354 .min(start.saturating_add(max_distance as usize));
355 let marker = b"endstream";
356
357 let search_slice = &source[start..limit];
358
359 for i in 0..search_slice.len().saturating_sub(marker.len()) {
361 if &search_slice[i..i + marker.len()] == marker {
362 let mut end = i;
365 while end > 0 && is_whitespace(search_slice[end - 1]) {
366 end -= 1;
367 }
368 return Ok(end as u64);
369 }
370 }
371
372 if mode == ParsingMode::Lenient {
373 tracing::warn!(
374 stream_start = stream_start,
375 "endstream not found within scan distance, using remainder"
376 );
377 Ok((limit - start) as u64)
378 } else {
379 Err(PdfError::EndstreamScanFailed)
380 }
381}
382
#[cfg(test)]
mod tests {
    //! Unit tests for the object parser, covering direct object values,
    //! indirect-object framing, stream length resolution, and strict vs.
    //! lenient error handling.
    use super::*;

    // --- direct object values -------------------------------------------

    #[test]
    fn test_parse_null_object() {
        let obj = parse_object(b"null", 0, ParsingMode::Strict).unwrap();
        assert!(obj.is_null());
    }

    #[test]
    fn test_parse_boolean_object() {
        let obj = parse_object(b"true", 0, ParsingMode::Strict).unwrap();
        assert_eq!(obj.as_bool(), Some(true));
    }

    #[test]
    fn test_parse_integer_object() {
        let obj = parse_object(b"42", 0, ParsingMode::Strict).unwrap();
        assert_eq!(obj.as_i64(), Some(42));
    }

    #[test]
    #[allow(clippy::approx_constant)]
    fn test_parse_real_object() {
        let obj = parse_object(b"3.14", 0, ParsingMode::Strict).unwrap();
        assert_eq!(obj.as_f64(), Some(3.14));
    }

    #[test]
    fn test_parse_string_object() {
        let obj = parse_object(b"(Hello)", 0, ParsingMode::Strict).unwrap();
        assert_eq!(obj.as_string().unwrap().as_bytes(), b"Hello");
    }

    #[test]
    fn test_parse_name_object() {
        let obj = parse_object(b"/Type", 0, ParsingMode::Strict).unwrap();
        assert!(obj.as_name().is_some());
    }

    // --- composite values (arrays, dictionaries) -------------------------

    #[test]
    fn test_parse_array_object() {
        let obj = parse_object(b"[1 2 3]", 0, ParsingMode::Strict).unwrap();
        let arr = obj.as_array().unwrap();
        assert_eq!(arr.len(), 3);
        assert_eq!(arr[0].as_i64(), Some(1));
        assert_eq!(arr[2].as_i64(), Some(3));
    }

    #[test]
    fn test_parse_nested_array() {
        let obj = parse_object(b"[[1 2] [3 4]]", 0, ParsingMode::Strict).unwrap();
        let arr = obj.as_array().unwrap();
        assert_eq!(arr.len(), 2);
        assert_eq!(arr[0].as_array().unwrap().len(), 2);
    }

    #[test]
    fn test_parse_dictionary_object() {
        let obj =
            parse_object(b"<< /Type /Catalog /Pages 3 0 R >>", 0, ParsingMode::Strict).unwrap();
        let dict = obj.as_dict().unwrap();
        assert!(dict.contains_key(&Name::r#type()));
    }

    #[test]
    fn test_parse_reference_in_dict() {
        let obj = parse_object(b"<< /Pages 3 0 R >>", 0, ParsingMode::Strict).unwrap();
        let dict = obj.as_dict().unwrap();
        let pages = dict.get(&Name::pages()).unwrap();
        assert_eq!(pages.as_reference(), Some(ObjectId::new(3, 0)));
    }

    // --- indirect objects and streams ------------------------------------

    #[test]
    fn test_parse_indirect_object_simple() {
        let source = b"1 0 obj\n42\nendobj";
        let (id, obj) = parse_indirect_object(source, 0, ParsingMode::Strict).unwrap();
        assert_eq!(id, ObjectId::new(1, 0));
        assert_eq!(obj.as_i64(), Some(42));
    }

    #[test]
    fn test_parse_indirect_object_dict() {
        let source = b"5 0 obj\n<< /Type /Page >>\nendobj";
        let (id, obj) = parse_indirect_object(source, 0, ParsingMode::Strict).unwrap();
        assert_eq!(id, ObjectId::new(5, 0));
        assert!(obj.as_dict().is_some());
    }

    #[test]
    fn test_parse_stream_object() {
        // /Length 5 is correct here: "Hello" is exactly 5 bytes.
        let source = b"1 0 obj\n<< /Length 5 >>\nstream\nHelloendstream\nendobj";
        let (id, obj) = parse_indirect_object(source, 0, ParsingMode::Strict).unwrap();
        assert_eq!(id, ObjectId::new(1, 0));
        match &obj {
            Object::Stream { dict, data } => {
                assert!(dict.contains_key(&Name::length()));
                match data {
                    StreamData::Raw { length, .. } => assert_eq!(*length, 5),
                    StreamData::Decoded { .. } => panic!("expected raw stream"),
                }
            }
            _ => panic!("expected stream object"),
        }
    }

    #[test]
    fn test_parse_stream_with_scan_fallback() {
        // No /Length entry: the parser must scan for `endstream` instead.
        let source = b"1 0 obj\n<< >>\nstream\nSome data here\nendstream\nendobj";
        let (_, obj) = parse_indirect_object(source, 0, ParsingMode::Lenient).unwrap();
        match &obj {
            Object::Stream { data, .. } => match data {
                StreamData::Raw { length, .. } => {
                    assert!(*length > 0);
                }
                StreamData::Decoded { .. } => panic!("expected raw stream"),
            },
            _ => panic!("expected stream object"),
        }
    }

    #[test]
    fn test_verify_endstream_function() {
        let source = b"Hello\nendstream";
        assert!(verify_endstream(source, 5));
    }

    #[test]
    fn test_verify_endstream_with_whitespace() {
        // Mixed whitespace between data end and the keyword is allowed.
        let source = b"Hello \n endstream";
        assert!(verify_endstream(source, 5));
    }

    // --- strict vs. lenient error handling --------------------------------

    #[test]
    fn test_missing_endobj_lenient() {
        let source = b"1 0 obj\n42\n";
        let result = parse_indirect_object(source, 0, ParsingMode::Lenient);
        assert!(result.is_ok());
    }

    #[test]
    fn test_missing_endobj_strict() {
        let source = b"1 0 obj\n42\n";
        let result = parse_indirect_object(source, 0, ParsingMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_empty_array() {
        let obj = parse_object(b"[]", 0, ParsingMode::Strict).unwrap();
        assert_eq!(obj.as_array().unwrap().len(), 0);
    }

    #[test]
    fn test_parse_empty_dict() {
        let obj = parse_object(b"<< >>", 0, ParsingMode::Strict).unwrap();
        assert_eq!(obj.as_dict().unwrap().len(), 0);
    }

    #[test]
    fn test_duplicate_key_strict() {
        let result = parse_object(b"<< /Type /A /Type /B >>", 0, ParsingMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    fn test_duplicate_key_lenient() {
        // Lenient mode keeps the last value for a duplicated key.
        let obj = parse_object(b"<< /Type /A /Type /B >>", 0, ParsingMode::Lenient).unwrap();
        let dict = obj.as_dict().unwrap();
        assert!(dict.contains_key(&Name::r#type()));
    }

    // --- malformed / edge-case input --------------------------------------

    #[test]
    fn test_parse_empty_input() {
        let result = parse_object(b"", 0, ParsingMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_whitespace_only() {
        let result = parse_object(b" \n\t ", 0, ParsingMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_unterminated_array() {
        let result = parse_object(b"[1 2 3", 0, ParsingMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_unterminated_dict() {
        let result = parse_object(b"<< /Type /Catalog", 0, ParsingMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    #[allow(clippy::approx_constant)]
    fn test_parse_array_mixed_types() {
        let obj = parse_object(
            b"[1 3.14 true false null (hello) /Name 5 0 R]",
            0,
            ParsingMode::Strict,
        )
        .unwrap();
        let arr = obj.as_array().unwrap();
        assert_eq!(arr.len(), 8);
        assert_eq!(arr[0].as_i64(), Some(1));
        assert_eq!(arr[1].as_f64(), Some(3.14));
        assert_eq!(arr[2].as_bool(), Some(true));
        assert_eq!(arr[3].as_bool(), Some(false));
        assert!(arr[4].is_null());
        assert!(arr[5].as_string().is_some());
        assert!(arr[6].as_name().is_some());
        assert_eq!(arr[7].as_reference(), Some(ObjectId::new(5, 0)));
    }

    #[test]
    fn test_parse_deeply_nested_array() {
        // Six levels of nesting; unwrap five to reach the innermost array.
        let source = b"[[[[[[42]]]]]]";
        let obj = parse_object(source, 0, ParsingMode::Strict).unwrap();
        let mut current = &obj;
        for _ in 0..5 {
            current = &current.as_array().unwrap()[0];
        }
        let inner = current.as_array().unwrap();
        assert_eq!(inner[0].as_i64(), Some(42));
    }

    #[test]
    fn test_parse_dict_non_name_key_strict() {
        let result = parse_object(b"<< 42 /Value >>", 0, ParsingMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_dict_non_name_key_lenient() {
        // The bogus `42` key token is skipped; /Key -> /Value still parses.
        let result = parse_object(b"<< 42 /Key /Value >>", 0, ParsingMode::Lenient);
        assert!(result.is_ok());
        let dict = result.unwrap().as_dict().unwrap().clone();
        let key = Name::from_bytes(b"Key".to_vec());
        assert!(dict.contains_key(&key));
    }

    #[test]
    fn test_parse_at_offset() {
        let source = b"garbage42";
        let obj = parse_object(source, 7, ParsingMode::Strict).unwrap();
        assert_eq!(obj.as_i64(), Some(42));
    }

    #[test]
    fn test_parse_stream_with_exact_length() {
        let content = b"Hello World!"; let source = format!(
            "1 0 obj\n<< /Length {} >>\nstream\n{}endstream\nendobj",
            content.len(),
            String::from_utf8_lossy(content)
        );
        let (id, obj) = parse_indirect_object(source.as_bytes(), 0, ParsingMode::Strict).unwrap();
        assert_eq!(id, ObjectId::new(1, 0));
        match &obj {
            Object::Stream { data, .. } => match data {
                StreamData::Raw { length, .. } => assert_eq!(*length, 12),
                StreamData::Decoded { .. } => panic!("expected raw stream"),
            },
            _ => panic!("expected stream object"),
        }
    }

    #[test]
    fn test_parse_indirect_object_with_generation() {
        let source = b"5 3 obj\nnull\nendobj";
        let (id, obj) = parse_indirect_object(source, 0, ParsingMode::Strict).unwrap();
        assert_eq!(id, ObjectId::new(5, 3));
        assert!(obj.is_null());
    }

    #[test]
    fn test_parse_indirect_object_negative_number() {
        let source = b"-1 0 obj\n42\nendobj";
        let result = parse_indirect_object(source, 0, ParsingMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_bare_keyword_strict() {
        let result = parse_object(b"endobj", 0, ParsingMode::Strict);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_bare_keyword_lenient() {
        let obj = parse_object(b"endobj", 0, ParsingMode::Lenient).unwrap();
        assert!(obj.is_null());
    }
}