1use crate::error::{PdfError, Result};
7use crate::parser::{PdfDictionary, PdfObject};
8use std::io::Read;
9
10#[derive(Debug)]
12pub enum ParseEvent {
13 Header { version: String },
15 ObjectStart { id: u32, generation: u16 },
17 ObjectEnd {
19 id: u32,
20 generation: u16,
21 object: PdfObject,
22 },
23 StreamData { object_id: u32, data: Vec<u8> },
25 XRef { entries: Vec<XRefEntry> },
27 Trailer { dict: PdfDictionary },
29 EndOfFile,
31}
32
/// One row of a classic cross-reference table.
///
/// Derives `PartialEq`/`Eq` so entries can be compared directly (for example with
/// `assert_eq!`) instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct XRefEntry {
    /// Number of the object this row describes.
    pub object_number: u32,
    /// Generation number taken from the second column of the row.
    pub generation: u16,
    /// Value of the first column of the row (the byte offset for in-use objects).
    pub offset: u64,
    /// `true` when the row's type column is `n`, `false` for anything else (such as `f`).
    pub in_use: bool,
}
41
/// Position of the incremental parser inside the line-level structure of a PDF file.
#[derive(Debug)]
enum ParserState {
    /// Between objects: waiting for a header line or an `obj` line.
    Initial,
    /// Inside the body of the indirect object `id` / `generation`.
    InObject { id: u32, generation: u16 },
    /// Between `stream` and `endstream` of object `object_id`.
    InStream { object_id: u32 },
    /// After an `xref` line, before `trailer`.
    InXRef,
    /// After a `trailer` line, waiting for `%%EOF`.
    InTrailer,
    /// `%%EOF` has been seen.
    Complete,
}
52
53pub struct IncrementalParser {
55 state: ParserState,
56 buffer: String,
57 #[allow(dead_code)]
58 line_buffer: String,
59 events: Vec<ParseEvent>,
60}
61
62impl Default for IncrementalParser {
63 fn default() -> Self {
64 Self::new()
65 }
66}
67
68impl IncrementalParser {
69 pub fn new() -> Self {
71 Self {
72 state: ParserState::Initial,
73 buffer: String::new(),
74 line_buffer: String::new(),
75 events: Vec::new(),
76 }
77 }
78
79 pub fn feed(&mut self, data: &[u8]) -> Result<()> {
81 let text = String::from_utf8_lossy(data);
82 self.buffer.push_str(&text);
83
84 while let Some(newline_pos) = self.buffer.find('\n') {
86 let line = self.buffer[..newline_pos].trim().to_string();
87 self.buffer.drain(..=newline_pos);
88
89 self.process_line(&line)?;
90 }
91
92 Ok(())
93 }
94
95 pub fn take_events(&mut self) -> Vec<ParseEvent> {
97 std::mem::take(&mut self.events)
98 }
99
100 pub fn is_complete(&self) -> bool {
102 matches!(self.state, ParserState::Complete)
103 }
104
105 fn process_line(&mut self, line: &str) -> Result<()> {
106 match &self.state {
107 ParserState::Initial => {
108 if let Some(version_part) = line.strip_prefix("%PDF-") {
109 let version = version_part.trim().to_string();
110 self.events.push(ParseEvent::Header { version });
111 } else if let Some((id, gen)) = self.parse_object_header(line) {
112 self.state = ParserState::InObject {
113 id,
114 generation: gen,
115 };
116 self.events.push(ParseEvent::ObjectStart {
117 id,
118 generation: gen,
119 });
120 }
121 }
122 ParserState::InObject { id, generation } => {
123 if line == "endobj" {
124 let object = PdfObject::Null;
126 self.events.push(ParseEvent::ObjectEnd {
127 id: *id,
128 generation: *generation,
129 object,
130 });
131 self.state = ParserState::Initial;
132 } else if line == "stream" {
133 self.state = ParserState::InStream { object_id: *id };
134 }
135 }
136 ParserState::InStream { object_id } => {
137 if line == "endstream" {
138 self.state = ParserState::InObject {
139 id: *object_id,
140 generation: 0,
141 };
142 } else {
143 self.events.push(ParseEvent::StreamData {
144 object_id: *object_id,
145 data: line.as_bytes().to_vec(),
146 });
147 }
148 }
149 ParserState::InXRef => {
150 if line == "trailer" {
151 self.state = ParserState::InTrailer;
152 } else if let Some(_entry) = self.parse_xref_entry(line) {
153 }
155 }
156 ParserState::InTrailer => {
157 if line.starts_with("%%EOF") {
158 self.events.push(ParseEvent::EndOfFile);
159 self.state = ParserState::Complete;
160 }
161 }
162 ParserState::Complete => {
163 }
165 }
166
167 if line == "xref" {
169 self.state = ParserState::InXRef;
170 }
171
172 Ok(())
173 }
174
175 fn parse_object_header(&self, line: &str) -> Option<(u32, u16)> {
176 let parts: Vec<&str> = line.split_whitespace().collect();
177 if parts.len() >= 3 && parts[2] == "obj" {
178 let id = parts[0].parse().ok()?;
179 let gen = parts[1].parse().ok()?;
180 Some((id, gen))
181 } else {
182 None
183 }
184 }
185
186 fn parse_xref_entry(&self, line: &str) -> Option<XRefEntry> {
187 let parts: Vec<&str> = line.split_whitespace().collect();
188 if parts.len() == 3 {
189 let offset = parts[0].parse().ok()?;
190 let generation = parts[1].parse().ok()?;
191 let in_use = parts[2] == "n";
192
193 Some(XRefEntry {
194 object_number: 0, generation,
196 offset,
197 in_use,
198 })
199 } else {
200 None
201 }
202 }
203}
204
205pub fn process_incrementally<R: Read, F>(mut reader: R, mut callback: F) -> Result<()>
207where
208 F: FnMut(ParseEvent) -> Result<()>,
209{
210 let mut parser = IncrementalParser::new();
211 let mut buffer = vec![0u8; 4096];
212
213 loop {
214 match reader.read(&mut buffer) {
215 Ok(0) => break, Ok(n) => {
217 parser.feed(&buffer[..n])?;
218
219 for event in parser.take_events() {
220 callback(event)?;
221 }
222 }
223 Err(e) => return Err(PdfError::Io(e)),
224 }
225 }
226
227 Ok(())
228}
229
// Unit tests for the incremental parser: event emission, state transitions, the private
// line-parsing helpers and the `process_incrementally` driver.
#[cfg(test)]
mod tests {
    use super::*;

    // A new parser has not reached the end-of-file marker.
    #[test]
    fn test_incremental_parser_creation() {
        let parser = IncrementalParser::new();
        assert!(!parser.is_complete());
    }

    // A `%PDF-` line yields exactly one Header event carrying the version text.
    #[test]
    fn test_parse_header() {
        let mut parser = IncrementalParser::new();
        parser.feed(b"%PDF-1.7\n").unwrap();

        let events = parser.take_events();
        assert_eq!(events.len(), 1);

        match &events[0] {
            ParseEvent::Header { version } => assert_eq!(version, "1.7"),
            _ => panic!("Expected Header event"),
        }
    }

    // An object definition starts with an ObjectStart event holding id and generation.
    #[test]
    fn test_parse_object() {
        let mut parser = IncrementalParser::new();
        let data = b"1 0 obj\n<< /Type /Catalog >>\nendobj\n";

        parser.feed(data).unwrap();

        let events = parser.take_events();
        assert!(events.len() >= 2);

        match &events[0] {
            ParseEvent::ObjectStart { id, generation } => {
                assert_eq!(*id, 1);
                assert_eq!(*generation, 0);
            }
            _ => panic!("Expected ObjectStart event"),
        }
    }

    // Lines between `stream` and `endstream` are reported as StreamData events.
    #[test]
    fn test_parse_stream() {
        let mut parser = IncrementalParser::new();
        // Start inside an object so that `stream` is recognised.
        parser.state = ParserState::InObject {
            id: 1,
            generation: 0,
        };

        let data = b"stream\nHello World\nendstream\n";
        parser.feed(data).unwrap();

        let events = parser.take_events();
        assert!(events
            .iter()
            .any(|e| matches!(e, ParseEvent::StreamData { .. })));
    }

    // `%%EOF` in the trailer state emits EndOfFile and completes the parser.
    #[test]
    fn test_parse_eof() {
        let mut parser = IncrementalParser::new();
        parser.state = ParserState::InTrailer;

        parser.feed(b"%%EOF\n").unwrap();

        let events = parser.take_events();
        assert_eq!(events.len(), 1);
        assert!(matches!(events[0], ParseEvent::EndOfFile));
        assert!(parser.is_complete());
    }

    // The reader-driven helper forwards parser events to the callback.
    #[test]
    fn test_process_incrementally() {
        use std::io::Cursor;

        // Note: the final `%%EOF` is not followed by a line feed.
        let data = b"%PDF-1.7\n1 0 obj\n<< >>\nendobj\n%%EOF";
        let cursor = Cursor::new(data);

        let mut event_count = 0;
        process_incrementally(cursor, |event| {
            event_count += 1;
            match event {
                ParseEvent::Header { version } => assert_eq!(version, "1.7"),
                ParseEvent::ObjectStart { id, .. } => assert_eq!(id, 1),
                _ => {}
            }
            Ok(())
        })
        .unwrap();

        assert!(event_count > 0);
    }

    // Walks the state machine through header, object, xref, trailer and end of file.
    #[test]
    fn test_parser_state_transitions() {
        let mut parser = IncrementalParser::new();

        // The header is fed first; no state assertion follows it.
        parser.feed(b"%PDF-1.7\n").unwrap();

        // An object header enters the object state.
        parser.feed(b"1 0 obj\n").unwrap();
        assert!(matches!(parser.state, ParserState::InObject { .. }));

        // `endobj` returns to the initial state.
        parser.feed(b"endobj\n").unwrap();
        assert!(matches!(parser.state, ParserState::Initial));

        // `xref` enters the cross-reference state.
        parser.feed(b"xref\n").unwrap();
        assert!(matches!(parser.state, ParserState::InXRef));

        // `trailer` enters the trailer state.
        parser.feed(b"trailer\n").unwrap();
        assert!(matches!(parser.state, ParserState::InTrailer));

        // `%%EOF` completes parsing.
        parser.feed(b"%%EOF\n").unwrap();
        assert!(parser.is_complete());
    }

    // `Default` produces the same starting point as `new`.
    #[test]
    fn test_incremental_parser_default() {
        let parser = IncrementalParser::default();
        assert!(!parser.is_complete());
        assert!(matches!(parser.state, ParserState::Initial));
    }

    // Every ParseEvent variant has a non-empty Debug representation.
    #[test]
    fn test_parse_event_debug() {
        let events = vec![
            ParseEvent::Header {
                version: "1.7".to_string(),
            },
            ParseEvent::ObjectStart {
                id: 1,
                generation: 0,
            },
            ParseEvent::ObjectEnd {
                id: 1,
                generation: 0,
                object: PdfObject::Null,
            },
            ParseEvent::StreamData {
                object_id: 1,
                data: vec![1, 2, 3],
            },
            ParseEvent::XRef { entries: vec![] },
            ParseEvent::Trailer {
                dict: PdfDictionary::new(),
            },
            ParseEvent::EndOfFile,
        ];

        for event in events {
            let debug_str = format!("{event:?}");
            assert!(!debug_str.is_empty());
        }
    }

    // XRefEntry supports Debug formatting and field-preserving clones.
    #[test]
    fn test_xref_entry_debug_clone() {
        let entry = XRefEntry {
            object_number: 5,
            generation: 2,
            offset: 1024,
            in_use: true,
        };

        let debug_str = format!("{entry:?}");
        assert!(debug_str.contains("XRefEntry"));
        assert!(debug_str.contains("5"));

        let cloned = entry.clone();
        assert_eq!(cloned.object_number, entry.object_number);
        assert_eq!(cloned.generation, entry.generation);
        assert_eq!(cloned.offset, entry.offset);
        assert_eq!(cloned.in_use, entry.in_use);
    }

    // Every ParserState variant has a non-empty Debug representation.
    #[test]
    fn test_parser_state_debug() {
        let states = vec![
            ParserState::Initial,
            ParserState::InObject {
                id: 1,
                generation: 0,
            },
            ParserState::InStream { object_id: 2 },
            ParserState::InXRef,
            ParserState::InTrailer,
            ParserState::Complete,
        ];

        for state in states {
            let debug_str = format!("{state:?}");
            assert!(!debug_str.is_empty());
        }
    }

    // Feeding no bytes produces no events.
    #[test]
    fn test_feed_empty_data() {
        let mut parser = IncrementalParser::new();
        parser.feed(b"").unwrap();

        let events = parser.take_events();
        assert!(events.is_empty());
    }

    // A line split across two `feed` calls is only processed once its line feed arrives.
    #[test]
    fn test_feed_partial_lines() {
        let mut parser = IncrementalParser::new();

        // First half of the header: no line feed yet, so nothing is emitted.
        parser.feed(b"%PDF-").unwrap();
        let events1 = parser.take_events();
        assert!(events1.is_empty());

        // Second half completes the line.
        parser.feed(b"1.7\n").unwrap();
        let events2 = parser.take_events();
        assert_eq!(events2.len(), 1);

        match &events2[0] {
            ParseEvent::Header { version } => assert_eq!(version, "1.7"),
            _ => panic!("Expected Header event"),
        }
    }

    // Several lines in one chunk are all processed.
    #[test]
    fn test_feed_multiple_lines() {
        let mut parser = IncrementalParser::new();
        let data = b"%PDF-1.7\n1 0 obj\nendobj\n";

        parser.feed(data).unwrap();
        let events = parser.take_events();

        // Header, ObjectStart and ObjectEnd.
        assert!(events.len() >= 3);
    }

    // Well-formed `<id> <generation> obj` lines are parsed into their two numbers.
    #[test]
    fn test_parse_object_header_valid() {
        let parser = IncrementalParser::new();

        assert_eq!(parser.parse_object_header("1 0 obj"), Some((1, 0)));
        assert_eq!(parser.parse_object_header("42 5 obj"), Some((42, 5)));
        assert_eq!(
            parser.parse_object_header("999 65535 obj"),
            Some((999, 65535))
        );
    }

    // Missing tokens, a wrong keyword or non-numeric fields are rejected.
    #[test]
    fn test_parse_object_header_invalid() {
        let parser = IncrementalParser::new();

        assert_eq!(parser.parse_object_header("1 0"), None);
        assert_eq!(parser.parse_object_header("1 obj"), None);
        assert_eq!(parser.parse_object_header("obj"), None);
        assert_eq!(parser.parse_object_header("not an object"), None);
        assert_eq!(parser.parse_object_header("abc 0 obj"), None);
        assert_eq!(parser.parse_object_header("1 abc obj"), None);
    }

    // Free (`f`) and in-use (`n`) cross-reference rows are parsed, including zero padding.
    #[test]
    fn test_parse_xref_entry_valid() {
        let parser = IncrementalParser::new();

        let entry = parser.parse_xref_entry("0000000000 65535 f").unwrap();
        assert_eq!(entry.offset, 0);
        assert_eq!(entry.generation, 65535);
        assert!(!entry.in_use);

        let entry = parser.parse_xref_entry("0000001024 00000 n").unwrap();
        assert_eq!(entry.offset, 1024);
        assert_eq!(entry.generation, 0);
        assert!(entry.in_use);
    }

    // Rows with the wrong token count or non-numeric fields are rejected.
    #[test]
    fn test_parse_xref_entry_invalid() {
        let parser = IncrementalParser::new();

        assert!(parser.parse_xref_entry("invalid").is_none());
        assert!(parser.parse_xref_entry("123 456").is_none());
        assert!(parser.parse_xref_entry("abc def ghi").is_none());
        assert!(parser.parse_xref_entry("").is_none());
    }

    // `stream` inside an object switches to the stream state for the same object id.
    #[test]
    fn test_object_to_stream_transition() {
        let mut parser = IncrementalParser::new();
        parser.state = ParserState::InObject {
            id: 3,
            generation: 1,
        };

        parser.feed(b"stream\n").unwrap();
        assert!(matches!(
            parser.state,
            ParserState::InStream { object_id: 3 }
        ));
    }

    // Each payload line becomes its own StreamData event; `endstream` does not.
    #[test]
    fn test_stream_data_collection() {
        let mut parser = IncrementalParser::new();
        parser.state = ParserState::InStream { object_id: 5 };

        parser.feed(b"line1\nline2\nendstream\n").unwrap();
        let events = parser.take_events();

        let stream_events: Vec<_> = events
            .iter()
            .filter(|e| matches!(e, ParseEvent::StreamData { .. }))
            .collect();

        assert_eq!(stream_events.len(), 2);

        match &stream_events[0] {
            ParseEvent::StreamData { object_id, data } => {
                assert_eq!(*object_id, 5);
                assert_eq!(data, b"line1");
            }
            _ => panic!("Expected StreamData"),
        }
    }

    // `endstream` returns to the object state; the expected state carries id 7 and
    // generation 0.
    #[test]
    fn test_stream_to_object_transition() {
        let mut parser = IncrementalParser::new();
        parser.state = ParserState::InStream { object_id: 7 };

        parser.feed(b"endstream\n").unwrap();
        assert!(matches!(
            parser.state,
            ParserState::InObject {
                id: 7,
                generation: 0
            }
        ));
    }

    // `trailer` ends the cross-reference state.
    #[test]
    fn test_xref_to_trailer_transition() {
        let mut parser = IncrementalParser::new();
        parser.state = ParserState::InXRef;

        parser.feed(b"trailer\n").unwrap();
        assert!(matches!(parser.state, ParserState::InTrailer));
    }

    // Arbitrary text fed after completion produces no events.
    #[test]
    fn test_ignore_input_after_completion() {
        let mut parser = IncrementalParser::new();
        parser.state = ParserState::Complete;

        parser.feed(b"any additional input\n").unwrap();
        let events = parser.take_events();
        assert!(events.is_empty());
    }

    // A failing reader makes `process_incrementally` return an error.
    #[test]
    fn test_process_incrementally_with_io_error() {
        use std::io::Error;

        // Reader whose every `read` call fails.
        struct ErrorReader;

        impl Read for ErrorReader {
            fn read(&mut self, _buf: &mut [u8]) -> std::io::Result<usize> {
                Err(Error::other("Test error"))
            }
        }

        let reader = ErrorReader;
        let result = process_incrementally(reader, |_event| Ok(()));
        assert!(result.is_err());
    }

    // An error returned by the callback is propagated to the caller.
    #[test]
    fn test_process_incrementally_with_callback_error() {
        use std::io::Cursor;

        let data = b"%PDF-1.7\n";
        let cursor = Cursor::new(data);

        let result = process_incrementally(cursor, |_event| {
            Err(PdfError::ParseError("Callback error".to_string()))
        });

        assert!(result.is_err());
    }

    // An empty reader succeeds without ever invoking the callback.
    #[test]
    fn test_process_incrementally_empty_reader() {
        use std::io::Cursor;

        let data = b"";
        let cursor = Cursor::new(data);

        let mut event_count = 0;
        process_incrementally(cursor, |_event| {
            event_count += 1;
            Ok(())
        })
        .unwrap();

        assert_eq!(event_count, 0);
    }

    // `take_events` drains the queue, so a second call returns nothing.
    #[test]
    fn test_take_events_clears_buffer() {
        let mut parser = IncrementalParser::new();
        parser.feed(b"%PDF-1.7\n").unwrap();

        assert!(!parser.events.is_empty());

        let events = parser.take_events();
        assert_eq!(events.len(), 1);
        assert!(parser.events.is_empty());

        // Nothing new was fed in between.
        let events2 = parser.take_events();
        assert!(events2.is_empty());
    }

    // Whitespace around a line is removed before the line is interpreted.
    #[test]
    fn test_parser_with_whitespace_handling() {
        let mut parser = IncrementalParser::new();

        // Header padded with spaces on both sides.
        parser.feed(b" %PDF-1.7 \n").unwrap();
        let events = parser.take_events();

        match &events[0] {
            ParseEvent::Header { version } => assert_eq!(version, "1.7"),
            _ => panic!("Expected Header event"),
        }
    }

    // Non-zero generation numbers are reported in ObjectStart.
    #[test]
    fn test_object_parsing_with_generation() {
        let mut parser = IncrementalParser::new();
        parser.feed(b"123 456 obj\n").unwrap();

        let events = parser.take_events();
        match &events[0] {
            ParseEvent::ObjectStart { id, generation } => {
                assert_eq!(*id, 123);
                assert_eq!(*generation, 456);
            }
            _ => panic!("Expected ObjectStart event"),
        }
    }

    // A minimal but complete document produces the key events and completes the parser.
    #[test]
    fn test_complete_pdf_parsing_sequence() {
        let mut parser = IncrementalParser::new();

        let pdf_content = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\nendobj\nxref\n0 1\n0000000000 65535 f\ntrailer\n<< /Size 1 >>\n%%EOF\n";

        parser.feed(pdf_content).unwrap();
        let events = parser.take_events();

        // Only presence is checked, not the exact number or order of events.
        assert!(events
            .iter()
            .any(|e| matches!(e, ParseEvent::Header { .. })));
        assert!(events
            .iter()
            .any(|e| matches!(e, ParseEvent::ObjectStart { .. })));
        assert!(events
            .iter()
            .any(|e| matches!(e, ParseEvent::ObjectEnd { .. })));
        assert!(events.iter().any(|e| matches!(e, ParseEvent::EndOfFile)));

        assert!(parser.is_complete());
    }

    // An `xref` line switches to the cross-reference state even from inside an object.
    #[test]
    fn test_xref_state_from_any_state() {
        let mut parser = IncrementalParser::new();

        // Start from a state other than Initial.
        parser.state = ParserState::InObject {
            id: 1,
            generation: 0,
        };

        // `xref` is honoured regardless of the state the parser was in.
        parser.feed(b"xref\n").unwrap();
        assert!(matches!(parser.state, ParserState::InXRef));
    }

    // Unterminated input stays buffered and is removed once its line is completed.
    #[test]
    fn test_buffer_management() {
        let mut parser = IncrementalParser::new();

        // No line feed yet: the text must still be pending.
        parser.feed(b"partial").unwrap();
        assert!(parser.buffer.contains("partial"));

        // Completing the line consumes it.
        parser.feed(b" line\n").unwrap();

        // The processed line is no longer in the buffer.
        assert!(!parser.buffer.contains("partial"));
    }

    // Consecutive objects each produce their own ObjectStart, in input order.
    #[test]
    fn test_multiple_objects_in_sequence() {
        let mut parser = IncrementalParser::new();

        let content = b"1 0 obj\n<< >>\nendobj\n2 0 obj\n<< >>\nendobj\n";
        parser.feed(content).unwrap();

        let events = parser.take_events();

        let object_starts: Vec<_> = events
            .iter()
            .filter_map(|e| match e {
                ParseEvent::ObjectStart { id, generation } => Some((*id, *generation)),
                _ => None,
            })
            .collect();

        assert_eq!(object_starts.len(), 2);
        assert_eq!(object_starts[0], (1, 0));
        assert_eq!(object_starts[1], (2, 0));
    }
}