oxidize_pdf/streaming/
incremental_parser.rs

1//! Incremental PDF parser for streaming operations
2//!
3//! Parses PDF objects incrementally as they are encountered in the stream,
4//! enabling processing of very large PDFs with minimal memory usage.
5
6use crate::error::{PdfError, Result};
7use crate::parser::{PdfDictionary, PdfObject};
8use std::io::Read;
9
10/// Events emitted during incremental parsing
11#[derive(Debug)]
12pub enum ParseEvent {
13    /// PDF header found
14    Header { version: String },
15    /// Object definition started
16    ObjectStart { id: u32, generation: u16 },
17    /// Object definition completed
18    ObjectEnd {
19        id: u32,
20        generation: u16,
21        object: PdfObject,
22    },
23    /// Stream data chunk
24    StreamData { object_id: u32, data: Vec<u8> },
25    /// Cross-reference table found
26    XRef { entries: Vec<XRefEntry> },
27    /// Trailer dictionary found
28    Trailer { dict: PdfDictionary },
29    /// End of file marker
30    EndOfFile,
31}
32
/// A single entry of a PDF cross-reference table.
///
/// Entries are plain data, so they can be cloned and compared for equality
/// as a whole instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct XRefEntry {
    /// Number of the object this entry describes.
    pub object_number: u32,
    /// Generation number recorded for the object.
    pub generation: u16,
    /// Offset field of the entry, as written in the table.
    pub offset: u64,
    /// `true` for an in-use (`n`) entry, `false` for a free (`f`) entry.
    pub in_use: bool,
}
41
/// Where the incremental parser currently is within the document.
#[derive(Debug)]
enum ParserState {
    /// Between top-level constructs; also the state a new parser starts in.
    Initial,
    /// Inside the body of the object identified by `id` / `generation`.
    InObject { id: u32, generation: u16 },
    /// Between `stream` and `endstream` of the object `object_id`.
    InStream { object_id: u32 },
    /// Inside a cross-reference table (after an `xref` line).
    InXRef,
    /// After the `trailer` keyword, waiting for `%%EOF`.
    InTrailer,
    /// `%%EOF` has been seen.
    Complete,
}
52
53/// Incremental PDF parser
54pub struct IncrementalParser {
55    state: ParserState,
56    buffer: String,
57    #[allow(dead_code)]
58    line_buffer: String,
59    events: Vec<ParseEvent>,
60}
61
62impl Default for IncrementalParser {
63    fn default() -> Self {
64        Self::new()
65    }
66}
67
68impl IncrementalParser {
69    /// Create a new incremental parser
70    pub fn new() -> Self {
71        Self {
72            state: ParserState::Initial,
73            buffer: String::new(),
74            line_buffer: String::new(),
75            events: Vec::new(),
76        }
77    }
78
79    /// Feed data to the parser
80    pub fn feed(&mut self, data: &[u8]) -> Result<()> {
81        let text = String::from_utf8_lossy(data);
82        self.buffer.push_str(&text);
83
84        // Process complete lines
85        while let Some(newline_pos) = self.buffer.find('\n') {
86            let line = self.buffer[..newline_pos].trim().to_string();
87            self.buffer.drain(..=newline_pos);
88
89            self.process_line(&line)?;
90        }
91
92        Ok(())
93    }
94
95    /// Get pending events
96    pub fn take_events(&mut self) -> Vec<ParseEvent> {
97        std::mem::take(&mut self.events)
98    }
99
100    /// Check if parsing is complete
101    pub fn is_complete(&self) -> bool {
102        matches!(self.state, ParserState::Complete)
103    }
104
105    fn process_line(&mut self, line: &str) -> Result<()> {
106        match &self.state {
107            ParserState::Initial => {
108                if let Some(version_part) = line.strip_prefix("%PDF-") {
109                    let version = version_part.trim().to_string();
110                    self.events.push(ParseEvent::Header { version });
111                } else if let Some((id, gen)) = self.parse_object_header(line) {
112                    self.state = ParserState::InObject {
113                        id,
114                        generation: gen,
115                    };
116                    self.events.push(ParseEvent::ObjectStart {
117                        id,
118                        generation: gen,
119                    });
120                }
121            }
122            ParserState::InObject { id, generation } => {
123                if line == "endobj" {
124                    // Create mock object for demonstration
125                    let object = PdfObject::Null;
126                    self.events.push(ParseEvent::ObjectEnd {
127                        id: *id,
128                        generation: *generation,
129                        object,
130                    });
131                    self.state = ParserState::Initial;
132                } else if line == "stream" {
133                    self.state = ParserState::InStream { object_id: *id };
134                }
135            }
136            ParserState::InStream { object_id } => {
137                if line == "endstream" {
138                    self.state = ParserState::InObject {
139                        id: *object_id,
140                        generation: 0,
141                    };
142                } else {
143                    self.events.push(ParseEvent::StreamData {
144                        object_id: *object_id,
145                        data: line.as_bytes().to_vec(),
146                    });
147                }
148            }
149            ParserState::InXRef => {
150                if line == "trailer" {
151                    self.state = ParserState::InTrailer;
152                } else if let Some(_entry) = self.parse_xref_entry(line) {
153                    // Collect entries
154                }
155            }
156            ParserState::InTrailer => {
157                if line.starts_with("%%EOF") {
158                    self.events.push(ParseEvent::EndOfFile);
159                    self.state = ParserState::Complete;
160                }
161            }
162            ParserState::Complete => {
163                // Ignore additional input
164            }
165        }
166
167        // Check for state transitions
168        if line == "xref" {
169            self.state = ParserState::InXRef;
170        }
171
172        Ok(())
173    }
174
175    fn parse_object_header(&self, line: &str) -> Option<(u32, u16)> {
176        let parts: Vec<&str> = line.split_whitespace().collect();
177        if parts.len() >= 3 && parts[2] == "obj" {
178            let id = parts[0].parse().ok()?;
179            let gen = parts[1].parse().ok()?;
180            Some((id, gen))
181        } else {
182            None
183        }
184    }
185
186    fn parse_xref_entry(&self, line: &str) -> Option<XRefEntry> {
187        let parts: Vec<&str> = line.split_whitespace().collect();
188        if parts.len() == 3 {
189            let offset = parts[0].parse().ok()?;
190            let generation = parts[1].parse().ok()?;
191            let in_use = parts[2] == "n";
192
193            Some(XRefEntry {
194                object_number: 0, // Would be set by context
195                generation,
196                offset,
197                in_use,
198            })
199        } else {
200            None
201        }
202    }
203}
204
205/// Process a reader incrementally
206pub fn process_incrementally<R: Read, F>(mut reader: R, mut callback: F) -> Result<()>
207where
208    F: FnMut(ParseEvent) -> Result<()>,
209{
210    let mut parser = IncrementalParser::new();
211    let mut buffer = vec![0u8; 4096];
212
213    loop {
214        match reader.read(&mut buffer) {
215            Ok(0) => break, // EOF
216            Ok(n) => {
217                parser.feed(&buffer[..n])?;
218
219                for event in parser.take_events() {
220                    callback(event)?;
221                }
222            }
223            Err(e) => return Err(PdfError::Io(e)),
224        }
225    }
226
227    Ok(())
228}
229
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a parser that starts out in `state`.
    fn parser_in(state: ParserState) -> IncrementalParser {
        let mut parser = IncrementalParser::new();
        parser.state = state;
        parser
    }

    /// Feeds `input` to `parser` and returns the events queued afterwards.
    fn feed_and_take(parser: &mut IncrementalParser, input: &[u8]) -> Vec<ParseEvent> {
        parser.feed(input).unwrap();
        parser.take_events()
    }

    #[test]
    fn test_incremental_parser_creation() {
        assert!(!IncrementalParser::new().is_complete());
    }

    #[test]
    fn test_parse_header() {
        let mut parser = IncrementalParser::new();
        let events = feed_and_take(&mut parser, b"%PDF-1.7\n");
        assert_eq!(events.len(), 1);

        let ParseEvent::Header { version } = &events[0] else {
            panic!("Expected Header event");
        };
        assert_eq!(version, "1.7");
    }

    #[test]
    fn test_parse_object() {
        let mut parser = IncrementalParser::new();
        let events = feed_and_take(&mut parser, b"1 0 obj\n<< /Type /Catalog >>\nendobj\n");
        assert!(events.len() >= 2);

        let ParseEvent::ObjectStart { id, generation } = &events[0] else {
            panic!("Expected ObjectStart event");
        };
        assert_eq!(*id, 1);
        assert_eq!(*generation, 0);
    }

    #[test]
    fn test_parse_stream() {
        let mut parser = parser_in(ParserState::InObject {
            id: 1,
            generation: 0,
        });

        let events = feed_and_take(&mut parser, b"stream\nHello World\nendstream\n");
        let saw_stream_data = events
            .iter()
            .any(|e| matches!(e, ParseEvent::StreamData { .. }));
        assert!(saw_stream_data);
    }

    #[test]
    fn test_parse_eof() {
        let mut parser = parser_in(ParserState::InTrailer);

        let events = feed_and_take(&mut parser, b"%%EOF\n");
        assert_eq!(events.len(), 1);
        assert!(matches!(events[0], ParseEvent::EndOfFile));
        assert!(parser.is_complete());
    }

    #[test]
    fn test_process_incrementally() {
        use std::io::Cursor;

        let input = Cursor::new(b"%PDF-1.7\n1 0 obj\n<< >>\nendobj\n%%EOF");

        let mut seen = 0;
        let outcome = process_incrementally(input, |event| {
            seen += 1;
            match event {
                ParseEvent::Header { version } => assert_eq!(version, "1.7"),
                ParseEvent::ObjectStart { id, .. } => assert_eq!(id, 1),
                _ => {}
            }
            Ok(())
        });
        outcome.unwrap();

        assert!(seen > 0);
    }

    #[test]
    fn test_parser_state_transitions() {
        let mut parser = IncrementalParser::new();

        // The header leaves the parser where it was; the object header enters the object.
        parser.feed(b"%PDF-1.7\n").unwrap();
        parser.feed(b"1 0 obj\n").unwrap();
        assert!(matches!(parser.state, ParserState::InObject { .. }));

        // `endobj` returns to the initial state.
        parser.feed(b"endobj\n").unwrap();
        assert!(matches!(parser.state, ParserState::Initial));

        // `xref` opens the cross-reference table.
        parser.feed(b"xref\n").unwrap();
        assert!(matches!(parser.state, ParserState::InXRef));

        // `trailer` follows the table.
        parser.feed(b"trailer\n").unwrap();
        assert!(matches!(parser.state, ParserState::InTrailer));

        // `%%EOF` completes parsing.
        parser.feed(b"%%EOF\n").unwrap();
        assert!(parser.is_complete());
    }

    #[test]
    fn test_incremental_parser_default() {
        let parser = IncrementalParser::default();
        assert!(matches!(parser.state, ParserState::Initial));
        assert!(!parser.is_complete());
    }

    #[test]
    fn test_parse_event_debug() {
        let samples = vec![
            ParseEvent::Header {
                version: "1.7".to_string(),
            },
            ParseEvent::ObjectStart {
                id: 1,
                generation: 0,
            },
            ParseEvent::ObjectEnd {
                id: 1,
                generation: 0,
                object: PdfObject::Null,
            },
            ParseEvent::StreamData {
                object_id: 1,
                data: vec![1, 2, 3],
            },
            ParseEvent::XRef { entries: vec![] },
            ParseEvent::Trailer {
                dict: PdfDictionary::new(),
            },
            ParseEvent::EndOfFile,
        ];

        for sample in &samples {
            let rendered = format!("{sample:?}");
            assert!(!rendered.is_empty());
        }
    }

    #[test]
    fn test_xref_entry_debug_clone() {
        let original = XRefEntry {
            object_number: 5,
            generation: 2,
            offset: 1024,
            in_use: true,
        };

        let rendered = format!("{original:?}");
        assert!(rendered.contains("XRefEntry"));
        assert!(rendered.contains("5"));

        let duplicate = original.clone();
        assert_eq!(duplicate.object_number, original.object_number);
        assert_eq!(duplicate.generation, original.generation);
        assert_eq!(duplicate.offset, original.offset);
        assert_eq!(duplicate.in_use, original.in_use);
    }

    #[test]
    fn test_parser_state_debug() {
        let samples = vec![
            ParserState::Initial,
            ParserState::InObject {
                id: 1,
                generation: 0,
            },
            ParserState::InStream { object_id: 2 },
            ParserState::InXRef,
            ParserState::InTrailer,
            ParserState::Complete,
        ];

        for sample in &samples {
            let rendered = format!("{sample:?}");
            assert!(!rendered.is_empty());
        }
    }

    #[test]
    fn test_feed_empty_data() {
        let mut parser = IncrementalParser::new();
        let events = feed_and_take(&mut parser, b"");
        assert!(events.is_empty());
    }

    #[test]
    fn test_feed_partial_lines() {
        let mut parser = IncrementalParser::new();

        // Without a newline nothing can be reported yet.
        let before = feed_and_take(&mut parser, b"%PDF-");
        assert!(before.is_empty());

        // The rest of the line releases the header event.
        let after = feed_and_take(&mut parser, b"1.7\n");
        assert_eq!(after.len(), 1);

        let ParseEvent::Header { version } = &after[0] else {
            panic!("Expected Header event");
        };
        assert_eq!(version, "1.7");
    }

    #[test]
    fn test_feed_multiple_lines() {
        let mut parser = IncrementalParser::new();
        let events = feed_and_take(&mut parser, b"%PDF-1.7\n1 0 obj\nendobj\n");

        // Header, ObjectStart, ObjectEnd
        assert!(events.len() >= 3);
    }

    #[test]
    fn test_parse_object_header_valid() {
        let parser = IncrementalParser::new();

        let cases = [
            ("1 0 obj", (1, 0)),
            ("42 5 obj", (42, 5)),
            ("999 65535 obj", (999, 65535)),
        ];
        for (line, expected) in cases {
            assert_eq!(parser.parse_object_header(line), Some(expected));
        }
    }

    #[test]
    fn test_parse_object_header_invalid() {
        let parser = IncrementalParser::new();

        let rejected = [
            "1 0",
            "1 obj",
            "obj",
            "not an object",
            "abc 0 obj",
            "1 abc obj",
        ];
        for line in rejected {
            assert_eq!(parser.parse_object_header(line), None);
        }
    }

    #[test]
    fn test_parse_xref_entry_valid() {
        let parser = IncrementalParser::new();

        let free = parser.parse_xref_entry("0000000000 65535 f").unwrap();
        assert_eq!(free.offset, 0);
        assert_eq!(free.generation, 65535);
        assert!(!free.in_use);

        let used = parser.parse_xref_entry("0000001024 00000 n").unwrap();
        assert_eq!(used.offset, 1024);
        assert_eq!(used.generation, 0);
        assert!(used.in_use);
    }

    #[test]
    fn test_parse_xref_entry_invalid() {
        let parser = IncrementalParser::new();

        for line in ["invalid", "123 456", "abc def ghi", ""] {
            assert!(parser.parse_xref_entry(line).is_none());
        }
    }

    #[test]
    fn test_object_to_stream_transition() {
        let mut parser = parser_in(ParserState::InObject {
            id: 3,
            generation: 1,
        });

        parser.feed(b"stream\n").unwrap();
        assert!(matches!(
            parser.state,
            ParserState::InStream { object_id: 3 }
        ));
    }

    #[test]
    fn test_stream_data_collection() {
        let mut parser = parser_in(ParserState::InStream { object_id: 5 });
        let events = feed_and_take(&mut parser, b"line1\nline2\nendstream\n");

        let chunks: Vec<(u32, &[u8])> = events
            .iter()
            .filter_map(|e| match e {
                ParseEvent::StreamData { object_id, data } => Some((*object_id, data.as_slice())),
                _ => None,
            })
            .collect();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].0, 5);
        assert_eq!(chunks[0].1, b"line1");
    }

    #[test]
    fn test_stream_to_object_transition() {
        let mut parser = parser_in(ParserState::InStream { object_id: 7 });

        parser.feed(b"endstream\n").unwrap();
        assert!(matches!(
            parser.state,
            ParserState::InObject {
                id: 7,
                generation: 0
            }
        ));
    }

    #[test]
    fn test_xref_to_trailer_transition() {
        let mut parser = parser_in(ParserState::InXRef);

        parser.feed(b"trailer\n").unwrap();
        assert!(matches!(parser.state, ParserState::InTrailer));
    }

    #[test]
    fn test_ignore_input_after_completion() {
        let mut parser = parser_in(ParserState::Complete);

        let events = feed_and_take(&mut parser, b"any additional input\n");
        assert!(events.is_empty());
    }

    #[test]
    fn test_process_incrementally_with_io_error() {
        use std::io::Error;

        /// Reader whose every read fails.
        struct ErrorReader;

        impl Read for ErrorReader {
            fn read(&mut self, _buf: &mut [u8]) -> std::io::Result<usize> {
                Err(Error::other("Test error"))
            }
        }

        let outcome = process_incrementally(ErrorReader, |_event| Ok(()));
        assert!(outcome.is_err());
    }

    #[test]
    fn test_process_incrementally_with_callback_error() {
        use std::io::Cursor;

        let input = Cursor::new(b"%PDF-1.7\n");

        let outcome = process_incrementally(input, |_event| {
            Err(PdfError::ParseError("Callback error".to_string()))
        });

        assert!(outcome.is_err());
    }

    #[test]
    fn test_process_incrementally_empty_reader() {
        use std::io::Cursor;

        let input = Cursor::new(b"");

        let mut seen = 0;
        let outcome = process_incrementally(input, |_event| {
            seen += 1;
            Ok(())
        });
        outcome.unwrap();

        assert_eq!(seen, 0);
    }

    #[test]
    fn test_take_events_clears_buffer() {
        let mut parser = IncrementalParser::new();
        parser.feed(b"%PDF-1.7\n").unwrap();
        assert!(!parser.events.is_empty());

        let first = parser.take_events();
        assert_eq!(first.len(), 1);
        assert!(parser.events.is_empty());

        // Nothing is left for a second take.
        let second = parser.take_events();
        assert!(second.is_empty());
    }

    #[test]
    fn test_parser_with_whitespace_handling() {
        let mut parser = IncrementalParser::new();

        // Leading and trailing blanks around the header must not matter.
        let events = feed_and_take(&mut parser, b"   %PDF-1.7   \n");

        let ParseEvent::Header { version } = &events[0] else {
            panic!("Expected Header event");
        };
        assert_eq!(version, "1.7");
    }

    #[test]
    fn test_object_parsing_with_generation() {
        let mut parser = IncrementalParser::new();
        let events = feed_and_take(&mut parser, b"123 456 obj\n");

        let ParseEvent::ObjectStart { id, generation } = &events[0] else {
            panic!("Expected ObjectStart event");
        };
        assert_eq!(*id, 123);
        assert_eq!(*generation, 456);
    }

    #[test]
    fn test_complete_pdf_parsing_sequence() {
        let mut parser = IncrementalParser::new();

        let pdf_content = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\nendobj\nxref\n0 1\n0000000000 65535 f\ntrailer\n<< /Size 1 >>\n%%EOF\n";
        let events = feed_and_take(&mut parser, pdf_content);

        // Header, object start, object end and end-of-file must all be reported.
        let has = |wanted: fn(&ParseEvent) -> bool| events.iter().any(wanted);
        assert!(has(|e| matches!(e, ParseEvent::Header { .. })));
        assert!(has(|e| matches!(e, ParseEvent::ObjectStart { .. })));
        assert!(has(|e| matches!(e, ParseEvent::ObjectEnd { .. })));
        assert!(has(|e| matches!(e, ParseEvent::EndOfFile)));

        assert!(parser.is_complete());
    }

    #[test]
    fn test_xref_state_from_any_state() {
        // Begin inside an object rather than at top level.
        let mut parser = parser_in(ParserState::InObject {
            id: 1,
            generation: 0,
        });

        // An `xref` line still switches to the cross-reference table.
        parser.feed(b"xref\n").unwrap();
        assert!(matches!(parser.state, ParserState::InXRef));
    }

    #[test]
    fn test_buffer_management() {
        let mut parser = IncrementalParser::new();

        // Input without a newline is retained.
        parser.feed(b"partial").unwrap();
        assert!(parser.buffer.contains("partial"));

        // Completing the line lets the parser consume it.
        parser.feed(b" line\n").unwrap();
        assert!(!parser.buffer.contains("partial"));
    }

    #[test]
    fn test_multiple_objects_in_sequence() {
        let mut parser = IncrementalParser::new();
        let events = feed_and_take(
            &mut parser,
            b"1 0 obj\n<< >>\nendobj\n2 0 obj\n<< >>\nendobj\n",
        );

        let mut object_starts = Vec::new();
        for event in &events {
            if let ParseEvent::ObjectStart { id, generation } = event {
                object_starts.push((*id, *generation));
            }
        }

        assert_eq!(object_starts.len(), 2);
        assert_eq!(object_starts[0], (1, 0));
        assert_eq!(object_starts[1], (2, 0));
    }
}