oxidize_pdf/streaming/
incremental_parser.rs

1//! Incremental PDF parser for streaming operations
2//!
3//! Parses PDF objects incrementally as they are encountered in the stream,
4//! enabling processing of very large PDFs with minimal memory usage.
5
6use crate::error::{PdfError, Result};
7use crate::parser::{PdfDictionary, PdfObject};
8use std::io::Read;
9
10/// Events emitted during incremental parsing
11#[derive(Debug)]
12pub enum ParseEvent {
13    /// PDF header found
14    Header { version: String },
15    /// Object definition started
16    ObjectStart { id: u32, generation: u16 },
17    /// Object definition completed
18    ObjectEnd {
19        id: u32,
20        generation: u16,
21        object: PdfObject,
22    },
23    /// Stream data chunk
24    StreamData { object_id: u32, data: Vec<u8> },
25    /// Cross-reference table found
26    XRef { entries: Vec<XRefEntry> },
27    /// Trailer dictionary found
28    Trailer { dict: PdfDictionary },
29    /// End of file marker
30    EndOfFile,
31}
32
/// One row of a cross-reference table.
#[derive(Debug, Clone)]
pub struct XRefEntry {
    /// Number of the object this row describes.
    pub object_number: u32,
    /// Generation number recorded for the object.
    pub generation: u16,
    /// Offset field of the row, exactly as written in the table.
    pub offset: u64,
    /// `true` for rows flagged `n` (in use); `false` otherwise.
    pub in_use: bool,
}
41
42/// State of the incremental parser
43#[derive(Debug)]
44enum ParserState {
45    Initial,
46    InObject { id: u32, generation: u16 },
47    InStream { object_id: u32 },
48    InXRef,
49    InTrailer,
50    Complete,
51}
52
53/// Incremental PDF parser
54pub struct IncrementalParser {
55    state: ParserState,
56    buffer: String,
57    #[allow(dead_code)]
58    line_buffer: String,
59    events: Vec<ParseEvent>,
60}
61
62impl Default for IncrementalParser {
63    fn default() -> Self {
64        Self::new()
65    }
66}
67
68impl IncrementalParser {
69    /// Create a new incremental parser
70    pub fn new() -> Self {
71        Self {
72            state: ParserState::Initial,
73            buffer: String::new(),
74            line_buffer: String::new(),
75            events: Vec::new(),
76        }
77    }
78
79    /// Feed data to the parser
80    pub fn feed(&mut self, data: &[u8]) -> Result<()> {
81        let text = String::from_utf8_lossy(data);
82        self.buffer.push_str(&text);
83
84        // Process complete lines
85        while let Some(newline_pos) = self.buffer.find('\n') {
86            let line = self.buffer[..newline_pos].trim().to_string();
87            self.buffer.drain(..=newline_pos);
88
89            self.process_line(&line)?;
90        }
91
92        Ok(())
93    }
94
95    /// Get pending events
96    pub fn take_events(&mut self) -> Vec<ParseEvent> {
97        std::mem::take(&mut self.events)
98    }
99
100    /// Check if parsing is complete
101    pub fn is_complete(&self) -> bool {
102        matches!(self.state, ParserState::Complete)
103    }
104
105    fn process_line(&mut self, line: &str) -> Result<()> {
106        match &self.state {
107            ParserState::Initial => {
108                if let Some(version_part) = line.strip_prefix("%PDF-") {
109                    let version = version_part.trim().to_string();
110                    self.events.push(ParseEvent::Header { version });
111                } else if let Some((id, gen)) = self.parse_object_header(line) {
112                    self.state = ParserState::InObject {
113                        id,
114                        generation: gen,
115                    };
116                    self.events.push(ParseEvent::ObjectStart {
117                        id,
118                        generation: gen,
119                    });
120                }
121            }
122            ParserState::InObject { id, generation } => {
123                if line == "endobj" {
124                    // Create mock object for demonstration
125                    let object = PdfObject::Null;
126                    self.events.push(ParseEvent::ObjectEnd {
127                        id: *id,
128                        generation: *generation,
129                        object,
130                    });
131                    self.state = ParserState::Initial;
132                } else if line == "stream" {
133                    self.state = ParserState::InStream { object_id: *id };
134                }
135            }
136            ParserState::InStream { object_id } => {
137                if line == "endstream" {
138                    self.state = ParserState::InObject {
139                        id: *object_id,
140                        generation: 0,
141                    };
142                } else {
143                    self.events.push(ParseEvent::StreamData {
144                        object_id: *object_id,
145                        data: line.as_bytes().to_vec(),
146                    });
147                }
148            }
149            ParserState::InXRef => {
150                if line == "trailer" {
151                    self.state = ParserState::InTrailer;
152                } else if let Some(_entry) = self.parse_xref_entry(line) {
153                    // Collect entries
154                }
155            }
156            ParserState::InTrailer => {
157                if line.starts_with("%%EOF") {
158                    self.events.push(ParseEvent::EndOfFile);
159                    self.state = ParserState::Complete;
160                }
161            }
162            ParserState::Complete => {
163                // Ignore additional input
164            }
165        }
166
167        // Check for state transitions
168        if line == "xref" {
169            self.state = ParserState::InXRef;
170        }
171
172        Ok(())
173    }
174
175    fn parse_object_header(&self, line: &str) -> Option<(u32, u16)> {
176        let parts: Vec<&str> = line.split_whitespace().collect();
177        if parts.len() >= 3 && parts[2] == "obj" {
178            let id = parts[0].parse().ok()?;
179            let gen = parts[1].parse().ok()?;
180            Some((id, gen))
181        } else {
182            None
183        }
184    }
185
186    fn parse_xref_entry(&self, line: &str) -> Option<XRefEntry> {
187        let parts: Vec<&str> = line.split_whitespace().collect();
188        if parts.len() == 3 {
189            let offset = parts[0].parse().ok()?;
190            let generation = parts[1].parse().ok()?;
191            let in_use = parts[2] == "n";
192
193            Some(XRefEntry {
194                object_number: 0, // Would be set by context
195                generation,
196                offset,
197                in_use,
198            })
199        } else {
200            None
201        }
202    }
203}
204
205/// Process a reader incrementally
206pub fn process_incrementally<R: Read, F>(mut reader: R, mut callback: F) -> Result<()>
207where
208    F: FnMut(ParseEvent) -> Result<()>,
209{
210    let mut parser = IncrementalParser::new();
211    let mut buffer = vec![0u8; 4096];
212
213    loop {
214        match reader.read(&mut buffer) {
215            Ok(0) => break, // EOF
216            Ok(n) => {
217                parser.feed(&buffer[..n])?;
218
219                for event in parser.take_events() {
220                    callback(event)?;
221                }
222            }
223            Err(e) => return Err(PdfError::Io(e)),
224        }
225    }
226
227    Ok(())
228}
229
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly built parser has produced nothing and is not complete.
    #[test]
    fn test_incremental_parser_creation() {
        let mut parser = IncrementalParser::new();
        assert!(!parser.is_complete());
        assert!(parser.take_events().is_empty());
    }

    /// The header line yields exactly one `Header` event with the version.
    #[test]
    fn test_parse_header() {
        let mut parser = IncrementalParser::new();
        parser.feed(b"%PDF-1.7\n").unwrap();

        let events = parser.take_events();
        assert_eq!(events.len(), 1);

        match &events[0] {
            ParseEvent::Header { version } => assert_eq!(version, "1.7"),
            _ => panic!("Expected Header event"),
        }
        assert!(!parser.is_complete());
    }

    /// A line is only processed once its newline has arrived, however the
    /// input is split across `feed` calls.
    #[test]
    fn test_feed_split_across_chunks() {
        let mut parser = IncrementalParser::new();

        parser.feed(b"%PDF-").unwrap();
        assert!(parser.take_events().is_empty());

        parser.feed(b"1.7\n").unwrap();
        let events = parser.take_events();
        assert_eq!(events.len(), 1);
        match &events[0] {
            ParseEvent::Header { version } => assert_eq!(version, "1.7"),
            _ => panic!("Expected Header event"),
        }
    }

    /// An object produces a start event and a matching end event.
    #[test]
    fn test_parse_object() {
        let mut parser = IncrementalParser::new();
        let data = b"1 0 obj\n<< /Type /Catalog >>\nendobj\n";

        parser.feed(data).unwrap();

        let events = parser.take_events();
        assert_eq!(events.len(), 2);

        match &events[0] {
            ParseEvent::ObjectStart { id, generation } => {
                assert_eq!(*id, 1);
                assert_eq!(*generation, 0);
            }
            _ => panic!("Expected ObjectStart event"),
        }
        assert!(matches!(
            events[1],
            ParseEvent::ObjectEnd {
                id: 1,
                generation: 0,
                ..
            }
        ));
        assert!(matches!(parser.state, ParserState::Initial));
    }

    /// Stream lines are reported as data for the owning object, and
    /// `endstream` returns the parser to that object.
    #[test]
    fn test_parse_stream() {
        let mut parser = IncrementalParser::new();
        parser.state = ParserState::InObject {
            id: 1,
            generation: 0,
        };

        let data = b"stream\nHello World\nendstream\n";
        parser.feed(data).unwrap();

        let events = parser.take_events();
        assert_eq!(events.len(), 1);
        match &events[0] {
            ParseEvent::StreamData { object_id, data } => {
                assert_eq!(*object_id, 1);
                assert_eq!(data.as_slice(), &b"Hello World"[..]);
            }
            _ => panic!("Expected StreamData event"),
        }
        assert!(matches!(
            parser.state,
            ParserState::InObject {
                id: 1,
                generation: 0
            }
        ));
    }

    /// `%%EOF` after the trailer completes the parse.
    #[test]
    fn test_parse_eof() {
        let mut parser = IncrementalParser::new();
        parser.state = ParserState::InTrailer;

        parser.feed(b"%%EOF\n").unwrap();

        let events = parser.take_events();
        assert_eq!(events.len(), 1);
        assert!(matches!(events[0], ParseEvent::EndOfFile));
        assert!(parser.is_complete());
    }

    /// The reader-driven entry point delivers events to the callback in
    /// document order, ending with `EndOfFile`.
    #[test]
    fn test_process_incrementally() {
        use std::io::Cursor;

        let data = b"%PDF-1.7\n1 0 obj\n<< >>\nendobj\nxref\ntrailer\n%%EOF\n";
        let cursor = Cursor::new(&data[..]);

        let mut seen = Vec::new();
        process_incrementally(cursor, |event| {
            seen.push(event);
            Ok(())
        })
        .unwrap();

        assert!(seen.len() >= 4);
        match &seen[0] {
            ParseEvent::Header { version } => assert_eq!(version, "1.7"),
            _ => panic!("Expected Header event"),
        }
        assert!(matches!(
            seen[1],
            ParseEvent::ObjectStart {
                id: 1,
                generation: 0
            }
        ));
        assert!(matches!(
            seen[2],
            ParseEvent::ObjectEnd {
                id: 1,
                generation: 0,
                ..
            }
        ));
        assert!(matches!(seen.last(), Some(ParseEvent::EndOfFile)));
    }

    /// Walks the state machine through a minimal document.
    #[test]
    fn test_parser_state_transitions() {
        let mut parser = IncrementalParser::new();

        // Initial -> Header
        parser.feed(b"%PDF-1.7\n").unwrap();
        assert!(matches!(parser.state, ParserState::Initial));

        // Header -> Object
        parser.feed(b"1 0 obj\n").unwrap();
        assert!(matches!(parser.state, ParserState::InObject { .. }));

        // Object -> Initial
        parser.feed(b"endobj\n").unwrap();
        assert!(matches!(parser.state, ParserState::Initial));

        // Initial -> XRef
        parser.feed(b"xref\n").unwrap();
        assert!(matches!(parser.state, ParserState::InXRef));

        // XRef -> Trailer
        parser.feed(b"trailer\n").unwrap();
        assert!(matches!(parser.state, ParserState::InTrailer));

        // Trailer -> Complete
        parser.feed(b"%%EOF\n").unwrap();
        assert!(parser.is_complete());
    }
}