1use crate::error::{PdfError, Result};
7use crate::parser::{PdfDictionary, PdfObject};
8use std::io::Read;
9
10#[derive(Debug)]
12pub enum ParseEvent {
13 Header { version: String },
15 ObjectStart { id: u32, generation: u16 },
17 ObjectEnd {
19 id: u32,
20 generation: u16,
21 object: PdfObject,
22 },
23 StreamData { object_id: u32, data: Vec<u8> },
25 XRef { entries: Vec<XRefEntry> },
27 Trailer { dict: PdfDictionary },
29 EndOfFile,
31}
32
/// One row of a PDF cross-reference table.
///
/// Derives `PartialEq`/`Eq` so entries (and collections of them) can be
/// compared directly, e.g. in assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct XRefEntry {
    /// Number of the object this row describes.
    pub object_number: u32,
    /// Generation number recorded in the row.
    pub generation: u16,
    /// First numeric column of the row (the byte offset for in-use objects).
    pub offset: u64,
    /// `true` for rows flagged `n` (in use), `false` otherwise.
    pub in_use: bool,
}
41
42#[derive(Debug)]
44enum ParserState {
45 Initial,
46 InObject { id: u32, generation: u16 },
47 InStream { object_id: u32 },
48 InXRef,
49 InTrailer,
50 Complete,
51}
52
53pub struct IncrementalParser {
55 state: ParserState,
56 buffer: String,
57 #[allow(dead_code)]
58 line_buffer: String,
59 events: Vec<ParseEvent>,
60}
61
62impl Default for IncrementalParser {
63 fn default() -> Self {
64 Self::new()
65 }
66}
67
68impl IncrementalParser {
69 pub fn new() -> Self {
71 Self {
72 state: ParserState::Initial,
73 buffer: String::new(),
74 line_buffer: String::new(),
75 events: Vec::new(),
76 }
77 }
78
79 pub fn feed(&mut self, data: &[u8]) -> Result<()> {
81 let text = String::from_utf8_lossy(data);
82 self.buffer.push_str(&text);
83
84 while let Some(newline_pos) = self.buffer.find('\n') {
86 let line = self.buffer[..newline_pos].trim().to_string();
87 self.buffer.drain(..=newline_pos);
88
89 self.process_line(&line)?;
90 }
91
92 Ok(())
93 }
94
95 pub fn take_events(&mut self) -> Vec<ParseEvent> {
97 std::mem::take(&mut self.events)
98 }
99
100 pub fn is_complete(&self) -> bool {
102 matches!(self.state, ParserState::Complete)
103 }
104
105 fn process_line(&mut self, line: &str) -> Result<()> {
106 match &self.state {
107 ParserState::Initial => {
108 if let Some(version_part) = line.strip_prefix("%PDF-") {
109 let version = version_part.trim().to_string();
110 self.events.push(ParseEvent::Header { version });
111 } else if let Some((id, gen)) = self.parse_object_header(line) {
112 self.state = ParserState::InObject {
113 id,
114 generation: gen,
115 };
116 self.events.push(ParseEvent::ObjectStart {
117 id,
118 generation: gen,
119 });
120 }
121 }
122 ParserState::InObject { id, generation } => {
123 if line == "endobj" {
124 let object = PdfObject::Null;
126 self.events.push(ParseEvent::ObjectEnd {
127 id: *id,
128 generation: *generation,
129 object,
130 });
131 self.state = ParserState::Initial;
132 } else if line == "stream" {
133 self.state = ParserState::InStream { object_id: *id };
134 }
135 }
136 ParserState::InStream { object_id } => {
137 if line == "endstream" {
138 self.state = ParserState::InObject {
139 id: *object_id,
140 generation: 0,
141 };
142 } else {
143 self.events.push(ParseEvent::StreamData {
144 object_id: *object_id,
145 data: line.as_bytes().to_vec(),
146 });
147 }
148 }
149 ParserState::InXRef => {
150 if line == "trailer" {
151 self.state = ParserState::InTrailer;
152 } else if let Some(_entry) = self.parse_xref_entry(line) {
153 }
155 }
156 ParserState::InTrailer => {
157 if line.starts_with("%%EOF") {
158 self.events.push(ParseEvent::EndOfFile);
159 self.state = ParserState::Complete;
160 }
161 }
162 ParserState::Complete => {
163 }
165 }
166
167 if line == "xref" {
169 self.state = ParserState::InXRef;
170 }
171
172 Ok(())
173 }
174
175 fn parse_object_header(&self, line: &str) -> Option<(u32, u16)> {
176 let parts: Vec<&str> = line.split_whitespace().collect();
177 if parts.len() >= 3 && parts[2] == "obj" {
178 let id = parts[0].parse().ok()?;
179 let gen = parts[1].parse().ok()?;
180 Some((id, gen))
181 } else {
182 None
183 }
184 }
185
186 fn parse_xref_entry(&self, line: &str) -> Option<XRefEntry> {
187 let parts: Vec<&str> = line.split_whitespace().collect();
188 if parts.len() == 3 {
189 let offset = parts[0].parse().ok()?;
190 let generation = parts[1].parse().ok()?;
191 let in_use = parts[2] == "n";
192
193 Some(XRefEntry {
194 object_number: 0, generation,
196 offset,
197 in_use,
198 })
199 } else {
200 None
201 }
202 }
203}
204
205pub fn process_incrementally<R: Read, F>(mut reader: R, mut callback: F) -> Result<()>
207where
208 F: FnMut(ParseEvent) -> Result<()>,
209{
210 let mut parser = IncrementalParser::new();
211 let mut buffer = vec![0u8; 4096];
212
213 loop {
214 match reader.read(&mut buffer) {
215 Ok(0) => break, Ok(n) => {
217 parser.feed(&buffer[..n])?;
218
219 for event in parser.take_events() {
220 callback(event)?;
221 }
222 }
223 Err(e) => return Err(PdfError::Io(e)),
224 }
225 }
226
227 Ok(())
228}
229
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh parser is neither complete nor holding events.
    #[test]
    fn test_incremental_parser_creation() {
        let mut parser = IncrementalParser::new();
        assert!(!parser.is_complete());
        assert!(parser.take_events().is_empty());
    }

    #[test]
    fn test_parse_header() {
        let mut parser = IncrementalParser::new();
        parser.feed(b"%PDF-1.7\n").unwrap();

        let events = parser.take_events();
        assert_eq!(events.len(), 1);

        match &events[0] {
            ParseEvent::Header { version } => assert_eq!(version, "1.7"),
            _ => panic!("Expected Header event"),
        }
    }

    /// Input may be cut at arbitrary points; only complete lines produce events.
    #[test]
    fn test_feed_split_across_chunks() {
        let mut parser = IncrementalParser::new();

        parser.feed(b"%PDF").unwrap();
        assert!(parser.take_events().is_empty());

        parser.feed(b"-1.4\n1 0 ").unwrap();
        parser.feed(b"obj\n").unwrap();

        let events = parser.take_events();
        assert_eq!(events.len(), 2);
        assert!(matches!(&events[0], ParseEvent::Header { version } if version == "1.4"));
        assert!(matches!(
            events[1],
            ParseEvent::ObjectStart {
                id: 1,
                generation: 0
            }
        ));
    }

    #[test]
    fn test_parse_object() {
        let mut parser = IncrementalParser::new();
        let data = b"1 0 obj\n<< /Type /Catalog >>\nendobj\n";

        parser.feed(data).unwrap();

        let events = parser.take_events();
        assert_eq!(events.len(), 2);

        match &events[0] {
            ParseEvent::ObjectStart { id, generation } => {
                assert_eq!(*id, 1);
                assert_eq!(*generation, 0);
            }
            _ => panic!("Expected ObjectStart event"),
        }
        assert!(matches!(
            events[1],
            ParseEvent::ObjectEnd {
                id: 1,
                generation: 0,
                ..
            }
        ));
        assert!(matches!(parser.state, ParserState::Initial));
    }

    #[test]
    fn test_parse_stream() {
        let mut parser = IncrementalParser::new();
        parser.state = ParserState::InObject {
            id: 1,
            generation: 0,
        };

        let data = b"stream\nHello World\nendstream\n";
        parser.feed(data).unwrap();

        let events = parser.take_events();
        assert_eq!(events.len(), 1);
        match &events[0] {
            ParseEvent::StreamData { object_id, data } => {
                assert_eq!(*object_id, 1);
                assert_eq!(data.as_slice(), b"Hello World");
            }
            _ => panic!("Expected StreamData event"),
        }

        // `endstream` returns to the enclosing object.
        assert!(matches!(parser.state, ParserState::InObject { id: 1, .. }));
    }

    #[test]
    fn test_parse_eof() {
        let mut parser = IncrementalParser::new();
        parser.state = ParserState::InTrailer;

        parser.feed(b"%%EOF\n").unwrap();

        let events = parser.take_events();
        assert_eq!(events.len(), 1);
        assert!(matches!(events[0], ParseEvent::EndOfFile));
        assert!(parser.is_complete());
    }

    /// Once `%%EOF` has been seen, ordinary lines produce no further events.
    #[test]
    fn test_input_after_eof_is_ignored() {
        let mut parser = IncrementalParser::new();
        parser.state = ParserState::InTrailer;
        parser.feed(b"%%EOF\n").unwrap();
        parser.take_events();

        parser.feed(b"1 0 obj\nendobj\n").unwrap();
        assert!(parser.take_events().is_empty());
        assert!(parser.is_complete());
    }

    #[test]
    fn test_process_incrementally() {
        use std::io::Cursor;

        let data = b"%PDF-1.7\n1 0 obj\n<< >>\nendobj\n%%EOF";
        let cursor = Cursor::new(data);

        let mut saw_header = false;
        let mut saw_object_start = false;
        let mut saw_object_end = false;
        process_incrementally(cursor, |event| {
            match event {
                ParseEvent::Header { version } => {
                    assert_eq!(version, "1.7");
                    saw_header = true;
                }
                ParseEvent::ObjectStart { id, .. } => {
                    assert_eq!(id, 1);
                    saw_object_start = true;
                }
                ParseEvent::ObjectEnd { id, .. } => {
                    assert_eq!(id, 1);
                    saw_object_end = true;
                }
                _ => {}
            }
            Ok(())
        })
        .unwrap();

        assert!(saw_header);
        assert!(saw_object_start);
        assert!(saw_object_end);
    }

    /// A document with an xref table and trailer reaches the end-of-file event.
    #[test]
    fn test_process_incrementally_full_document() {
        use std::io::Cursor;

        let data: &[u8] = b"%PDF-1.7\n1 0 obj\n<< >>\nendobj\nxref\n0 1\n0000000000 65535 f \ntrailer\n<< /Size 1 >>\nstartxref\n0\n%%EOF\n";

        let mut saw_eof = false;
        process_incrementally(Cursor::new(data), |event| {
            if matches!(event, ParseEvent::EndOfFile) {
                saw_eof = true;
            }
            Ok(())
        })
        .unwrap();

        assert!(saw_eof);
    }

    #[test]
    fn test_parser_state_transitions() {
        let mut parser = IncrementalParser::new();

        parser.feed(b"%PDF-1.7\n").unwrap();
        assert!(matches!(parser.state, ParserState::Initial));

        parser.feed(b"1 0 obj\n").unwrap();
        assert!(matches!(parser.state, ParserState::InObject { .. }));

        parser.feed(b"endobj\n").unwrap();
        assert!(matches!(parser.state, ParserState::Initial));

        parser.feed(b"xref\n").unwrap();
        assert!(matches!(parser.state, ParserState::InXRef));

        parser.feed(b"trailer\n").unwrap();
        assert!(matches!(parser.state, ParserState::InTrailer));

        parser.feed(b"%%EOF\n").unwrap();
        assert!(parser.is_complete());
    }
}