Skip to main content

oxidize_pdf/streaming/
page_streamer.rs

1//! Page streaming for incremental page processing
2//!
3//! Provides efficient streaming of PDF pages without loading the entire
4//! document structure into memory.
5
6use crate::error::Result;
7use std::io::{Read, Seek};
8
/// Describes a single PDF page handled by the streaming API.
///
/// Carries the page index, its dimensions and the recorded location of its
/// content, so a page can be handed around on its own.
#[derive(Clone, Debug)]
pub struct StreamingPage {
    /// Page index; the `number()` accessor documents it as 0-indexed.
    pub(crate) number: u32,
    /// Page width; the `width()` accessor documents it as points.
    pub(crate) width: f64,
    /// Page height; the `height()` accessor documents it as points.
    pub(crate) height: f64,
    /// Offset of the page content — presumably a byte offset into the
    /// underlying reader (TODO confirm). Not read by any non-test code
    /// visible in this file, hence the lint allowance.
    #[allow(dead_code)]
    pub(crate) content_offset: u64,
    /// Length of the page content — presumably in bytes (TODO confirm).
    /// Not read by any non-test code visible in this file, hence the lint
    /// allowance.
    #[allow(dead_code)]
    pub(crate) content_length: usize,
}
20
21impl StreamingPage {
22    /// Creates a new StreamingPage for testing purposes
23    #[doc(hidden)]
24    pub fn new_for_test(
25        number: u32,
26        width: f64,
27        height: f64,
28        content_offset: u64,
29        content_length: usize,
30    ) -> Self {
31        Self {
32            number,
33            width,
34            height,
35            content_offset,
36            content_length,
37        }
38    }
39
40    /// Get the page number (0-indexed)
41    pub fn number(&self) -> u32 {
42        self.number
43    }
44
45    /// Get page width in points
46    pub fn width(&self) -> f64 {
47        self.width
48    }
49
50    /// Get page height in points
51    pub fn height(&self) -> f64 {
52        self.height
53    }
54
55    /// Extract text from this page in streaming mode
56    pub fn extract_text_streaming(&self) -> Result<String> {
57        // In a real implementation, this would stream the content
58        Ok(format!("Text from page {}", self.number + 1))
59    }
60
61    /// Process content stream in chunks
62    pub fn process_content<F>(&self, mut callback: F) -> Result<()>
63    where
64        F: FnMut(&[u8]) -> Result<()>,
65    {
66        // In a real implementation, this would read content in chunks
67        let mock_content = format!("BT /F1 12 Tf 100 700 Td (Page {}) Tj ET", self.number + 1);
68        callback(mock_content.as_bytes())?;
69        Ok(())
70    }
71
72    /// Get the media box for this page
73    pub fn media_box(&self) -> [f64; 4] {
74        [0.0, 0.0, self.width, self.height]
75    }
76}
77
/// Hands out the pages of a PDF document one at a time.
pub struct PageStreamer<R>
where
    R: Read + Seek,
{
    /// Underlying document source. Only stored by the code visible in this
    /// file (never read), hence the lint allowance.
    #[allow(dead_code)]
    reader: R,
    /// Index of the page the next call to `next` will produce.
    current_page: u32,
    /// Total page count when known; `new` initialises it to `None`.
    total_pages: Option<u32>,
    /// Scratch buffer allocated by `new`; not used by the code visible in
    /// this file, hence the lint allowance.
    #[allow(dead_code)]
    buffer: Vec<u8>,
}
87
88impl<R: Read + Seek> PageStreamer<R> {
89    /// Create a new page streamer
90    pub fn new(reader: R) -> Self {
91        Self {
92            reader,
93            current_page: 0,
94            total_pages: None,
95            buffer: Vec::with_capacity(4096),
96        }
97    }
98
99    /// Get the next page in the stream
100    #[allow(clippy::should_implement_trait)]
101    pub fn next(&mut self) -> Result<Option<StreamingPage>> {
102        // In a real implementation, this would parse the next page
103        if self.current_page >= 3 {
104            // Mock: only 3 pages
105            return Ok(None);
106        }
107
108        let page = StreamingPage {
109            number: self.current_page,
110            width: 595.0,
111            height: 842.0,
112            content_offset: self.current_page as u64 * 1024,
113            content_length: 512,
114        };
115
116        self.current_page += 1;
117        Ok(Some(page))
118    }
119
120    /// Skip to a specific page
121    pub fn seek_to_page(&mut self, page_num: u32) -> Result<()> {
122        self.current_page = page_num;
123        // In a real implementation, seek in the file
124        Ok(())
125    }
126
127    /// Get total number of pages if known
128    pub fn total_pages(&self) -> Option<u32> {
129        self.total_pages
130    }
131}
132
133/// Iterator adapter for page streaming
134pub struct PageIterator<R: Read + Seek> {
135    streamer: PageStreamer<R>,
136}
137
138impl<R: Read + Seek> PageIterator<R> {
139    pub fn new(reader: R) -> Self {
140        Self {
141            streamer: PageStreamer::new(reader),
142        }
143    }
144}
145
146impl<R: Read + Seek> Iterator for PageIterator<R> {
147    type Item = Result<StreamingPage>;
148
149    fn next(&mut self) -> Option<Self::Item> {
150        match self.streamer.next() {
151            Ok(Some(page)) => Some(Ok(page)),
152            Ok(None) => None,
153            Err(e) => Some(Err(e)),
154        }
155    }
156}
157
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Minimal PDF header used as reader input; the mock streamer never reads it.
    const MOCK_PDF: &[u8] = b"%PDF-1.7\n";

    /// Number of pages the mock `PageStreamer::next` yields before `None`.
    const MOCK_PAGE_COUNT: usize = 3;

    /// Builds a streamer over the mock PDF header.
    fn mock_streamer() -> PageStreamer<Cursor<&'static [u8]>> {
        PageStreamer::new(Cursor::new(MOCK_PDF))
    }

    /// Builds a page iterator over the mock PDF header.
    fn mock_iterator() -> PageIterator<Cursor<&'static [u8]>> {
        PageIterator::new(Cursor::new(MOCK_PDF))
    }

    #[test]
    fn test_streaming_page() {
        let page = StreamingPage::new_for_test(0, 612.0, 792.0, 1024, 2048);

        assert_eq!(page.number(), 0);
        assert_eq!(page.width(), 612.0);
        assert_eq!(page.height(), 792.0);

        let media_box = page.media_box();
        assert_eq!(media_box, [0.0, 0.0, 612.0, 792.0]);
    }

    #[test]
    fn test_extract_text_streaming() {
        let page = StreamingPage {
            number: 5,
            width: 595.0,
            height: 842.0,
            content_offset: 0,
            content_length: 0,
        };

        let text = page.extract_text_streaming().unwrap();
        assert!(text.contains("page 6"));
    }

    #[test]
    fn test_process_content() {
        let page = StreamingPage {
            number: 0,
            width: 595.0,
            height: 842.0,
            content_offset: 0,
            content_length: 0,
        };

        let mut chunks = Vec::new();
        page.process_content(|chunk| {
            chunks.push(chunk.to_vec());
            Ok(())
        })
        .unwrap();

        assert!(!chunks.is_empty());
        let content = String::from_utf8_lossy(&chunks[0]);
        assert!(content.contains("Page 1"));
    }

    #[test]
    fn test_page_streamer() {
        let mut streamer = mock_streamer();

        // Should get first page
        let page1 = streamer.next().unwrap();
        assert!(page1.is_some());
        assert_eq!(page1.unwrap().number(), 0);

        // Should get second page
        let page2 = streamer.next().unwrap();
        assert!(page2.is_some());
        assert_eq!(page2.unwrap().number(), 1);
    }

    #[test]
    fn test_page_streamer_seek() {
        let mut streamer = mock_streamer();

        // Seek to page 2
        streamer.seek_to_page(2).unwrap();

        let page = streamer.next().unwrap();
        assert!(page.is_some());
        assert_eq!(page.unwrap().number(), 2);
    }

    #[test]
    fn test_page_iterator() {
        let iterator = mock_iterator();

        let mut pages = Vec::new();
        for result in iterator {
            pages.push(result.unwrap());
        }

        assert_eq!(pages.len(), MOCK_PAGE_COUNT); // Mock returns 3 pages
        assert_eq!(pages[0].number(), 0);
        assert_eq!(pages[1].number(), 1);
        assert_eq!(pages[2].number(), 2);
    }

    #[test]
    fn test_page_iterator_for_loop() {
        let iterator = mock_iterator();

        let mut count = 0;
        for page_result in iterator {
            let page = page_result.unwrap();
            assert_eq!(page.number(), count);
            count += 1;
        }

        assert_eq!(count, 3);
    }

    #[test]
    fn test_streaming_page_debug_clone() {
        let page = StreamingPage {
            number: 1,
            width: 500.0,
            height: 600.0,
            content_offset: 2048,
            content_length: 1024,
        };

        let debug_str = format!("{page:?}");
        assert!(debug_str.contains("StreamingPage"));
        // Check the field itself: a bare "1" would also match "1024".
        assert!(debug_str.contains("number: 1"));

        let cloned = page.clone();
        assert_eq!(cloned.number, page.number);
        assert_eq!(cloned.width, page.width);
        assert_eq!(cloned.height, page.height);
        assert_eq!(cloned.content_offset, page.content_offset);
        assert_eq!(cloned.content_length, page.content_length);
    }

    #[test]
    fn test_streaming_page_new_for_test() {
        let page = StreamingPage::new_for_test(5, 200.0, 300.0, 4096, 512);

        assert_eq!(page.number(), 5);
        assert_eq!(page.width(), 200.0);
        assert_eq!(page.height(), 300.0);
        assert_eq!(page.content_offset, 4096);
        assert_eq!(page.content_length, 512);
    }

    #[test]
    fn test_streaming_page_media_box_various_sizes() {
        let test_cases = [
            (100.0, 100.0, [0.0, 0.0, 100.0, 100.0]),
            (612.0, 792.0, [0.0, 0.0, 612.0, 792.0]),
            (841.89, 1190.55, [0.0, 0.0, 841.89, 1190.55]),
        ];

        for (width, height, expected) in test_cases {
            let page = StreamingPage::new_for_test(0, width, height, 0, 0);
            assert_eq!(page.media_box(), expected);
        }
    }

    #[test]
    fn test_streaming_page_extract_text_different_pages() {
        for page_num in 0..5 {
            let page = StreamingPage {
                number: page_num,
                width: 595.0,
                height: 842.0,
                content_offset: 0,
                content_length: 0,
            };

            let text = page.extract_text_streaming().unwrap();
            assert!(text.contains(&format!("page {}", page_num + 1)));
        }
    }

    #[test]
    fn test_streaming_page_process_content_callback_error() {
        let page = StreamingPage::new_for_test(0, 595.0, 842.0, 0, 0);

        let result = page.process_content(|_chunk| {
            Err(crate::error::PdfError::ParseError(
                "Callback error".to_string(),
            ))
        });

        assert!(result.is_err());
    }

    #[test]
    fn test_streaming_page_process_content_multiple_calls() {
        let page = StreamingPage::new_for_test(3, 595.0, 842.0, 0, 0);

        let mut call_count = 0;
        page.process_content(|chunk| {
            call_count += 1;
            assert!(!chunk.is_empty());
            let content = String::from_utf8_lossy(chunk);
            assert!(content.contains("Page 4")); // page number + 1
            Ok(())
        })
        .unwrap();

        assert_eq!(call_count, 1);
    }

    #[test]
    fn test_page_streamer_creation() {
        let data = b"test data";
        let cursor = Cursor::new(data);
        let streamer = PageStreamer::new(cursor);

        assert_eq!(streamer.current_page, 0);
        assert_eq!(streamer.total_pages, None);
        // `Vec::with_capacity` only guarantees *at least* the requested capacity.
        assert!(streamer.buffer.capacity() >= 4096);
    }

    #[test]
    fn test_page_streamer_total_pages() {
        let streamer = mock_streamer();

        assert_eq!(streamer.total_pages(), None);
    }

    #[test]
    fn test_page_streamer_seek_beyond_pages() {
        let mut streamer = mock_streamer();

        // Seek to page beyond available pages
        streamer.seek_to_page(10).unwrap();

        let page = streamer.next().unwrap();
        assert!(page.is_none()); // Should return None for out-of-range
    }

    #[test]
    fn test_page_streamer_exhaustion() {
        let mut streamer = mock_streamer();

        // Exhaust all pages, checking each one is actually produced
        for expected in 0..3u32 {
            let page = streamer.next().unwrap();
            assert_eq!(page.map(|p| p.number()), Some(expected));
        }

        // Should return None after exhaustion
        let page = streamer.next().unwrap();
        assert!(page.is_none());

        // Subsequent calls should also return None
        let page = streamer.next().unwrap();
        assert!(page.is_none());
    }

    #[test]
    fn test_page_streamer_page_properties() {
        let mut streamer = mock_streamer();

        for expected_page_num in 0..3u32 {
            let page = streamer.next().unwrap().unwrap();

            assert_eq!(page.number(), expected_page_num);
            assert_eq!(page.width(), 595.0); // A4 width
            assert_eq!(page.height(), 842.0); // A4 height
            assert_eq!(page.content_offset, u64::from(expected_page_num) * 1024);
            assert_eq!(page.content_length, 512);
        }
    }

    #[test]
    fn test_page_iterator_creation() {
        let data = b"test";
        let cursor = Cursor::new(data);
        let iterator = PageIterator::new(cursor);

        assert_eq!(iterator.streamer.current_page, 0);
    }

    #[test]
    fn test_page_iterator_collect() {
        let iterator = mock_iterator();

        let pages: Result<Vec<_>> = iterator.collect();
        let pages = pages.unwrap();

        assert_eq!(pages.len(), MOCK_PAGE_COUNT);
        for (expected, page) in (0u32..).zip(&pages) {
            assert_eq!(page.number(), expected);
        }
    }

    #[test]
    fn test_page_iterator_take() {
        let iterator = mock_iterator();

        let first_two: Vec<_> = iterator.take(2).collect();
        assert_eq!(first_two.len(), 2);

        let page0 = first_two[0].as_ref().unwrap();
        let page1 = first_two[1].as_ref().unwrap();

        assert_eq!(page0.number(), 0);
        assert_eq!(page1.number(), 1);
    }

    #[test]
    fn test_page_iterator_skip() {
        let iterator = mock_iterator();

        let last_page: Vec<_> = iterator.skip(2).collect();
        assert_eq!(last_page.len(), 1);

        let page = last_page[0].as_ref().unwrap();
        assert_eq!(page.number(), 2);
    }

    #[test]
    fn test_page_iterator_enumerate() {
        let iterator = mock_iterator();

        let mut seen = 0;
        for (index, page_result) in iterator.enumerate() {
            let page = page_result.unwrap();
            let expected = u32::try_from(index).expect("page index fits in u32");
            assert_eq!(page.number(), expected);
            seen += 1;
        }

        // Guard against the loop passing vacuously on an empty iterator.
        assert_eq!(seen, MOCK_PAGE_COUNT);
    }

    #[test]
    fn test_page_streamer_seek_to_zero() {
        let mut streamer = mock_streamer();

        // Move forward
        assert!(streamer.next().unwrap().is_some()); // Page 0
        assert!(streamer.next().unwrap().is_some()); // Page 1

        // Seek back to beginning
        streamer.seek_to_page(0).unwrap();

        let page = streamer.next().unwrap().unwrap();
        assert_eq!(page.number(), 0);
    }

    #[test]
    fn test_page_streamer_seek_middle() {
        let mut streamer = mock_streamer();

        // Seek to middle page
        streamer.seek_to_page(1).unwrap();

        let page = streamer.next().unwrap().unwrap();
        assert_eq!(page.number(), 1);

        // Next call should return page 2
        let page = streamer.next().unwrap().unwrap();
        assert_eq!(page.number(), 2);
    }

    #[test]
    fn test_streaming_page_zero_dimensions() {
        let page = StreamingPage::new_for_test(0, 0.0, 0.0, 0, 0);

        assert_eq!(page.width(), 0.0);
        assert_eq!(page.height(), 0.0);
        assert_eq!(page.media_box(), [0.0, 0.0, 0.0, 0.0]);
    }

    #[test]
    fn test_streaming_page_large_dimensions() {
        let page = StreamingPage::new_for_test(0, 10000.0, 20000.0, 0, 0);

        assert_eq!(page.width(), 10000.0);
        assert_eq!(page.height(), 20000.0);
        assert_eq!(page.media_box(), [0.0, 0.0, 10000.0, 20000.0]);
    }

    #[test]
    fn test_page_iterator_empty_after_exhaustion() {
        let mut iterator = mock_iterator();

        // Consume all pages, confirming how many there were
        assert_eq!(iterator.by_ref().count(), MOCK_PAGE_COUNT);

        // Iterator should be exhausted
        assert!(iterator.next().is_none());
    }

    #[test]
    fn test_streaming_page_content_callback_data() {
        let page = StreamingPage::new_for_test(7, 595.0, 842.0, 0, 0);

        let mut collected_data = Vec::new();
        page.process_content(|chunk| {
            collected_data.extend_from_slice(chunk);
            Ok(())
        })
        .unwrap();

        let content = String::from_utf8_lossy(&collected_data);
        assert!(content.contains("BT"));
        assert!(content.contains("Tf"));
        assert!(content.contains("Td"));
        assert!(content.contains("Tj"));
        assert!(content.contains("ET"));
        assert!(content.contains("Page 8")); // page 7 + 1
    }
}