oxidize_pdf/streaming/
chunk_processor.rs

1//! Chunk-based content processing for streaming operations
2//!
3//! Processes PDF content in manageable chunks to maintain
4//! memory efficiency while handling large documents.
5
6use crate::error::Result;
7use std::io::Read;
8
/// Classification assigned to a chunk of PDF content.
///
/// Equality between variants is total, so the type implements `Eq` and
/// `Hash` in addition to `PartialEq`; this lets it be used as a key in
/// `HashSet`/`HashMap` as well as in `Vec::contains` lookups.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ChunkType {
    /// Text content
    Text,
    /// Image data
    Image,
    /// Vector graphics
    Graphics,
    /// Form XObject
    Form,
    /// Unknown or mixed content
    Unknown,
}
23
24/// A chunk of PDF content
25#[derive(Debug, Clone)]
26pub struct ContentChunk {
27    /// Type of content in this chunk
28    pub chunk_type: ChunkType,
29    /// Raw data of the chunk
30    pub data: Vec<u8>,
31    /// Position in the document
32    pub position: u64,
33    /// Size of the chunk
34    pub size: usize,
35    /// Page number this chunk belongs to
36    pub page_number: u32,
37}
38
39impl ContentChunk {
40    /// Create a new content chunk
41    pub fn new(chunk_type: ChunkType, data: Vec<u8>, position: u64, page_number: u32) -> Self {
42        let size = data.len();
43        Self {
44            chunk_type,
45            data,
46            position,
47            size,
48            page_number,
49        }
50    }
51
52    /// Check if this is a text chunk
53    pub fn is_text(&self) -> bool {
54        self.chunk_type == ChunkType::Text
55    }
56
57    /// Check if this is an image chunk
58    pub fn is_image(&self) -> bool {
59        self.chunk_type == ChunkType::Image
60    }
61
62    /// Get the chunk data as a string (for text chunks)
63    pub fn as_text(&self) -> Option<String> {
64        if self.is_text() {
65            Some(String::from_utf8_lossy(&self.data).to_string())
66        } else {
67            None
68        }
69    }
70}
71
72/// Options for chunk processing
73#[derive(Debug, Clone)]
74pub struct ChunkOptions {
75    /// Maximum size of a single chunk
76    pub max_chunk_size: usize,
77    /// Whether to split large objects
78    pub split_large_objects: bool,
79    /// Buffer size for reading
80    pub buffer_size: usize,
81    /// Types of chunks to process
82    pub chunk_types: Vec<ChunkType>,
83}
84
85impl Default for ChunkOptions {
86    fn default() -> Self {
87        Self {
88            max_chunk_size: 1024 * 1024, // 1MB
89            split_large_objects: true,
90            buffer_size: 64 * 1024, // 64KB
91            chunk_types: vec![
92                ChunkType::Text,
93                ChunkType::Image,
94                ChunkType::Graphics,
95                ChunkType::Form,
96            ],
97        }
98    }
99}
100
101impl ChunkOptions {
102    /// Validate the chunk options
103    pub fn validate(&self) -> Result<()> {
104        if self.max_chunk_size == 0 {
105            return Err(crate::error::PdfError::InvalidStructure(
106                "max_chunk_size cannot be 0".to_string(),
107            ));
108        }
109        if self.buffer_size == 0 {
110            return Err(crate::error::PdfError::InvalidStructure(
111                "buffer_size cannot be 0".to_string(),
112            ));
113        }
114        Ok(())
115    }
116}
117
118/// Processes PDF content in chunks
119pub struct ChunkProcessor {
120    options: ChunkOptions,
121    current_position: u64,
122    current_page: u32,
123}
124
125impl ChunkProcessor {
126    /// Create a new chunk processor
127    pub fn new(options: ChunkOptions) -> Self {
128        Self {
129            options,
130            current_position: 0,
131            current_page: 0,
132        }
133    }
134
135    /// Process content and yield chunks
136    pub fn process_content(&mut self, content: &[u8]) -> Result<Vec<ContentChunk>> {
137        // Handle edge case where max_chunk_size is 0
138        if self.options.max_chunk_size == 0 {
139            return Ok(vec![]);
140        }
141
142        let mut chunks = Vec::new();
143        let mut offset = 0;
144
145        while offset < content.len() {
146            let remaining = content.len() - offset;
147            let chunk_size = remaining.min(self.options.max_chunk_size);
148
149            // Detect chunk type (simplified)
150            let chunk_type = self.detect_chunk_type(&content[offset..offset + chunk_size]);
151
152            // Skip if not in requested types
153            if !self.options.chunk_types.contains(&chunk_type) {
154                offset += chunk_size;
155                continue;
156            }
157
158            let chunk = ContentChunk::new(
159                chunk_type,
160                content[offset..offset + chunk_size].to_vec(),
161                self.current_position + offset as u64,
162                self.current_page,
163            );
164
165            chunks.push(chunk);
166            offset += chunk_size;
167        }
168
169        self.current_position += content.len() as u64;
170        Ok(chunks)
171    }
172
173    /// Set the current page number
174    pub fn set_page(&mut self, page_number: u32) {
175        self.current_page = page_number;
176    }
177
178    /// Reset the processor state
179    pub fn reset(&mut self) {
180        self.current_position = 0;
181        self.current_page = 0;
182    }
183
184    fn detect_chunk_type(&self, data: &[u8]) -> ChunkType {
185        // Simple heuristic for chunk type detection
186        if data.starts_with(b"BT")
187            || (data.contains(&b'T') && data.contains(&b'j'))
188            || (data.len() == 1 && data[0] == b'T')
189        {
190            ChunkType::Text
191        } else if data.starts_with(b"\xFF\xD8") || data.starts_with(b"\x89PNG") {
192            ChunkType::Image
193        } else if data.contains(&b'm') || data.contains(&b'l') || data.contains(&b'c') {
194            ChunkType::Graphics
195        } else {
196            ChunkType::Unknown
197        }
198    }
199}
200
201/// Process a reader in chunks
202pub fn process_in_chunks<R, F>(mut reader: R, options: ChunkOptions, mut callback: F) -> Result<()>
203where
204    R: Read,
205    F: FnMut(ContentChunk) -> Result<()>,
206{
207    // Validate options first
208    options.validate()?;
209
210    let mut processor = ChunkProcessor::new(options.clone());
211    let mut buffer = vec![0u8; options.buffer_size];
212    let mut _position = 0u64;
213
214    loop {
215        match reader.read(&mut buffer) {
216            Ok(0) => break, // EOF
217            Ok(n) => {
218                let chunks = processor.process_content(&buffer[..n])?;
219                for chunk in chunks {
220                    callback(chunk)?;
221                }
222                _position += n as u64;
223            }
224            Err(e) => return Err(crate::error::PdfError::Io(e)),
225        }
226    }
227
228    Ok(())
229}
230
/// Unit tests for chunk types, chunk options, the chunk processor and the
/// reader-driven `process_in_chunks` entry point.
///
/// Options are always built with struct-update syntax
/// (`ChunkOptions { field, ..Default::default() }`) rather than by mutating a
/// default value afterwards.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_content_chunk() {
        let chunk = ContentChunk::new(ChunkType::Text, b"Hello World".to_vec(), 1024, 0);

        assert_eq!(chunk.chunk_type, ChunkType::Text);
        assert_eq!(chunk.size, 11);
        assert_eq!(chunk.position, 1024);
        assert_eq!(chunk.page_number, 0);
        assert!(chunk.is_text());
        assert!(!chunk.is_image());
        assert_eq!(chunk.as_text(), Some("Hello World".to_string()));
    }

    #[test]
    fn test_chunk_options_default() {
        let options = ChunkOptions::default();
        assert_eq!(options.max_chunk_size, 1024 * 1024);
        assert!(options.split_large_objects);
        assert_eq!(options.buffer_size, 64 * 1024);
        assert_eq!(options.chunk_types.len(), 4);
    }

    #[test]
    fn test_chunk_processor() {
        let options = ChunkOptions::default();
        let mut processor = ChunkProcessor::new(options);

        let content = b"BT /F1 12 Tf 100 700 Td (Hello) Tj ET";
        let chunks = processor.process_content(content).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].chunk_type, ChunkType::Text);
        assert_eq!(chunks[0].data, content);
    }

    #[test]
    fn test_chunk_type_detection() {
        let processor = ChunkProcessor::new(ChunkOptions::default());

        // Text content
        let text = b"BT /F1 12 Tf (text) Tj ET";
        assert_eq!(processor.detect_chunk_type(text), ChunkType::Text);

        // JPEG image
        let jpeg = b"\xFF\xD8\xFF\xE0";
        assert_eq!(processor.detect_chunk_type(jpeg), ChunkType::Image);

        // PNG image
        let png = b"\x89PNG\r\n\x1a\n";
        assert_eq!(processor.detect_chunk_type(png), ChunkType::Image);

        // Graphics
        let graphics = b"100 200 m 300 400 l S";
        assert_eq!(processor.detect_chunk_type(graphics), ChunkType::Graphics);
    }

    #[test]
    fn test_large_content_splitting() {
        let options = ChunkOptions {
            max_chunk_size: 10, // Very small chunks
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);
        let content = b"This is a much longer content that should be split into multiple chunks";

        let chunks = processor.process_content(content).unwrap();

        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| c.size <= 10));
    }

    #[test]
    fn test_chunk_filtering() {
        let options = ChunkOptions {
            chunk_types: vec![ChunkType::Text], // Only process text
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);

        // Mix of content types
        let text_content = b"BT (text) Tj ET";
        let image_content = b"\xFF\xD8\xFF\xE0 image data";

        let text_chunks = processor.process_content(text_content).unwrap();
        assert_eq!(text_chunks.len(), 1);

        let image_chunks = processor.process_content(image_content).unwrap();
        assert_eq!(image_chunks.len(), 0); // Filtered out
    }

    #[test]
    fn test_process_in_chunks() {
        use std::io::Cursor;

        let data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET";
        let cursor = Cursor::new(data);
        let options = ChunkOptions {
            buffer_size: 10,
            ..Default::default()
        };

        let mut chunks_received = Vec::new();
        process_in_chunks(cursor, options, |chunk| {
            chunks_received.push(chunk);
            Ok(())
        })
        .unwrap();

        assert!(!chunks_received.is_empty());
    }

    #[test]
    fn test_page_tracking() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());

        processor.set_page(5);
        let content = b"Page 5 content";
        let chunks = processor.process_content(content).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].page_number, 5);
    }

    #[test]
    fn test_processor_reset() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());

        processor.current_position = 1000;
        processor.current_page = 10;

        processor.reset();

        assert_eq!(processor.current_position, 0);
        assert_eq!(processor.current_page, 0);
    }

    #[test]
    fn test_chunk_type_debug_clone_eq() {
        // A plain array is enough here; no heap allocation is needed.
        let types = [
            ChunkType::Text,
            ChunkType::Image,
            ChunkType::Graphics,
            ChunkType::Form,
            ChunkType::Unknown,
        ];

        for chunk_type in types {
            let debug_str = format!("{chunk_type:?}");
            assert!(!debug_str.is_empty());

            let cloned = chunk_type.clone();
            assert_eq!(chunk_type, cloned);
        }
    }

    #[test]
    fn test_content_chunk_debug_clone() {
        let chunk = ContentChunk {
            chunk_type: ChunkType::Graphics,
            data: vec![1, 2, 3, 4],
            position: 512,
            size: 4,
            page_number: 2,
        };

        let debug_str = format!("{chunk:?}");
        assert!(debug_str.contains("ContentChunk"));
        assert!(debug_str.contains("Graphics"));

        let cloned = chunk.clone();
        assert_eq!(cloned.chunk_type, chunk.chunk_type);
        assert_eq!(cloned.data, chunk.data);
        assert_eq!(cloned.position, chunk.position);
        assert_eq!(cloned.size, chunk.size);
        assert_eq!(cloned.page_number, chunk.page_number);
    }

    #[test]
    fn test_chunk_options_debug_clone() {
        let options = ChunkOptions {
            max_chunk_size: 2048,
            split_large_objects: false,
            buffer_size: 1024,
            chunk_types: vec![ChunkType::Text, ChunkType::Image],
        };

        let debug_str = format!("{options:?}");
        assert!(debug_str.contains("ChunkOptions"));

        let cloned = options.clone();
        assert_eq!(cloned.max_chunk_size, options.max_chunk_size);
        assert_eq!(cloned.split_large_objects, options.split_large_objects);
        assert_eq!(cloned.buffer_size, options.buffer_size);
        assert_eq!(cloned.chunk_types, options.chunk_types);
    }

    #[test]
    fn test_content_chunk_image_methods() {
        let image_chunk = ContentChunk::new(ChunkType::Image, b"\xFF\xD8\xFF\xE0".to_vec(), 0, 0);

        assert!(image_chunk.is_image());
        assert!(!image_chunk.is_text());
        assert_eq!(image_chunk.as_text(), None);
    }

    #[test]
    fn test_content_chunk_non_text_as_text() {
        let graphics_chunk =
            ContentChunk::new(ChunkType::Graphics, b"100 200 m 300 400 l S".to_vec(), 0, 0);

        assert!(!graphics_chunk.is_text());
        assert!(!graphics_chunk.is_image());
        assert_eq!(graphics_chunk.as_text(), None);
    }

    #[test]
    fn test_content_chunk_size_calculation() {
        let data = b"Hello, World!".to_vec();
        let expected_size = data.len();

        let chunk = ContentChunk::new(ChunkType::Text, data, 100, 1);

        assert_eq!(chunk.size, expected_size);
        assert_eq!(chunk.size, chunk.data.len());
    }

    #[test]
    fn test_chunk_processor_position_tracking() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());

        let content1 = b"First chunk";
        let content2 = b"Second chunk";

        let chunks1 = processor.process_content(content1).unwrap();
        assert_eq!(chunks1[0].position, 0);

        let chunks2 = processor.process_content(content2).unwrap();
        assert_eq!(chunks2[0].position, content1.len() as u64);
    }

    #[test]
    fn test_detect_chunk_type_edge_cases() {
        let processor = ChunkProcessor::new(ChunkOptions::default());

        // Empty data
        assert_eq!(processor.detect_chunk_type(b""), ChunkType::Unknown);

        // Single byte
        assert_eq!(processor.detect_chunk_type(b"T"), ChunkType::Text);

        // Mixed text with Tj
        assert_eq!(
            processor.detect_chunk_type(b"Hello Tj World"),
            ChunkType::Text
        );

        // Graphics with multiple markers
        assert_eq!(processor.detect_chunk_type(b"m l c"), ChunkType::Graphics);

        // Unknown content
        assert_eq!(processor.detect_chunk_type(b"xyz123"), ChunkType::Unknown);
    }

    #[test]
    fn test_chunk_options_all_chunk_types() {
        let options = ChunkOptions {
            chunk_types: vec![
                ChunkType::Text,
                ChunkType::Image,
                ChunkType::Graphics,
                ChunkType::Form,
                ChunkType::Unknown,
            ],
            ..Default::default()
        };

        assert_eq!(options.chunk_types.len(), 5);
        assert!(options.chunk_types.contains(&ChunkType::Text));
        assert!(options.chunk_types.contains(&ChunkType::Image));
        assert!(options.chunk_types.contains(&ChunkType::Graphics));
        assert!(options.chunk_types.contains(&ChunkType::Form));
        assert!(options.chunk_types.contains(&ChunkType::Unknown));
    }

    #[test]
    fn test_chunk_filtering_multiple_types() {
        let options = ChunkOptions {
            chunk_types: vec![ChunkType::Text, ChunkType::Graphics],
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);

        // Process different types of content
        let text_content = b"BT (text) Tj ET";
        let graphics_content = b"100 200 m 300 400 l S";
        let image_content = b"\xFF\xD8\xFF\xE0";

        let text_chunks = processor.process_content(text_content).unwrap();
        assert_eq!(text_chunks.len(), 1);

        let graphics_chunks = processor.process_content(graphics_content).unwrap();
        assert_eq!(graphics_chunks.len(), 1);

        let image_chunks = processor.process_content(image_content).unwrap();
        assert_eq!(image_chunks.len(), 0); // Filtered out
    }

    #[test]
    fn test_process_in_chunks_with_io_error() {
        use std::io::Error;

        // Reader whose every read fails.
        struct ErrorReader;

        impl Read for ErrorReader {
            fn read(&mut self, _buf: &mut [u8]) -> std::io::Result<usize> {
                Err(Error::other("Test error"))
            }
        }

        let reader = ErrorReader;
        let options = ChunkOptions::default();

        let result = process_in_chunks(reader, options, |_chunk| Ok(()));
        assert!(result.is_err());
    }

    #[test]
    fn test_process_in_chunks_with_callback_error() {
        use std::io::Cursor;

        let data = b"BT (text) Tj ET";
        let cursor = Cursor::new(data);
        let options = ChunkOptions::default();

        let result = process_in_chunks(cursor, options, |_chunk| {
            Err(crate::error::PdfError::ParseError(
                "Callback error".to_string(),
            ))
        });

        assert!(result.is_err());
    }

    #[test]
    fn test_process_in_chunks_empty_data() {
        use std::io::Cursor;

        let data = b"";
        let cursor = Cursor::new(data);
        let options = ChunkOptions::default();

        let mut chunks_received = Vec::new();
        process_in_chunks(cursor, options, |chunk| {
            chunks_received.push(chunk);
            Ok(())
        })
        .unwrap();

        assert!(chunks_received.is_empty());
    }

    #[test]
    fn test_chunk_processor_with_zero_max_size() {
        let options = ChunkOptions {
            max_chunk_size: 0,
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);
        let content = b"Some content";

        let chunks = processor.process_content(content).unwrap();
        // A zero chunk size is handled gracefully by producing no chunks
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_chunk_processor_exact_chunk_size() {
        let options = ChunkOptions {
            max_chunk_size: 5,
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);
        let content = b"Hello"; // Exactly 5 bytes

        let chunks = processor.process_content(content).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].size, 5);
    }

    #[test]
    fn test_content_chunk_with_binary_data() {
        let binary_data = vec![0, 1, 2, 3, 255, 254, 253];
        let chunk = ContentChunk::new(ChunkType::Image, binary_data.clone(), 0, 0);

        assert_eq!(chunk.data, binary_data);
        assert_eq!(chunk.size, 7);
        assert!(chunk.is_image());
        assert_eq!(chunk.as_text(), None);
    }

    #[test]
    fn test_content_chunk_as_text_with_utf8() {
        let text_data = "Hello, 世界!".as_bytes().to_vec();
        let chunk = ContentChunk::new(ChunkType::Text, text_data, 0, 0);

        assert_eq!(chunk.as_text(), Some("Hello, 世界!".to_string()));
    }

    #[test]
    fn test_content_chunk_as_text_with_invalid_utf8() {
        let invalid_utf8 = vec![0xFF, 0xFE, 0xFD];
        let chunk = ContentChunk::new(ChunkType::Text, invalid_utf8, 0, 0);

        // Should handle gracefully with lossy conversion
        let text = chunk.as_text();
        assert!(text.is_some());
        assert!(!text.unwrap().is_empty());
    }

    #[test]
    fn test_detect_form_xobject() {
        let processor = ChunkProcessor::new(ChunkOptions::default());

        // Form XObject content (simplified detection)
        let form_content = b"q 1 0 0 1 0 0 cm BT /F1 12 Tf (Form) Tj ET Q";

        // The current implementation has no dedicated Form detection, so the
        // content is classified as Text because it contains `T` and `j` bytes.
        let detected_type = processor.detect_chunk_type(form_content);
        assert_eq!(detected_type, ChunkType::Text);
    }

    #[test]
    fn test_processor_multiple_pages() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());

        // Process content for page 0
        processor.set_page(0);
        let content1 = b"Page 0 content";
        let chunks1 = processor.process_content(content1).unwrap();
        assert_eq!(chunks1[0].page_number, 0);

        // Process content for page 1
        processor.set_page(1);
        let content2 = b"Page 1 content";
        let chunks2 = processor.process_content(content2).unwrap();
        assert_eq!(chunks2[0].page_number, 1);

        // Position should continue incrementing
        assert!(chunks2[0].position > chunks1[0].position);
    }

    #[test]
    fn test_chunk_options_empty_chunk_types() {
        let options = ChunkOptions {
            chunk_types: vec![], // No chunk types allowed
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);
        let content = b"Any content";

        let chunks = processor.process_content(content).unwrap();
        assert!(chunks.is_empty()); // All chunks filtered out
    }

    #[test]
    fn test_process_in_chunks_large_buffer() {
        use std::io::Cursor;

        let data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET";
        let cursor = Cursor::new(data);
        let options = ChunkOptions {
            buffer_size: 1024, // Larger than data
            ..Default::default()
        };

        let mut chunks_received = Vec::new();
        process_in_chunks(cursor, options, |chunk| {
            chunks_received.push(chunk);
            Ok(())
        })
        .unwrap();

        assert!(!chunks_received.is_empty());
        // Should process all data in one go
        assert_eq!(chunks_received[0].data, data);
    }

    #[test]
    fn test_chunk_options_validation() {
        // Valid options should pass
        let valid = ChunkOptions::default();
        assert!(valid.validate().is_ok());

        // Zero max_chunk_size should fail
        let zero_chunk_size = ChunkOptions {
            max_chunk_size: 0,
            ..Default::default()
        };
        assert!(zero_chunk_size.validate().is_err());

        // Zero buffer_size should fail
        let zero_buffer_size = ChunkOptions {
            buffer_size: 0,
            ..Default::default()
        };
        assert!(zero_buffer_size.validate().is_err());
    }

    #[test]
    fn test_process_in_chunks_with_invalid_options() {
        use std::io::Cursor;

        let data = b"test data";

        // Test with zero buffer_size
        let cursor = Cursor::new(data);
        let options = ChunkOptions {
            buffer_size: 0,
            ..Default::default()
        };

        let result = process_in_chunks(cursor, options, |_| Ok(()));
        assert!(result.is_err());

        // Test with zero max_chunk_size
        let cursor = Cursor::new(data);
        let options = ChunkOptions {
            max_chunk_size: 0,
            ..Default::default()
        };

        let result = process_in_chunks(cursor, options, |_| Ok(()));
        assert!(result.is_err());
    }
}