oxidize_pdf/streaming/chunk_processor.rs

//! Chunk-based content processing for streaming operations
//!
//! Processes PDF content in manageable chunks to maintain
//! memory efficiency while handling large documents.
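//!
//! # Example
//!
//! A minimal sketch of the streaming flow. The import path below assumes these
//! items are exposed as `oxidize_pdf::streaming::chunk_processor`, and
//! `large.pdf` is a placeholder file name; the block is marked `ignore` so it is
//! not compiled as a doc test.
//!
//! ```ignore
//! use oxidize_pdf::streaming::chunk_processor::{process_in_chunks, ChunkOptions};
//! use std::fs::File;
//!
//! let file = File::open("large.pdf").expect("failed to open file");
//! process_in_chunks(file, ChunkOptions::default(), |chunk| {
//!     if let Some(text) = chunk.as_text() {
//!         println!("page {}: {} bytes of text", chunk.page_number, text.len());
//!     }
//!     Ok(())
//! })
//! .expect("streaming failed");
//! ```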

use crate::error::Result;
use std::io::Read;

/// Type of content chunk
#[derive(Debug, Clone, PartialEq)]
pub enum ChunkType {
    /// Text content
    Text,
    /// Image data
    Image,
    /// Vector graphics
    Graphics,
    /// Form XObject
    Form,
    /// Unknown or mixed content
    Unknown,
}

/// A chunk of PDF content
#[derive(Debug, Clone)]
pub struct ContentChunk {
    /// Type of content in this chunk
    pub chunk_type: ChunkType,
    /// Raw data of the chunk
    pub data: Vec<u8>,
    /// Position in the document
    pub position: u64,
    /// Size of the chunk
    pub size: usize,
    /// Page number this chunk belongs to
    pub page_number: u32,
}

impl ContentChunk {
    /// Create a new content chunk
    pub fn new(chunk_type: ChunkType, data: Vec<u8>, position: u64, page_number: u32) -> Self {
        let size = data.len();
        Self {
            chunk_type,
            data,
            position,
            size,
            page_number,
        }
    }

    /// Check if this is a text chunk
    pub fn is_text(&self) -> bool {
        self.chunk_type == ChunkType::Text
    }

    /// Check if this is an image chunk
    pub fn is_image(&self) -> bool {
        self.chunk_type == ChunkType::Image
    }

    /// Get the chunk data as a string (for text chunks)
    pub fn as_text(&self) -> Option<String> {
        if self.is_text() {
            Some(String::from_utf8_lossy(&self.data).to_string())
        } else {
            None
        }
    }
}

/// Options for chunk processing
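///
/// # Example
///
/// A small sketch of a custom configuration (marked `ignore`: the import path
/// `oxidize_pdf::streaming::chunk_processor` is an assumption):
///
/// ```ignore
/// use oxidize_pdf::streaming::chunk_processor::{ChunkOptions, ChunkType};
///
/// // Emit only text chunks, capped at 256 KiB each.
/// let options = ChunkOptions {
///     max_chunk_size: 256 * 1024,
///     chunk_types: vec![ChunkType::Text],
///     ..Default::default()
/// };
/// ```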
#[derive(Debug, Clone)]
pub struct ChunkOptions {
    /// Maximum size of a single chunk
    pub max_chunk_size: usize,
    /// Whether to split large objects
    pub split_large_objects: bool,
    /// Buffer size for reading
    pub buffer_size: usize,
    /// Types of chunks to process
    pub chunk_types: Vec<ChunkType>,
}

impl Default for ChunkOptions {
    fn default() -> Self {
        Self {
            max_chunk_size: 1024 * 1024, // 1MB
            split_large_objects: true,
            buffer_size: 64 * 1024, // 64KB
            chunk_types: vec![
                ChunkType::Text,
                ChunkType::Image,
                ChunkType::Graphics,
                ChunkType::Form,
            ],
        }
    }
}

/// Processes PDF content in chunks
pub struct ChunkProcessor {
    options: ChunkOptions,
    current_position: u64,
    current_page: u32,
}

impl ChunkProcessor {
    /// Create a new chunk processor
    pub fn new(options: ChunkOptions) -> Self {
        Self {
            options,
            current_position: 0,
            current_page: 0,
        }
    }

    /// Process content and yield chunks
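    ///
    /// Returns zero or more chunks, each at most `max_chunk_size` bytes; chunks
    /// whose detected type is not listed in `chunk_types` are skipped.
    ///
    /// # Example
    ///
    /// A minimal sketch (marked `ignore`: the import path is an assumption):
    ///
    /// ```ignore
    /// use oxidize_pdf::streaming::chunk_processor::{ChunkOptions, ChunkProcessor};
    ///
    /// let mut processor = ChunkProcessor::new(ChunkOptions::default());
    /// processor.set_page(1);
    ///
    /// let chunks = processor.process_content(b"BT (Hello) Tj ET").expect("processing failed");
    /// for chunk in chunks.iter().filter(|c| c.is_text()) {
    ///     println!("text chunk of {} bytes on page {}", chunk.size, chunk.page_number);
    /// }
    /// ```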
    pub fn process_content(&mut self, content: &[u8]) -> Result<Vec<ContentChunk>> {
        let mut chunks = Vec::new();
        let mut offset = 0;

        while offset < content.len() {
            let remaining = content.len() - offset;
            let chunk_size = remaining.min(self.options.max_chunk_size);

            // Detect chunk type (simplified)
            let chunk_type = self.detect_chunk_type(&content[offset..offset + chunk_size]);

            // Skip if not in requested types
            if !self.options.chunk_types.contains(&chunk_type) {
                offset += chunk_size;
                continue;
            }

            let chunk = ContentChunk::new(
                chunk_type,
                content[offset..offset + chunk_size].to_vec(),
                self.current_position + offset as u64,
                self.current_page,
            );

            chunks.push(chunk);
            offset += chunk_size;
        }

        self.current_position += content.len() as u64;
        Ok(chunks)
    }

    /// Set the current page number
    pub fn set_page(&mut self, page_number: u32) {
        self.current_page = page_number;
    }

    /// Reset the processor state
    pub fn reset(&mut self) {
        self.current_position = 0;
        self.current_page = 0;
    }

    fn detect_chunk_type(&self, data: &[u8]) -> ChunkType {
        // Simple heuristics: check unambiguous image magic bytes first, then PDF
        // text operators, then path-construction operators.
        if data.starts_with(b"\xFF\xD8") || data.starts_with(b"\x89PNG") {
            // JPEG SOI marker or PNG signature
            ChunkType::Image
        } else if data.starts_with(b"BT") || (data.contains(&b'T') && data.contains(&b'j')) {
            // "BT" begins a text object; 'T' plus 'j' loosely approximates the Tj show-text operator
            ChunkType::Text
        } else if data.contains(&b'm') || data.contains(&b'l') || data.contains(&b'c') {
            // moveto / lineto / curveto path operators
            ChunkType::Graphics
        } else {
            ChunkType::Unknown
        }
    }
}

/// Process a reader in chunks
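///
/// Reads from `reader` in blocks of `buffer_size` bytes until EOF, passing each
/// produced [`ContentChunk`] to `callback`. If the callback returns an error,
/// processing stops and that error is propagated.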
pub fn process_in_chunks<R, F>(mut reader: R, options: ChunkOptions, mut callback: F) -> Result<()>
where
    R: Read,
    F: FnMut(ContentChunk) -> Result<()>,
{
    // Size the read buffer before the options are moved into the processor.
    let mut buffer = vec![0u8; options.buffer_size];
    let mut processor = ChunkProcessor::new(options);

    loop {
        match reader.read(&mut buffer) {
            Ok(0) => break, // EOF
            Ok(n) => {
                let chunks = processor.process_content(&buffer[..n])?;
                for chunk in chunks {
                    callback(chunk)?;
                }
            }
            // Interrupted reads are transient; retry rather than fail.
            Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(crate::error::PdfError::Io(e)),
        }
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_content_chunk() {
        let chunk = ContentChunk::new(ChunkType::Text, b"Hello World".to_vec(), 1024, 0);

        assert_eq!(chunk.chunk_type, ChunkType::Text);
        assert_eq!(chunk.size, 11);
        assert_eq!(chunk.position, 1024);
        assert_eq!(chunk.page_number, 0);
        assert!(chunk.is_text());
        assert!(!chunk.is_image());
        assert_eq!(chunk.as_text(), Some("Hello World".to_string()));
    }

    #[test]
    fn test_chunk_options_default() {
        let options = ChunkOptions::default();
        assert_eq!(options.max_chunk_size, 1024 * 1024);
        assert!(options.split_large_objects);
        assert_eq!(options.buffer_size, 64 * 1024);
        assert_eq!(options.chunk_types.len(), 4);
    }

    #[test]
    fn test_chunk_processor() {
        let options = ChunkOptions::default();
        let mut processor = ChunkProcessor::new(options);

        let content = b"BT /F1 12 Tf 100 700 Td (Hello) Tj ET";
        let chunks = processor.process_content(content).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].chunk_type, ChunkType::Text);
        assert_eq!(chunks[0].data, content);
    }

    #[test]
    fn test_chunk_type_detection() {
        let processor = ChunkProcessor::new(ChunkOptions::default());

        // Text content
        let text = b"BT /F1 12 Tf (text) Tj ET";
        assert_eq!(processor.detect_chunk_type(text), ChunkType::Text);

        // JPEG image
        let jpeg = b"\xFF\xD8\xFF\xE0";
        assert_eq!(processor.detect_chunk_type(jpeg), ChunkType::Image);

        // PNG image
        let png = b"\x89PNG\r\n\x1a\n";
        assert_eq!(processor.detect_chunk_type(png), ChunkType::Image);

        // Graphics
        let graphics = b"100 200 m 300 400 l S";
        assert_eq!(processor.detect_chunk_type(graphics), ChunkType::Graphics);
    }

    #[test]
    fn test_large_content_splitting() {
        let options = ChunkOptions {
            max_chunk_size: 10, // Very small chunks
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);
        let content = b"This is a much longer content that should be split into multiple chunks";

        let chunks = processor.process_content(content).unwrap();

        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| c.size <= 10));
    }

    #[test]
    fn test_chunk_filtering() {
        let options = ChunkOptions {
            chunk_types: vec![ChunkType::Text], // Only process text
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);

        // Mix of content types
        let text_content = b"BT (text) Tj ET";
        let image_content = b"\xFF\xD8\xFF\xE0 image data";

        let text_chunks = processor.process_content(text_content).unwrap();
        assert_eq!(text_chunks.len(), 1);

        let image_chunks = processor.process_content(image_content).unwrap();
        assert_eq!(image_chunks.len(), 0); // Filtered out
    }

    #[test]
    fn test_process_in_chunks() {
        use std::io::Cursor;

        let data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET";
        let cursor = Cursor::new(data);
        let options = ChunkOptions {
            buffer_size: 10,
            ..Default::default()
        };

        let mut chunks_received = Vec::new();
        process_in_chunks(cursor, options, |chunk| {
            chunks_received.push(chunk);
            Ok(())
        })
        .unwrap();

        assert!(!chunks_received.is_empty());
    }

    #[test]
    fn test_page_tracking() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());

        processor.set_page(5);
        let content = b"Page 5 content";
        let chunks = processor.process_content(content).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].page_number, 5);
    }

    #[test]
    fn test_processor_reset() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());

        processor.current_position = 1000;
        processor.current_page = 10;

        processor.reset();

        assert_eq!(processor.current_position, 0);
        assert_eq!(processor.current_page, 0);
    }
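
    // Content without text markers, image magic bytes, or path operators is
    // classified as Unknown, which the default options do not request, so no
    // chunks are produced.
    #[test]
    fn test_unknown_content_filtered_by_default() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());

        let chunks = processor.process_content(b"0 0 0 RG").unwrap();
        assert!(chunks.is_empty());
    }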
}