oxidize_pdf/streaming/
chunk_processor.rs1use crate::error::Result;
7use std::io::Read;
8
/// Category assigned to a piece of content-stream data.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so the type can be
/// used as a key in `HashSet`/`HashMap` and in other `Eq`-bounded contexts.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ChunkType {
    /// Text content.
    Text,
    /// Image content.
    Image,
    /// Graphics (path drawing) content.
    Graphics,
    /// Form content.
    Form,
    /// Content that matched none of the other categories.
    Unknown,
}
23
24#[derive(Debug, Clone)]
26pub struct ContentChunk {
27 pub chunk_type: ChunkType,
29 pub data: Vec<u8>,
31 pub position: u64,
33 pub size: usize,
35 pub page_number: u32,
37}
38
39impl ContentChunk {
40 pub fn new(chunk_type: ChunkType, data: Vec<u8>, position: u64, page_number: u32) -> Self {
42 let size = data.len();
43 Self {
44 chunk_type,
45 data,
46 position,
47 size,
48 page_number,
49 }
50 }
51
52 pub fn is_text(&self) -> bool {
54 self.chunk_type == ChunkType::Text
55 }
56
57 pub fn is_image(&self) -> bool {
59 self.chunk_type == ChunkType::Image
60 }
61
62 pub fn as_text(&self) -> Option<String> {
64 if self.is_text() {
65 Some(String::from_utf8_lossy(&self.data).to_string())
66 } else {
67 None
68 }
69 }
70}
71
72#[derive(Debug, Clone)]
74pub struct ChunkOptions {
75 pub max_chunk_size: usize,
77 pub split_large_objects: bool,
79 pub buffer_size: usize,
81 pub chunk_types: Vec<ChunkType>,
83}
84
85impl Default for ChunkOptions {
86 fn default() -> Self {
87 Self {
88 max_chunk_size: 1024 * 1024, split_large_objects: true,
90 buffer_size: 64 * 1024, chunk_types: vec![
92 ChunkType::Text,
93 ChunkType::Image,
94 ChunkType::Graphics,
95 ChunkType::Form,
96 ],
97 }
98 }
99}
100
101pub struct ChunkProcessor {
103 options: ChunkOptions,
104 current_position: u64,
105 current_page: u32,
106}
107
108impl ChunkProcessor {
109 pub fn new(options: ChunkOptions) -> Self {
111 Self {
112 options,
113 current_position: 0,
114 current_page: 0,
115 }
116 }
117
118 pub fn process_content(&mut self, content: &[u8]) -> Result<Vec<ContentChunk>> {
120 let mut chunks = Vec::new();
121 let mut offset = 0;
122
123 while offset < content.len() {
124 let remaining = content.len() - offset;
125 let chunk_size = remaining.min(self.options.max_chunk_size);
126
127 let chunk_type = self.detect_chunk_type(&content[offset..offset + chunk_size]);
129
130 if !self.options.chunk_types.contains(&chunk_type) {
132 offset += chunk_size;
133 continue;
134 }
135
136 let chunk = ContentChunk::new(
137 chunk_type,
138 content[offset..offset + chunk_size].to_vec(),
139 self.current_position + offset as u64,
140 self.current_page,
141 );
142
143 chunks.push(chunk);
144 offset += chunk_size;
145 }
146
147 self.current_position += content.len() as u64;
148 Ok(chunks)
149 }
150
151 pub fn set_page(&mut self, page_number: u32) {
153 self.current_page = page_number;
154 }
155
156 pub fn reset(&mut self) {
158 self.current_position = 0;
159 self.current_page = 0;
160 }
161
162 fn detect_chunk_type(&self, data: &[u8]) -> ChunkType {
163 if data.starts_with(b"BT") || data.contains(&b'T') && data.contains(&b'j') {
165 ChunkType::Text
166 } else if data.starts_with(b"\xFF\xD8") || data.starts_with(b"\x89PNG") {
167 ChunkType::Image
168 } else if data.contains(&b'm') || data.contains(&b'l') || data.contains(&b'c') {
169 ChunkType::Graphics
170 } else {
171 ChunkType::Unknown
172 }
173 }
174}
175
176pub fn process_in_chunks<R, F>(mut reader: R, options: ChunkOptions, mut callback: F) -> Result<()>
178where
179 R: Read,
180 F: FnMut(ContentChunk) -> Result<()>,
181{
182 let mut processor = ChunkProcessor::new(options.clone());
183 let mut buffer = vec![0u8; options.buffer_size];
184 let mut _position = 0u64;
185
186 loop {
187 match reader.read(&mut buffer) {
188 Ok(0) => break, Ok(n) => {
190 let chunks = processor.process_content(&buffer[..n])?;
191 for chunk in chunks {
192 callback(chunk)?;
193 }
194 _position += n as u64;
195 }
196 Err(e) => return Err(crate::error::PdfError::Io(e)),
197 }
198 }
199
200 Ok(())
201}
202
#[cfg(test)]
mod tests {
    use super::*;

    /// The constructor records every field and the text helpers agree with
    /// the chunk type.
    #[test]
    fn test_content_chunk() {
        let payload = b"Hello World".to_vec();
        let chunk = ContentChunk::new(ChunkType::Text, payload, 1024, 0);

        assert_eq!(chunk.chunk_type, ChunkType::Text);
        assert_eq!(chunk.size, 11);
        assert_eq!(chunk.position, 1024);
        assert_eq!(chunk.page_number, 0);
        assert!(chunk.is_text());
        assert!(!chunk.is_image());
        assert_eq!(chunk.as_text(), Some(String::from("Hello World")));
    }

    /// Default options match the documented sizes and enabled types.
    #[test]
    fn test_chunk_options_default() {
        let defaults = ChunkOptions::default();

        assert_eq!(defaults.max_chunk_size, 1024 * 1024);
        assert!(defaults.split_large_objects);
        assert_eq!(defaults.buffer_size, 64 * 1024);
        assert_eq!(defaults.chunk_types.len(), 4);
    }

    /// A short text stream comes back as a single, unmodified text chunk.
    #[test]
    fn test_chunk_processor() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());
        let content = b"BT /F1 12 Tf 100 700 Td (Hello) Tj ET";

        let chunks = processor.process_content(content).unwrap();

        assert!(!chunks.is_empty());
        let first = &chunks[0];
        assert_eq!(first.chunk_type, ChunkType::Text);
        assert_eq!(first.data, content);
    }

    /// Each heuristic maps its representative input to the expected type.
    #[test]
    fn test_chunk_type_detection() {
        let processor = ChunkProcessor::new(ChunkOptions::default());
        let detect = |bytes: &[u8]| processor.detect_chunk_type(bytes);

        // Text operators.
        assert_eq!(detect(b"BT /F1 12 Tf (text) Tj ET"), ChunkType::Text);
        // JPEG signature.
        assert_eq!(detect(b"\xFF\xD8\xFF\xE0"), ChunkType::Image);
        // PNG signature.
        assert_eq!(detect(b"\x89PNG\r\n\x1a\n"), ChunkType::Image);
        // Path-drawing operators.
        assert_eq!(detect(b"100 200 m 300 400 l S"), ChunkType::Graphics);
    }

    /// Content longer than `max_chunk_size` is split into bounded pieces.
    #[test]
    fn test_large_content_splitting() {
        let options = ChunkOptions {
            max_chunk_size: 10,
            ..Default::default()
        };
        let mut processor = ChunkProcessor::new(options);
        let content = b"This is a much longer content that should be split into multiple chunks";

        let chunks = processor.process_content(content).unwrap();

        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|piece| piece.size <= 10));
    }

    /// Only chunk types listed in the options are emitted.
    #[test]
    fn test_chunk_filtering() {
        let options = ChunkOptions {
            chunk_types: vec![ChunkType::Text],
            ..Default::default()
        };
        let mut processor = ChunkProcessor::new(options);

        let text_content = b"BT (text) Tj ET";
        let image_content = b"\xFF\xD8\xFF\xE0 image data";

        let text_chunks = processor.process_content(text_content).unwrap();
        assert_eq!(text_chunks.len(), 1);

        // Image data is recognised but filtered out.
        let image_chunks = processor.process_content(image_content).unwrap();
        assert!(image_chunks.is_empty());
    }

    /// Streaming through a small buffer still delivers chunks to the callback.
    #[test]
    fn test_process_in_chunks() {
        use std::io::Cursor;

        let data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET";
        let options = ChunkOptions {
            buffer_size: 10,
            ..Default::default()
        };

        let mut chunks_received = Vec::new();
        let outcome = process_in_chunks(Cursor::new(data), options, |chunk| {
            chunks_received.push(chunk);
            Ok(())
        });
        outcome.unwrap();

        assert!(!chunks_received.is_empty());
    }

    /// Chunks carry the page number that was current when they were produced.
    #[test]
    fn test_page_tracking() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());
        processor.set_page(5);

        let chunks = processor.process_content(b"Page 5 content").unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].page_number, 5);
    }

    /// `reset` rewinds both the position and the page counter.
    #[test]
    fn test_processor_reset() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());
        processor.current_position = 1000;
        processor.current_page = 10;

        processor.reset();

        assert_eq!(processor.current_position, 0);
        assert_eq!(processor.current_page, 0);
    }
}