1use crate::error::Result;
7use std::io::Read;
8
/// Category assigned to a run of content bytes.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so values can be
/// used as `HashSet`/`HashMap` keys (for example to de-duplicate a filter
/// list) and compared with full equality semantics.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ChunkType {
    /// Text-showing content, such as a `BT` ... `ET` text object.
    Text,
    /// Embedded image data (JPEG or PNG signatures are what this file detects).
    Image,
    /// Vector drawing operators.
    Graphics,
    /// Form content. NOTE(review): the detector in this file never produces
    /// this variant; it can only be set explicitly by a caller.
    Form,
    /// Content that matched none of the other categories.
    Unknown,
}
23
/// A run of content bytes together with its classification and location.
#[derive(Debug, Clone)]
pub struct ContentChunk {
    /// Category this chunk was classified as.
    pub chunk_type: ChunkType,
    /// The chunk's raw bytes.
    pub data: Vec<u8>,
    /// Byte offset of the chunk. For chunks produced by `ChunkProcessor` this
    /// is the running offset across everything processed since the processor
    /// was created or last reset.
    pub position: u64,
    /// Length of `data` in bytes as recorded by `ContentChunk::new`. The field
    /// is public, so it is not kept in sync if `data` is modified afterwards.
    pub size: usize,
    /// Page number the chunk was tagged with by whoever created it.
    pub page_number: u32,
}
38
39impl ContentChunk {
40 pub fn new(chunk_type: ChunkType, data: Vec<u8>, position: u64, page_number: u32) -> Self {
42 let size = data.len();
43 Self {
44 chunk_type,
45 data,
46 position,
47 size,
48 page_number,
49 }
50 }
51
52 pub fn is_text(&self) -> bool {
54 self.chunk_type == ChunkType::Text
55 }
56
57 pub fn is_image(&self) -> bool {
59 self.chunk_type == ChunkType::Image
60 }
61
62 pub fn as_text(&self) -> Option<String> {
64 if self.is_text() {
65 Some(String::from_utf8_lossy(&self.data).to_string())
66 } else {
67 None
68 }
69 }
70}
71
/// Settings that control how content is split and which chunks are emitted.
#[derive(Debug, Clone)]
pub struct ChunkOptions {
    /// Upper bound, in bytes, on the size of each produced chunk.
    /// `validate` rejects 0.
    pub max_chunk_size: usize,
    /// NOTE(review): nothing in this file reads this flag; `process_content`
    /// always splits at `max_chunk_size` regardless of its value. Confirm the
    /// intended behaviour with callers.
    pub split_large_objects: bool,
    /// Size, in bytes, of the read buffer allocated by `process_in_chunks`.
    /// `validate` rejects 0.
    pub buffer_size: usize,
    /// Chunk types to emit; `ChunkProcessor::process_content` skips chunks
    /// whose detected type is not in this list.
    pub chunk_types: Vec<ChunkType>,
}
84
85impl Default for ChunkOptions {
86 fn default() -> Self {
87 Self {
88 max_chunk_size: 1024 * 1024, split_large_objects: true,
90 buffer_size: 64 * 1024, chunk_types: vec![
92 ChunkType::Text,
93 ChunkType::Image,
94 ChunkType::Graphics,
95 ChunkType::Form,
96 ],
97 }
98 }
99}
100
101impl ChunkOptions {
102 pub fn validate(&self) -> Result<()> {
104 if self.max_chunk_size == 0 {
105 return Err(crate::error::PdfError::InvalidStructure(
106 "max_chunk_size cannot be 0".to_string(),
107 ));
108 }
109 if self.buffer_size == 0 {
110 return Err(crate::error::PdfError::InvalidStructure(
111 "buffer_size cannot be 0".to_string(),
112 ));
113 }
114 Ok(())
115 }
116}
117
/// Stateful splitter that turns byte slices into classified `ContentChunk`s
/// while tracking a running byte offset and the current page.
pub struct ChunkProcessor {
    /// Splitting and filtering settings supplied at construction.
    options: ChunkOptions,
    /// Total bytes consumed by `process_content` since creation or `reset`.
    current_position: u64,
    /// Page number stamped onto every chunk produced; changed via `set_page`.
    current_page: u32,
}
124
125impl ChunkProcessor {
126 pub fn new(options: ChunkOptions) -> Self {
128 Self {
129 options,
130 current_position: 0,
131 current_page: 0,
132 }
133 }
134
135 pub fn process_content(&mut self, content: &[u8]) -> Result<Vec<ContentChunk>> {
137 if self.options.max_chunk_size == 0 {
139 return Ok(vec![]);
140 }
141
142 let mut chunks = Vec::new();
143 let mut offset = 0;
144
145 while offset < content.len() {
146 let remaining = content.len() - offset;
147 let chunk_size = remaining.min(self.options.max_chunk_size);
148
149 let chunk_type = self.detect_chunk_type(&content[offset..offset + chunk_size]);
151
152 if !self.options.chunk_types.contains(&chunk_type) {
154 offset += chunk_size;
155 continue;
156 }
157
158 let chunk = ContentChunk::new(
159 chunk_type,
160 content[offset..offset + chunk_size].to_vec(),
161 self.current_position + offset as u64,
162 self.current_page,
163 );
164
165 chunks.push(chunk);
166 offset += chunk_size;
167 }
168
169 self.current_position += content.len() as u64;
170 Ok(chunks)
171 }
172
173 pub fn set_page(&mut self, page_number: u32) {
175 self.current_page = page_number;
176 }
177
178 pub fn reset(&mut self) {
180 self.current_position = 0;
181 self.current_page = 0;
182 }
183
184 fn detect_chunk_type(&self, data: &[u8]) -> ChunkType {
185 if data.starts_with(b"BT")
187 || (data.contains(&b'T') && data.contains(&b'j'))
188 || (data.len() == 1 && data[0] == b'T')
189 {
190 ChunkType::Text
191 } else if data.starts_with(b"\xFF\xD8") || data.starts_with(b"\x89PNG") {
192 ChunkType::Image
193 } else if data.contains(&b'm') || data.contains(&b'l') || data.contains(&b'c') {
194 ChunkType::Graphics
195 } else {
196 ChunkType::Unknown
197 }
198 }
199}
200
201pub fn process_in_chunks<R, F>(mut reader: R, options: ChunkOptions, mut callback: F) -> Result<()>
203where
204 R: Read,
205 F: FnMut(ContentChunk) -> Result<()>,
206{
207 options.validate()?;
209
210 let mut processor = ChunkProcessor::new(options.clone());
211 let mut buffer = vec![0u8; options.buffer_size];
212 let mut _position = 0u64;
213
214 loop {
215 match reader.read(&mut buffer) {
216 Ok(0) => break, Ok(n) => {
218 let chunks = processor.process_content(&buffer[..n])?;
219 for chunk in chunks {
220 callback(chunk)?;
221 }
222 _position += n as u64;
223 }
224 Err(e) => return Err(crate::error::PdfError::Io(e)),
225 }
226 }
227
228 Ok(())
229}
230
/// Unit tests for chunk construction, option validation, type detection,
/// splitting/filtering in `ChunkProcessor`, and the streaming entry point.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_content_chunk() {
        let chunk = ContentChunk::new(ChunkType::Text, b"Hello World".to_vec(), 1024, 0);

        assert_eq!(chunk.chunk_type, ChunkType::Text);
        assert_eq!(chunk.size, 11);
        assert_eq!(chunk.position, 1024);
        assert_eq!(chunk.page_number, 0);
        assert!(chunk.is_text());
        assert!(!chunk.is_image());
        assert_eq!(chunk.as_text(), Some("Hello World".to_string()));
    }

    #[test]
    fn test_chunk_options_default() {
        let options = ChunkOptions::default();
        assert_eq!(options.max_chunk_size, 1024 * 1024);
        assert!(options.split_large_objects);
        assert_eq!(options.buffer_size, 64 * 1024);
        assert_eq!(options.chunk_types.len(), 4);
    }

    #[test]
    fn test_chunk_processor() {
        let options = ChunkOptions::default();
        let mut processor = ChunkProcessor::new(options);

        let content = b"BT /F1 12 Tf 100 700 Td (Hello) Tj ET";
        let chunks = processor.process_content(content).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].chunk_type, ChunkType::Text);
        assert_eq!(chunks[0].data, content);
    }

    #[test]
    fn test_chunk_type_detection() {
        let processor = ChunkProcessor::new(ChunkOptions::default());

        // Text object.
        let text = b"BT /F1 12 Tf (text) Tj ET";
        assert_eq!(processor.detect_chunk_type(text), ChunkType::Text);

        // JPEG signature.
        let jpeg = b"\xFF\xD8\xFF\xE0";
        assert_eq!(processor.detect_chunk_type(jpeg), ChunkType::Image);

        // PNG signature.
        let png = b"\x89PNG\r\n\x1a\n";
        assert_eq!(processor.detect_chunk_type(png), ChunkType::Image);

        // Path operators.
        let graphics = b"100 200 m 300 400 l S";
        assert_eq!(processor.detect_chunk_type(graphics), ChunkType::Graphics);
    }

    #[test]
    fn test_large_content_splitting() {
        let options = ChunkOptions {
            max_chunk_size: 10,
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);
        let content = b"This is a much longer content that should be split into multiple chunks";

        let chunks = processor.process_content(content).unwrap();

        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| c.size <= 10));
    }

    #[test]
    fn test_chunk_filtering() {
        // Only text chunks are accepted.
        let options = ChunkOptions {
            chunk_types: vec![ChunkType::Text],
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);

        let text_content = b"BT (text) Tj ET";
        let image_content = b"\xFF\xD8\xFF\xE0 image data";

        let text_chunks = processor.process_content(text_content).unwrap();
        assert_eq!(text_chunks.len(), 1);

        // The image chunk is detected but filtered out.
        let image_chunks = processor.process_content(image_content).unwrap();
        assert_eq!(image_chunks.len(), 0);
    }

    #[test]
    fn test_process_in_chunks() {
        use std::io::Cursor;

        let data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET";
        let cursor = Cursor::new(data);
        let options = ChunkOptions {
            buffer_size: 10,
            ..Default::default()
        };

        let mut chunks_received = Vec::new();
        process_in_chunks(cursor, options, |chunk| {
            chunks_received.push(chunk);
            Ok(())
        })
        .unwrap();

        assert!(!chunks_received.is_empty());
    }

    #[test]
    fn test_page_tracking() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());

        processor.set_page(5);
        let content = b"Page 5 content";
        let chunks = processor.process_content(content).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].page_number, 5);
    }

    #[test]
    fn test_processor_reset() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());

        processor.current_position = 1000;
        processor.current_page = 10;

        processor.reset();

        assert_eq!(processor.current_position, 0);
        assert_eq!(processor.current_page, 0);
    }

    #[test]
    fn test_chunk_type_debug_clone_eq() {
        let types = vec![
            ChunkType::Text,
            ChunkType::Image,
            ChunkType::Graphics,
            ChunkType::Form,
            ChunkType::Unknown,
        ];

        for chunk_type in types {
            let debug_str = format!("{chunk_type:?}");
            assert!(!debug_str.is_empty());

            let cloned = chunk_type.clone();
            assert_eq!(chunk_type, cloned);
        }
    }

    #[test]
    fn test_content_chunk_debug_clone() {
        let chunk = ContentChunk {
            chunk_type: ChunkType::Graphics,
            data: vec![1, 2, 3, 4],
            position: 512,
            size: 4,
            page_number: 2,
        };

        let debug_str = format!("{chunk:?}");
        assert!(debug_str.contains("ContentChunk"));
        assert!(debug_str.contains("Graphics"));

        let cloned = chunk.clone();
        assert_eq!(cloned.chunk_type, chunk.chunk_type);
        assert_eq!(cloned.data, chunk.data);
        assert_eq!(cloned.position, chunk.position);
        assert_eq!(cloned.size, chunk.size);
        assert_eq!(cloned.page_number, chunk.page_number);
    }

    #[test]
    fn test_chunk_options_debug_clone() {
        let options = ChunkOptions {
            max_chunk_size: 2048,
            split_large_objects: false,
            buffer_size: 1024,
            chunk_types: vec![ChunkType::Text, ChunkType::Image],
        };

        let debug_str = format!("{options:?}");
        assert!(debug_str.contains("ChunkOptions"));

        let cloned = options.clone();
        assert_eq!(cloned.max_chunk_size, options.max_chunk_size);
        assert_eq!(cloned.split_large_objects, options.split_large_objects);
        assert_eq!(cloned.buffer_size, options.buffer_size);
        assert_eq!(cloned.chunk_types, options.chunk_types);
    }

    #[test]
    fn test_content_chunk_image_methods() {
        let image_chunk = ContentChunk::new(ChunkType::Image, b"\xFF\xD8\xFF\xE0".to_vec(), 0, 0);

        assert!(image_chunk.is_image());
        assert!(!image_chunk.is_text());
        assert_eq!(image_chunk.as_text(), None);
    }

    #[test]
    fn test_content_chunk_non_text_as_text() {
        let graphics_chunk =
            ContentChunk::new(ChunkType::Graphics, b"100 200 m 300 400 l S".to_vec(), 0, 0);

        assert!(!graphics_chunk.is_text());
        assert!(!graphics_chunk.is_image());
        assert_eq!(graphics_chunk.as_text(), None);
    }

    #[test]
    fn test_content_chunk_size_calculation() {
        let data = b"Hello, World!".to_vec();
        let expected_size = data.len();

        let chunk = ContentChunk::new(ChunkType::Text, data, 100, 1);

        assert_eq!(chunk.size, expected_size);
        assert_eq!(chunk.size, chunk.data.len());
    }

    #[test]
    fn test_chunk_processor_position_tracking() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());

        let content1 = b"First chunk";
        let content2 = b"Second chunk";

        let chunks1 = processor.process_content(content1).unwrap();
        assert_eq!(chunks1[0].position, 0);

        // The second call continues from where the first one ended.
        let chunks2 = processor.process_content(content2).unwrap();
        assert_eq!(chunks2[0].position, content1.len() as u64);
    }

    #[test]
    fn test_detect_chunk_type_edge_cases() {
        let processor = ChunkProcessor::new(ChunkOptions::default());

        // Empty input matches no rule.
        assert_eq!(processor.detect_chunk_type(b""), ChunkType::Unknown);

        // A lone `T` byte is special-cased as text.
        assert_eq!(processor.detect_chunk_type(b"T"), ChunkType::Text);

        // Contains both `T` and `j` without starting with `BT`.
        assert_eq!(
            processor.detect_chunk_type(b"Hello Tj World"),
            ChunkType::Text
        );

        // Only path-operator bytes.
        assert_eq!(processor.detect_chunk_type(b"m l c"), ChunkType::Graphics);

        // None of the marker bytes present.
        assert_eq!(processor.detect_chunk_type(b"xyz123"), ChunkType::Unknown);
    }

    #[test]
    fn test_chunk_options_all_chunk_types() {
        let all_types = vec![
            ChunkType::Text,
            ChunkType::Image,
            ChunkType::Graphics,
            ChunkType::Form,
            ChunkType::Unknown,
        ];

        let options = ChunkOptions {
            chunk_types: all_types.clone(),
            ..Default::default()
        };

        assert_eq!(options.chunk_types.len(), 5);
        assert!(options.chunk_types.contains(&ChunkType::Text));
        assert!(options.chunk_types.contains(&ChunkType::Image));
        assert!(options.chunk_types.contains(&ChunkType::Graphics));
        assert!(options.chunk_types.contains(&ChunkType::Form));
        assert!(options.chunk_types.contains(&ChunkType::Unknown));
    }

    #[test]
    fn test_chunk_filtering_multiple_types() {
        let options = ChunkOptions {
            chunk_types: vec![ChunkType::Text, ChunkType::Graphics],
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);

        let text_content = b"BT (text) Tj ET";
        let graphics_content = b"100 200 m 300 400 l S";
        let image_content = b"\xFF\xD8\xFF\xE0";

        let text_chunks = processor.process_content(text_content).unwrap();
        assert_eq!(text_chunks.len(), 1);

        let graphics_chunks = processor.process_content(graphics_content).unwrap();
        assert_eq!(graphics_chunks.len(), 1);

        // Images are not in the accepted list, so nothing is emitted.
        let image_chunks = processor.process_content(image_content).unwrap();
        assert_eq!(image_chunks.len(), 0);
    }

    #[test]
    fn test_process_in_chunks_with_io_error() {
        use std::io::Error;

        // Reader whose every read fails with a non-retryable error.
        struct ErrorReader;

        impl Read for ErrorReader {
            fn read(&mut self, _buf: &mut [u8]) -> std::io::Result<usize> {
                Err(Error::other("Test error"))
            }
        }

        let reader = ErrorReader;
        let options = ChunkOptions::default();

        let result = process_in_chunks(reader, options, |_chunk| Ok(()));
        assert!(result.is_err());
    }

    #[test]
    fn test_process_in_chunks_with_callback_error() {
        use std::io::Cursor;

        let data = b"BT (text) Tj ET";
        let cursor = Cursor::new(data);
        let options = ChunkOptions::default();

        // A failing callback must abort processing and surface its error.
        let result = process_in_chunks(cursor, options, |_chunk| {
            Err(crate::error::PdfError::ParseError(
                "Callback error".to_string(),
            ))
        });

        assert!(result.is_err());
    }

    #[test]
    fn test_process_in_chunks_empty_data() {
        use std::io::Cursor;

        let data = b"";
        let cursor = Cursor::new(data);
        let options = ChunkOptions::default();

        let mut chunks_received = Vec::new();
        process_in_chunks(cursor, options, |chunk| {
            chunks_received.push(chunk);
            Ok(())
        })
        .unwrap();

        assert!(chunks_received.is_empty());
    }

    #[test]
    fn test_chunk_processor_with_zero_max_size() {
        let options = ChunkOptions {
            max_chunk_size: 0,
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);
        let content = b"Some content";

        // A zero chunk size yields no chunks instead of looping forever.
        let chunks = processor.process_content(content).unwrap();
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_chunk_processor_exact_chunk_size() {
        let options = ChunkOptions {
            max_chunk_size: 5,
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);
        // Exactly `max_chunk_size` bytes long.
        let content = b"Hello";
        let chunks = processor.process_content(content).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].size, 5);
    }

    #[test]
    fn test_content_chunk_with_binary_data() {
        let binary_data = vec![0, 1, 2, 3, 255, 254, 253];
        let chunk = ContentChunk::new(ChunkType::Image, binary_data.clone(), 0, 0);

        assert_eq!(chunk.data, binary_data);
        assert_eq!(chunk.size, 7);
        assert!(chunk.is_image());
        assert_eq!(chunk.as_text(), None);
    }

    #[test]
    fn test_content_chunk_as_text_with_utf8() {
        let text_data = "Hello, 世界!".as_bytes().to_vec();
        let chunk = ContentChunk::new(ChunkType::Text, text_data, 0, 0);

        assert_eq!(chunk.as_text(), Some("Hello, 世界!".to_string()));
    }

    #[test]
    fn test_content_chunk_as_text_with_invalid_utf8() {
        let invalid_utf8 = vec![0xFF, 0xFE, 0xFD];
        let chunk = ContentChunk::new(ChunkType::Text, invalid_utf8, 0, 0);

        // Lossy decoding substitutes replacement characters instead of failing.
        let text = chunk.as_text();
        assert!(text.is_some());
        assert!(!text.unwrap().is_empty());
    }

    #[test]
    fn test_detect_form_xobject() {
        let processor = ChunkProcessor::new(ChunkOptions::default());

        let form_content = b"q 1 0 0 1 0 0 cm BT /F1 12 Tf (Form) Tj ET Q";

        // The detector has no Form rule; this content contains both `T` and
        // `j`, so it is classified as Text.
        let detected_type = processor.detect_chunk_type(form_content);
        assert_eq!(detected_type, ChunkType::Text);
    }

    #[test]
    fn test_processor_multiple_pages() {
        let mut processor = ChunkProcessor::new(ChunkOptions::default());

        processor.set_page(0);
        let content1 = b"Page 0 content";
        let chunks1 = processor.process_content(content1).unwrap();
        assert_eq!(chunks1[0].page_number, 0);

        processor.set_page(1);
        let content2 = b"Page 1 content";
        let chunks2 = processor.process_content(content2).unwrap();
        assert_eq!(chunks2[0].page_number, 1);

        // Changing the page does not rewind the running byte offset.
        assert!(chunks2[0].position > chunks1[0].position);
    }

    #[test]
    fn test_chunk_options_empty_chunk_types() {
        // An empty accept list filters out everything.
        let options = ChunkOptions {
            chunk_types: vec![],
            ..Default::default()
        };

        let mut processor = ChunkProcessor::new(options);
        let content = b"Any content";

        let chunks = processor.process_content(content).unwrap();
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_process_in_chunks_large_buffer() {
        use std::io::Cursor;

        let data = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET";
        let cursor = Cursor::new(data);
        // The buffer is larger than the input, so everything arrives in one read.
        let options = ChunkOptions {
            buffer_size: 1024,
            ..Default::default()
        };

        let mut chunks_received = Vec::new();
        process_in_chunks(cursor, options, |chunk| {
            chunks_received.push(chunk);
            Ok(())
        })
        .unwrap();

        assert!(!chunks_received.is_empty());
        assert_eq!(chunks_received[0].data, data);
    }

    #[test]
    fn test_chunk_options_validation() {
        let mut options = ChunkOptions::default();

        // The defaults are valid.
        assert!(options.validate().is_ok());

        // A zero chunk size is rejected.
        options.max_chunk_size = 0;
        assert!(options.validate().is_err());

        // A zero buffer size is rejected.
        options = ChunkOptions::default();
        options.buffer_size = 0;
        assert!(options.validate().is_err());
    }

    #[test]
    fn test_process_in_chunks_with_invalid_options() {
        use std::io::Cursor;

        let data = b"test data";
        let cursor = Cursor::new(data);

        // A zero buffer size is rejected before any read happens.
        let options = ChunkOptions {
            buffer_size: 0,
            ..Default::default()
        };

        let result = process_in_chunks(cursor, options, |_| Ok(()));
        assert!(result.is_err());

        // So is a zero chunk size.
        let cursor = Cursor::new(data);
        let options = ChunkOptions {
            max_chunk_size: 0,
            ..Default::default()
        };

        let result = process_in_chunks(cursor, options, |_| Ok(()));
        assert!(result.is_err());
    }
}