// oxidize_pdf/operations/mod.rs

//! PDF operations module
//!
//! This module provides high-level operations for manipulating PDF documents
//! such as splitting, merging, rotating pages, and reordering.

6pub mod chunk_page_mapper;
7pub mod extract_images;
8pub mod merge;
9pub mod overlay;
10pub mod page_analysis;
11pub mod page_extraction;
12pub mod pdf_ocr_converter;
13pub mod reorder;
14pub mod rotate;
15pub mod semantic_redactor;
16pub mod source_highlighter;
17pub mod split;
18
19pub use chunk_page_mapper::ChunkPageMapper;
20pub use extract_images::{
21    extract_images_from_pages, extract_images_from_pdf, ExtractImagesOptions, ExtractedImage,
22    ImageExtractor,
23};
24pub use merge::{merge_pdf_files, merge_pdfs, MergeInput, MergeOptions, PdfMerger};
25pub use overlay::{overlay_pdf, OverlayOptions, OverlayPosition, PdfOverlay};
26pub use page_analysis::{AnalysisOptions, ContentAnalysis, PageContentAnalyzer, PageType};
27pub use page_extraction::{
28    extract_page, extract_page_range, extract_page_range_to_file, extract_page_to_file,
29    extract_pages, extract_pages_to_file, PageExtractionOptions, PageExtractor,
30};
31pub use pdf_ocr_converter::{ConversionOptions, ConversionResult, PdfOcrConverter};
32pub use reorder::{
33    move_pdf_page, reorder_pdf_pages, reverse_pdf_pages, swap_pdf_pages, PageReorderer,
34    ReorderOptions,
35};
36pub use rotate::{rotate_all_pages, rotate_pdf_pages, PageRotator, RotateOptions, RotationAngle};
37pub use semantic_redactor::{
38    RedactionConfig, RedactionEntry, RedactionReport, RedactionStyle, SemanticRedactor,
39    SemanticRedactorError, SemanticRedactorResult,
40};
41pub use source_highlighter::{
42    fragment_to_highlight_rect, HighlightStyle, IndexedFragment, SourceHighlighter,
43    SourceHighlighterError, SourceHighlighterResult, TextPositionIndex,
44};
45pub use split::{split_into_pages, split_pdf, PdfSplitter, SplitMode, SplitOptions};
46
47use crate::error::PdfError;
48
49/// Result type for operations
50pub type OperationResult<T> = Result<T, OperationError>;
51
52/// Operation-specific errors
53#[derive(Debug, thiserror::Error)]
54pub enum OperationError {
55    /// Page index out of bounds
56    #[error("Page index {0} out of bounds (document has {1} pages)")]
57    PageIndexOutOfBounds(usize, usize),
58
59    /// Invalid page range
60    #[error("Invalid page range: {0}")]
61    InvalidPageRange(String),
62
63    /// No pages to process
64    #[error("No pages to process")]
65    NoPagesToProcess,
66
67    /// Resource conflict during merge
68    #[error("Resource conflict: {0}")]
69    ResourceConflict(String),
70
71    /// Invalid rotation angle
72    #[error("Invalid rotation angle: {0} (must be 0, 90, 180, or 270)")]
73    InvalidRotation(i32),
74
75    /// Parse error
76    #[error("Parse error: {0}")]
77    ParseError(String),
78
79    /// Invalid file path
80    #[error("Invalid file path: {reason}")]
81    InvalidPath { reason: String },
82
83    /// IO error
84    #[error("IO error: {0}")]
85    Io(#[from] std::io::Error),
86
87    /// Core PDF error
88    #[error("PDF error: {0}")]
89    PdfError(#[from] PdfError),
90
91    /// General processing error
92    #[error("Processing error: {0}")]
93    ProcessingError(String),
94}
95
/// Page range specification.
///
/// Every index stored in a variant is 0-based. [`PageRange::parse`] accepts
/// 1-based page numbers and converts them before building a value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PageRange {
    /// All pages
    All,
    /// Single page (0-based index)
    Single(usize),
    /// Range of pages (inclusive on both ends, 0-based): `Range(start, end)`
    Range(usize, usize),
    /// List of specific pages (0-based indices), kept in the given order
    List(Vec<usize>),
}

109impl PageRange {
110    /// Parse a page range from a string
111    ///
112    /// Examples:
113    /// - "all" -> All pages
114    /// - "1" -> Single page (converts to 0-based)
115    /// - "1-5" -> Range of pages (converts to 0-based)
116    /// - "1,3,5" -> List of pages (converts to 0-based)
117    pub fn parse(s: &str) -> Result<Self, OperationError> {
118        let s = s.trim();
119
120        if s.eq_ignore_ascii_case("all") {
121            return Ok(PageRange::All);
122        }
123
124        // Try single page
125        if let Ok(page) = s.parse::<usize>() {
126            if page == 0 {
127                return Err(OperationError::InvalidPageRange(
128                    "Page numbers start at 1".to_string(),
129                ));
130            }
131            return Ok(PageRange::Single(page - 1));
132        }
133
134        // Try range (e.g., "1-5")
135        if let Some((start, end)) = s.split_once('-') {
136            let start = start
137                .trim()
138                .parse::<usize>()
139                .map_err(|_| OperationError::InvalidPageRange(format!("Invalid start: {start}")))?;
140            let end = end
141                .trim()
142                .parse::<usize>()
143                .map_err(|_| OperationError::InvalidPageRange(format!("Invalid end: {end}")))?;
144
145            if start == 0 || end == 0 {
146                return Err(OperationError::InvalidPageRange(
147                    "Page numbers start at 1".to_string(),
148                ));
149            }
150
151            if start > end {
152                return Err(OperationError::InvalidPageRange(format!(
153                    "Start {start} is greater than end {end}"
154                )));
155            }
156
157            return Ok(PageRange::Range(start - 1, end - 1));
158        }
159
160        // Try list (e.g., "1,3,5")
161        if s.contains(',') {
162            let pages: Result<Vec<usize>, _> = s
163                .split(',')
164                .map(|p| {
165                    let page = p.trim().parse::<usize>().map_err(|_| {
166                        OperationError::InvalidPageRange(format!("Invalid page: {p}"))
167                    })?;
168                    if page == 0 {
169                        return Err(OperationError::InvalidPageRange(
170                            "Page numbers start at 1".to_string(),
171                        ));
172                    }
173                    Ok(page - 1)
174                })
175                .collect();
176
177            return Ok(PageRange::List(pages?));
178        }
179
180        Err(OperationError::InvalidPageRange(format!(
181            "Invalid format: {s}"
182        )))
183    }
184
185    /// Get the page indices for this range
186    pub fn get_indices(&self, total_pages: usize) -> Result<Vec<usize>, OperationError> {
187        match self {
188            PageRange::All => Ok((0..total_pages).collect()),
189            PageRange::Single(idx) => {
190                if *idx >= total_pages {
191                    Err(OperationError::PageIndexOutOfBounds(*idx, total_pages))
192                } else {
193                    Ok(vec![*idx])
194                }
195            }
196            PageRange::Range(start, end) => {
197                if *start >= total_pages {
198                    Err(OperationError::PageIndexOutOfBounds(*start, total_pages))
199                } else if *end >= total_pages {
200                    Err(OperationError::PageIndexOutOfBounds(*end, total_pages))
201                } else {
202                    Ok((*start..=*end).collect())
203                }
204            }
205            PageRange::List(pages) => {
206                for &page in pages {
207                    if page >= total_pages {
208                        return Err(OperationError::PageIndexOutOfBounds(page, total_pages));
209                    }
210                }
211                Ok(pages.clone())
212            }
213        }
214    }
215}
216
// Out-of-line test module, compiled only for test builds.
#[cfg(test)]
mod error_tests;

/// Unit tests for [`PageRange`] parsing and index resolution, the
/// [`OperationError`] messages and conversions, and the module's re-exports.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_page_range_parsing() {
        assert!(matches!(PageRange::parse("all").unwrap(), PageRange::All));
        assert!(matches!(PageRange::parse("ALL").unwrap(), PageRange::All));

        match PageRange::parse("5").unwrap() {
            PageRange::Single(idx) => assert_eq!(idx, 4),
            _ => panic!("Expected Single"),
        }

        match PageRange::parse("2-5").unwrap() {
            PageRange::Range(start, end) => {
                assert_eq!(start, 1);
                assert_eq!(end, 4);
            }
            _ => panic!("Expected Range"),
        }

        match PageRange::parse("1,3,5,7").unwrap() {
            PageRange::List(pages) => {
                assert_eq!(pages, vec![0, 2, 4, 6]);
            }
            _ => panic!("Expected List"),
        }

        assert!(PageRange::parse("0").is_err());
        assert!(PageRange::parse("5-2").is_err());
        assert!(PageRange::parse("invalid").is_err());
    }

    #[test]
    fn test_page_range_indices() {
        let total = 10;

        assert_eq!(
            PageRange::All.get_indices(total).unwrap(),
            vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        );

        assert_eq!(PageRange::Single(5).get_indices(total).unwrap(), vec![5]);

        assert_eq!(
            PageRange::Range(2, 5).get_indices(total).unwrap(),
            vec![2, 3, 4, 5]
        );

        assert_eq!(
            PageRange::List(vec![1, 3, 5]).get_indices(total).unwrap(),
            vec![1, 3, 5]
        );

        assert!(PageRange::Single(10).get_indices(total).is_err());
        assert!(PageRange::Range(8, 15).get_indices(total).is_err());
    }

    #[test]
    fn test_page_range_empty_list() {
        // An empty string matches none of the accepted formats
        assert!(PageRange::parse("").is_err());

        // A list made only of commas has no parseable entries
        assert!(PageRange::parse(",,").is_err());
    }

    #[test]
    fn test_page_range_list_with_zero() {
        // Page numbers are 1-based, so a 0 anywhere in a list is rejected
        match PageRange::parse("1,0,3") {
            Err(OperationError::InvalidPageRange(msg)) => {
                assert!(msg.contains("Page numbers start at 1"));
            }
            other => panic!("Expected InvalidPageRange error, got {other:?}"),
        }
    }

    #[test]
    fn test_page_range_with_extra_spaces() {
        // Whitespace around list entries is ignored
        match PageRange::parse(" 1 , 3 , 5 ").unwrap() {
            PageRange::List(pages) => {
                assert_eq!(pages, vec![0, 2, 4]);
            }
            _ => panic!("Expected List"),
        }

        // Whitespace around range bounds is ignored
        match PageRange::parse(" 2 - 5 ").unwrap() {
            PageRange::Range(start, end) => {
                assert_eq!(start, 1);
                assert_eq!(end, 4);
            }
            _ => panic!("Expected Range"),
        }
    }

    #[test]
    fn test_page_range_equal_start_end() {
        // A range where start == end is valid and selects one page
        match PageRange::parse("5-5").unwrap() {
            PageRange::Range(start, end) => {
                assert_eq!(start, 4);
                assert_eq!(end, 4);
            }
            _ => panic!("Expected Range"),
        }

        // Verify get_indices works correctly
        let range = PageRange::Range(4, 4);
        assert_eq!(range.get_indices(10).unwrap(), vec![4]);
    }

    #[test]
    fn test_page_range_list_out_of_bounds() {
        // The first out-of-bounds entry of a List is the one reported
        let pages = PageRange::List(vec![2, 5, 15]);
        match pages.get_indices(10) {
            Err(OperationError::PageIndexOutOfBounds(idx, total)) => {
                assert_eq!(idx, 15);
                assert_eq!(total, 10);
            }
            other => panic!("Expected PageIndexOutOfBounds error, got {other:?}"),
        }
    }

    #[test]
    fn test_page_range_empty_document() {
        // Test get_indices with 0 total pages
        let total = 0;

        // All should return empty vector
        assert_eq!(
            PageRange::All.get_indices(total).unwrap(),
            Vec::<usize>::new()
        );

        // Single should fail
        assert!(PageRange::Single(0).get_indices(total).is_err());

        // Range should fail
        assert!(PageRange::Range(0, 1).get_indices(total).is_err());

        // Empty list should work
        assert_eq!(
            PageRange::List(vec![]).get_indices(total).unwrap(),
            Vec::<usize>::new()
        );
    }

    #[test]
    fn test_page_range_additional_invalid_formats() {
        assert!(PageRange::parse("1-2-3").is_err()); // Multiple dashes
        assert!(PageRange::parse("abc").is_err()); // Non-numeric
        assert!(PageRange::parse("1.5").is_err()); // Decimal
        assert!(PageRange::parse("-5").is_err()); // Negative without start
        assert!(PageRange::parse("1-").is_err()); // Missing end
        assert!(PageRange::parse("-").is_err()); // Only dash
    }

    #[test]
    fn test_module_exports() {
        // Compile-time check only: the test passes if these paths resolve.
        use super::extract_images::ExtractImagesOptions;
        use super::merge::MergeOptions;
        use super::page_analysis::{AnalysisOptions, PageType};
        use super::page_extraction::PageExtractionOptions;
        use super::rotate::{RotateOptions, RotationAngle};
        use super::split::{SplitMode, SplitOptions};

        // Declared but never initialised; naming the types is the whole test.
        let _extract: ExtractImagesOptions;
        let _merge: MergeOptions;
        let _analysis: AnalysisOptions;
        let _extraction: PageExtractionOptions;
        let _rotate: RotateOptions;
        let _split: SplitOptions;
        let _angle: RotationAngle;
        let _page_type: PageType;
        let _mode: SplitMode;
    }

    #[test]
    fn test_operation_error_variants() {
        let errors = [
            OperationError::PageIndexOutOfBounds(5, 3),
            OperationError::InvalidPageRange("test".to_string()),
            OperationError::NoPagesToProcess,
            OperationError::ResourceConflict("test".to_string()),
            OperationError::InvalidRotation(45),
            OperationError::ParseError("test".to_string()),
            OperationError::ProcessingError("test".to_string()),
        ];

        for error in errors {
            let message = error.to_string();
            assert!(!message.is_empty());
        }
    }

    #[test]
    fn test_page_range_edge_cases() {
        // Test whitespace handling
        assert!(matches!(
            PageRange::parse("  all  ").unwrap(),
            PageRange::All
        ));
        assert!(matches!(
            PageRange::parse(" 5 ").unwrap(),
            PageRange::Single(4)
        ));

        // Test various list formats
        match PageRange::parse(" 1 , 3 , 5 ").unwrap() {
            PageRange::List(pages) => assert_eq!(pages, vec![0, 2, 4]),
            _ => panic!("Expected List"),
        }

        // Test range with spaces
        match PageRange::parse(" 2 - 5 ").unwrap() {
            PageRange::Range(start, end) => {
                assert_eq!(start, 1);
                assert_eq!(end, 4);
            }
            _ => panic!("Expected Range"),
        }
    }

    #[test]
    fn test_page_range_invalid_formats() {
        // Test various invalid formats
        assert!(PageRange::parse("").is_err());
        assert!(PageRange::parse("abc").is_err());
        assert!(PageRange::parse("1-").is_err());
        assert!(PageRange::parse("-5").is_err());
        assert!(PageRange::parse("1-2-3").is_err());
        assert!(PageRange::parse("1,0,3").is_err());
        assert!(PageRange::parse("0-5").is_err());
        assert!(PageRange::parse("5-0").is_err());
        assert!(PageRange::parse("1,,3").is_err());
        assert!(PageRange::parse("1.5").is_err());
    }

    #[test]
    fn test_page_range_get_indices_empty_document() {
        let total = 0;

        assert_eq!(
            PageRange::All.get_indices(total).unwrap(),
            Vec::<usize>::new()
        );
        assert!(PageRange::Single(0).get_indices(total).is_err());
        assert!(PageRange::Range(0, 1).get_indices(total).is_err());
        assert!(PageRange::List(vec![0]).get_indices(total).is_err());
    }

    #[test]
    fn test_page_range_get_indices_single_page_document() {
        let total = 1;

        assert_eq!(PageRange::All.get_indices(total).unwrap(), vec![0]);
        assert_eq!(PageRange::Single(0).get_indices(total).unwrap(), vec![0]);
        assert!(PageRange::Single(1).get_indices(total).is_err());
        assert_eq!(PageRange::Range(0, 0).get_indices(total).unwrap(), vec![0]);
        assert!(PageRange::Range(0, 1).get_indices(total).is_err());
    }

    #[test]
    fn test_page_range_list_duplicates() {
        // Lists can have duplicates in our implementation
        match PageRange::parse("1,1,2,2,3").unwrap() {
            PageRange::List(pages) => {
                assert_eq!(pages, vec![0, 0, 1, 1, 2]);
            }
            _ => panic!("Expected List"),
        }
    }

    #[test]
    fn test_page_range_list_unordered() {
        // Lists don't need to be ordered
        match PageRange::parse("5,2,8,1,3").unwrap() {
            PageRange::List(pages) => {
                assert_eq!(pages, vec![4, 1, 7, 0, 2]);
            }
            _ => panic!("Expected List"),
        }
    }

    #[test]
    fn test_operation_error_display() {
        let error = OperationError::PageIndexOutOfBounds(10, 5);
        assert_eq!(
            error.to_string(),
            "Page index 10 out of bounds (document has 5 pages)"
        );

        let error = OperationError::InvalidRotation(45);
        assert_eq!(
            error.to_string(),
            "Invalid rotation angle: 45 (must be 0, 90, 180, or 270)"
        );

        let error = OperationError::NoPagesToProcess;
        assert_eq!(error.to_string(), "No pages to process");
    }

    #[test]
    fn test_page_range_large_document() {
        let total = 1000;

        // Test all pages
        let indices = PageRange::All.get_indices(total).unwrap();
        assert_eq!(indices.len(), 1000);
        assert_eq!(indices[0], 0);
        assert_eq!(indices[999], 999);

        // Test large range
        let indices = PageRange::Range(100, 200).get_indices(total).unwrap();
        assert_eq!(indices.len(), 101);
        assert_eq!(indices[0], 100);
        assert_eq!(indices[100], 200);
    }

    #[test]
    fn test_page_range_parse_case_insensitive() {
        assert!(matches!(PageRange::parse("all").unwrap(), PageRange::All));
        assert!(matches!(PageRange::parse("ALL").unwrap(), PageRange::All));
        assert!(matches!(PageRange::parse("All").unwrap(), PageRange::All));
        assert!(matches!(PageRange::parse("aLL").unwrap(), PageRange::All));
    }

    #[test]
    fn test_operation_result_type() {
        // Test that OperationResult works correctly
        fn test_function() -> OperationResult<usize> {
            Ok(42)
        }

        fn test_error_function() -> OperationResult<usize> {
            Err(OperationError::NoPagesToProcess)
        }

        assert_eq!(test_function().unwrap(), 42);
        assert!(test_error_function().is_err());
    }

    #[test]
    fn test_page_range_boundary_values() {
        // A very large page number still converts to 0-based correctly
        let large_page = usize::MAX / 2;

        match PageRange::parse(&large_page.to_string()).unwrap() {
            PageRange::Single(idx) => assert_eq!(idx, large_page - 1),
            _ => panic!("Expected Single"),
        }

        // Test with actual document
        let indices = PageRange::Single(5).get_indices(10).unwrap();
        assert_eq!(indices, vec![5]);

        // Test range boundary
        let indices = PageRange::Range(0, 9).get_indices(10).unwrap();
        assert_eq!(indices.len(), 10);
    }

    #[test]
    fn test_error_from_io() {
        use std::io;

        let io_error = io::Error::new(io::ErrorKind::NotFound, "File not found");
        let op_error: OperationError = io_error.into();

        match op_error {
            OperationError::Io(_) => {}
            _ => panic!("Expected Io variant"),
        }
    }

    #[test]
    fn test_page_range_fmt_debug() {
        // Test Debug implementation
        let range = PageRange::All;
        let debug_str = format!("{range:?}");
        assert!(debug_str.contains("All"));

        let range = PageRange::Single(5);
        let debug_str = format!("{range:?}");
        assert!(debug_str.contains("Single"));
        assert!(debug_str.contains("5"));

        let range = PageRange::Range(1, 10);
        let debug_str = format!("{range:?}");
        assert!(debug_str.contains("Range"));

        let range = PageRange::List(vec![1, 2, 3]);
        let debug_str = format!("{range:?}");
        assert!(debug_str.contains("List"));
    }

    #[test]
    fn test_page_range_clone() {
        let original = PageRange::List(vec![1, 2, 3]);
        let cloned = original.clone();

        match (original, cloned) {
            (PageRange::List(orig), PageRange::List(clone)) => {
                assert_eq!(orig, clone);
            }
            _ => panic!("Clone failed"),
        }
    }
}