oxidize_pdf/operations/
split.rs

//! PDF splitting functionality
//!
//! This module splits a PDF document into multiple output files: one file per
//! page, per explicit page range, per fixed-size chunk, or at given split points.
5
6use super::{OperationError, OperationResult, PageRange};
7use crate::parser::page_tree::ParsedPage;
8use crate::parser::{ContentOperation, ContentParser, PdfDocument, PdfReader};
9use crate::{Document, Page};
10use std::fs::File;
11use std::path::{Path, PathBuf};
12
/// Options for PDF splitting.
///
/// Consumed by [`PdfSplitter`]; [`SplitOptions::default`] selects single-page
/// splitting with the `page_{}.pdf` naming pattern.
#[derive(Debug, Clone)]
pub struct SplitOptions {
    /// How to split the document (see [`SplitMode`]).
    pub mode: SplitMode,
    /// Output file naming pattern.
    ///
    /// Placeholders are filled by plain text substitution; every number that
    /// is substituted is 1-based:
    /// - `{}`: the page number for a single page, `start-end` for a
    ///   contiguous range, otherwise the ordinal of the output file
    /// - `{n}`: the ordinal of the output file
    /// - `{page}`: the page number (single-page outputs only)
    /// - `{start}` / `{end}`: first and last page (contiguous ranges only)
    pub output_pattern: String,
    /// Whether to copy the source title, author, subject and keywords into
    /// every output file.
    pub preserve_metadata: bool,
    /// Whether to optimize output files.
    ///
    /// NOTE(review): nothing in this file reads this flag — confirm whether
    /// it is honoured elsewhere or still unimplemented.
    pub optimize: bool,
}
25
26impl Default for SplitOptions {
27    fn default() -> Self {
28        Self {
29            mode: SplitMode::SinglePages,
30            output_pattern: "page_{}.pdf".to_string(),
31            preserve_metadata: true,
32            optimize: false,
33        }
34    }
35}
36
/// Split mode specification.
///
/// Page positions carried by the variants are zero-based indices; they are
/// only turned into 1-based numbers when output file names are formatted.
#[derive(Debug, Clone)]
pub enum SplitMode {
    /// Split into single pages: one output file per page.
    SinglePages,
    /// Split by page ranges: one output file per listed [`PageRange`], in
    /// the order given.
    Ranges(Vec<PageRange>),
    /// Split into chunks of N pages. The final chunk holds whatever pages
    /// remain and may therefore be shorter. N is expected to be at least 1.
    ChunkSize(usize),
    /// Split at specific page indices. Each split point is the index of the
    /// first page of a new file, so the preceding file ends on the page
    /// before it. Split points equal to 0, or at or past the page count, are
    /// ignored.
    SplitAt(Vec<usize>),
}
49
/// PDF splitter.
///
/// Holds a parsed source document together with the [`SplitOptions`] that
/// decide how it is cut up; [`PdfSplitter::split`] writes the output files.
pub struct PdfSplitter {
    /// Parsed source document, backed by a file.
    document: PdfDocument<File>,
    /// Splitting configuration.
    options: SplitOptions,
}
55
56impl PdfSplitter {
57    /// Create a new PDF splitter
58    pub fn new(document: PdfDocument<File>, options: SplitOptions) -> Self {
59        Self { document, options }
60    }
61
62    /// Split the PDF according to the options
63    pub fn split(&mut self) -> OperationResult<Vec<PathBuf>> {
64        let total_pages =
65            self.document
66                .page_count()
67                .map_err(|e| OperationError::ParseError(e.to_string()))? as usize;
68
69        if total_pages == 0 {
70            return Err(OperationError::NoPagesToProcess);
71        }
72
73        let ranges = match &self.options.mode {
74            SplitMode::SinglePages => {
75                // Create a range for each page
76                (0..total_pages).map(PageRange::Single).collect()
77            }
78            SplitMode::Ranges(ranges) => ranges.clone(),
79            SplitMode::ChunkSize(size) => {
80                // Create ranges for chunks
81                let mut ranges = Vec::new();
82                let mut start = 0;
83                while start < total_pages {
84                    let end = (start + size - 1).min(total_pages - 1);
85                    ranges.push(PageRange::Range(start, end));
86                    start += size;
87                }
88                ranges
89            }
90            SplitMode::SplitAt(split_points) => {
91                // Create ranges between split points
92                let mut ranges = Vec::new();
93                let mut start = 0;
94
95                for &split_point in split_points {
96                    if split_point > 0 && split_point < total_pages {
97                        ranges.push(PageRange::Range(start, split_point - 1));
98                        start = split_point;
99                    }
100                }
101
102                // Add the last range
103                if start < total_pages {
104                    ranges.push(PageRange::Range(start, total_pages - 1));
105                }
106
107                ranges
108            }
109        };
110
111        // Process each range
112        let mut output_files = Vec::new();
113
114        for (index, range) in ranges.iter().enumerate() {
115            let output_path = self.format_output_path(index, range);
116            self.extract_range(range, &output_path)?;
117            output_files.push(output_path);
118        }
119
120        Ok(output_files)
121    }
122
123    /// Extract a page range to a new PDF file
124    fn extract_range(&mut self, range: &PageRange, output_path: &Path) -> OperationResult<()> {
125        let total_pages =
126            self.document
127                .page_count()
128                .map_err(|e| OperationError::ParseError(e.to_string()))? as usize;
129
130        let indices = range.get_indices(total_pages)?;
131        if indices.is_empty() {
132            return Err(OperationError::NoPagesToProcess);
133        }
134
135        // Create new document
136        let mut doc = Document::new();
137
138        // Copy metadata if requested
139        if self.options.preserve_metadata {
140            if let Ok(metadata) = self.document.metadata() {
141                if let Some(title) = metadata.title {
142                    doc.set_title(&title);
143                }
144                if let Some(author) = metadata.author {
145                    doc.set_author(&author);
146                }
147                if let Some(subject) = metadata.subject {
148                    doc.set_subject(&subject);
149                }
150                if let Some(keywords) = metadata.keywords {
151                    doc.set_keywords(&keywords);
152                }
153            }
154        }
155
156        // Extract and add pages
157        for &page_idx in &indices {
158            let parsed_page = self
159                .document
160                .get_page(page_idx as u32)
161                .map_err(|e| OperationError::ParseError(e.to_string()))?;
162
163            let page = self.convert_page(&parsed_page)?;
164            doc.add_page(page);
165        }
166
167        // Save the document
168        doc.save(output_path)?;
169
170        Ok(())
171    }
172
173    /// Convert a parsed page to a new page
174    fn convert_page(&mut self, parsed_page: &ParsedPage) -> OperationResult<Page> {
175        // Create new page with same dimensions
176        let width = parsed_page.width();
177        let height = parsed_page.height();
178        let mut page = Page::new(width, height);
179
180        // Set rotation if needed
181        if parsed_page.rotation != 0 {
182            page.set_rotation(parsed_page.rotation);
183        }
184
185        // Get content streams
186        let content_streams = self
187            .document
188            .get_page_content_streams(parsed_page)
189            .map_err(|e| OperationError::ParseError(e.to_string()))?;
190
191        // Parse and process content streams
192        let mut has_content = false;
193        for stream_data in &content_streams {
194            match ContentParser::parse_content(stream_data) {
195                Ok(operators) => {
196                    // Process the operators to recreate content
197                    self.process_operators(&mut page, &operators)?;
198                    has_content = true;
199                }
200                Err(e) => {
201                    // If parsing fails, fall back to placeholder
202                    eprintln!("Warning: Failed to parse content stream: {e}");
203                }
204            }
205        }
206
207        // If no content was successfully processed, add a placeholder
208        if !has_content {
209            page.text()
210                .set_font(crate::text::Font::Helvetica, 10.0)
211                .at(50.0, height - 50.0)
212                .write("[Page extracted - content reconstruction in progress]")
213                .map_err(OperationError::PdfError)?;
214        }
215
216        Ok(page)
217    }
218
219    /// Process content operators to recreate page content
220    fn process_operators(
221        &self,
222        page: &mut Page,
223        operators: &[ContentOperation],
224    ) -> OperationResult<()> {
225        // Track graphics state
226        let mut text_object = false;
227        let mut current_font = crate::text::Font::Helvetica;
228        let mut current_font_size = 12.0;
229        let mut current_x = 0.0;
230        let mut current_y = 0.0;
231
232        for operator in operators {
233            match operator {
234                ContentOperation::BeginText => {
235                    text_object = true;
236                }
237                ContentOperation::EndText => {
238                    text_object = false;
239                }
240                ContentOperation::SetFont(name, size) => {
241                    // Map PDF font names to our fonts
242                    current_font = match name.as_str() {
243                        "Times-Roman" => crate::text::Font::TimesRoman,
244                        "Times-Bold" => crate::text::Font::TimesBold,
245                        "Times-Italic" => crate::text::Font::TimesItalic,
246                        "Times-BoldItalic" => crate::text::Font::TimesBoldItalic,
247                        "Helvetica-Bold" => crate::text::Font::HelveticaBold,
248                        "Helvetica-Oblique" => crate::text::Font::HelveticaOblique,
249                        "Helvetica-BoldOblique" => crate::text::Font::HelveticaBoldOblique,
250                        "Courier" => crate::text::Font::Courier,
251                        "Courier-Bold" => crate::text::Font::CourierBold,
252                        "Courier-Oblique" => crate::text::Font::CourierOblique,
253                        "Courier-BoldOblique" => crate::text::Font::CourierBoldOblique,
254                        _ => crate::text::Font::Helvetica, // Default fallback
255                    };
256                    current_font_size = *size;
257                }
258                ContentOperation::MoveText(tx, ty) => {
259                    current_x += tx;
260                    current_y += ty;
261                }
262                ContentOperation::ShowText(text_bytes) => {
263                    if text_object {
264                        // Convert bytes to string (assuming ASCII/UTF-8 for now)
265                        if let Ok(text) = String::from_utf8(text_bytes.clone()) {
266                            page.text()
267                                .set_font(current_font.clone(), current_font_size as f64)
268                                .at(current_x as f64, current_y as f64)
269                                .write(&text)
270                                .map_err(OperationError::PdfError)?;
271                        }
272                    }
273                }
274                ContentOperation::Rectangle(x, y, width, height) => {
275                    page.graphics()
276                        .rect(*x as f64, *y as f64, *width as f64, *height as f64);
277                }
278                ContentOperation::MoveTo(x, y) => {
279                    page.graphics().move_to(*x as f64, *y as f64);
280                }
281                ContentOperation::LineTo(x, y) => {
282                    page.graphics().line_to(*x as f64, *y as f64);
283                }
284                ContentOperation::Stroke => {
285                    page.graphics().stroke();
286                }
287                ContentOperation::Fill => {
288                    page.graphics().fill();
289                }
290                ContentOperation::SetNonStrokingRGB(r, g, b) => {
291                    page.graphics().set_fill_color(crate::graphics::Color::Rgb(
292                        *r as f64, *g as f64, *b as f64,
293                    ));
294                }
295                ContentOperation::SetStrokingRGB(r, g, b) => {
296                    page.graphics()
297                        .set_stroke_color(crate::graphics::Color::Rgb(
298                            *r as f64, *g as f64, *b as f64,
299                        ));
300                }
301                ContentOperation::SetLineWidth(width) => {
302                    page.graphics().set_line_width(*width as f64);
303                }
304                // Note: Additional operators can be implemented on demand
305                _ => {
306                    // Silently skip unimplemented operators for now
307                }
308            }
309        }
310
311        Ok(())
312    }
313
314    /// Format the output path based on the pattern
315    fn format_output_path(&self, index: usize, range: &PageRange) -> PathBuf {
316        let filename = match range {
317            PageRange::Single(page) => self
318                .options
319                .output_pattern
320                .replace("{}", &(page + 1).to_string())
321                .replace("{n}", &(index + 1).to_string())
322                .replace("{page}", &(page + 1).to_string()),
323            PageRange::Range(start, end) => self
324                .options
325                .output_pattern
326                .replace("{}", &format!("{}-{}", start + 1, end + 1))
327                .replace("{n}", &(index + 1).to_string())
328                .replace("{start}", &(start + 1).to_string())
329                .replace("{end}", &(end + 1).to_string()),
330            _ => self
331                .options
332                .output_pattern
333                .replace("{}", &(index + 1).to_string())
334                .replace("{n}", &(index + 1).to_string()),
335        };
336
337        PathBuf::from(filename)
338    }
339}
340
341/// Split a PDF file by page ranges
342pub fn split_pdf<P: AsRef<Path>>(
343    input_path: P,
344    options: SplitOptions,
345) -> OperationResult<Vec<PathBuf>> {
346    let document = PdfReader::open_document(input_path)
347        .map_err(|e| OperationError::ParseError(e.to_string()))?;
348
349    let mut splitter = PdfSplitter::new(document, options);
350    splitter.split()
351}
352
353/// Split a PDF file into single pages
354pub fn split_into_pages<P: AsRef<Path>>(
355    input_path: P,
356    output_pattern: &str,
357) -> OperationResult<Vec<PathBuf>> {
358    let options = SplitOptions {
359        mode: SplitMode::SinglePages,
360        output_pattern: output_pattern.to_string(),
361        ..Default::default()
362    };
363
364    split_pdf(input_path, options)
365}
366
#[cfg(test)]
mod tests {
    //! Unit tests for the option and mode types of the split module.
    //!
    //! These tests build values only; none of them opens or writes a PDF.

    use super::*;

    /// The defaults are single-page mode, `page_{}.pdf`, metadata kept,
    /// optimization off.
    #[test]
    fn test_split_options_default() {
        let options = SplitOptions::default();
        assert!(matches!(options.mode, SplitMode::SinglePages));
        assert_eq!(options.output_pattern, "page_{}.pdf");
        assert!(options.preserve_metadata);
        assert!(!options.optimize);
    }

    /// A custom pattern is stored verbatim next to the default mode.
    ///
    /// Formatting an actual output path needs a `PdfSplitter`, which in turn
    /// needs a parsed PDF, so only the stored options are checked here. No
    /// file is opened.
    #[test]
    fn test_format_output_path() {
        let options = SplitOptions {
            output_pattern: "output_page_{}.pdf".to_string(),
            ..Default::default()
        };

        assert_eq!(options.output_pattern, "output_page_{}.pdf");
        assert!(options.output_pattern.contains("{}"));
        assert!(matches!(options.mode, SplitMode::SinglePages));
    }

    // ============= Additional Split Tests =============

    /// Every `SplitMode` variant can be built and matched.
    #[test]
    fn test_split_mode_variants() {
        // Test SinglePages variant
        let single_pages = SplitMode::SinglePages;
        assert!(matches!(single_pages, SplitMode::SinglePages));

        // Test Ranges variant
        let ranges = SplitMode::Ranges(vec![PageRange::Single(0), PageRange::Range(5, 10)]);
        assert!(matches!(ranges, SplitMode::Ranges(_)));

        // Test ChunkSize variant
        let chunk = SplitMode::ChunkSize(5);
        match chunk {
            SplitMode::ChunkSize(size) => assert_eq!(size, 5),
            _ => panic!("Expected ChunkSize"),
        }

        // Test SplitAt variant
        let split_at = SplitMode::SplitAt(vec![5, 10, 15]);
        assert!(matches!(split_at, SplitMode::SplitAt(_)));
    }

    /// Options built field by field keep every value.
    #[test]
    fn test_split_options_with_modes() {
        let options = SplitOptions {
            mode: SplitMode::ChunkSize(10),
            output_pattern: "chunk_{}.pdf".to_string(),
            preserve_metadata: true,
            optimize: true,
        };

        assert!(matches!(options.mode, SplitMode::ChunkSize(10)));
        assert_eq!(options.output_pattern, "chunk_{}.pdf");
        assert!(options.preserve_metadata);
        assert!(options.optimize);
    }

    /// `Ranges` accepts every kind of `PageRange`.
    #[test]
    fn test_split_options_page_range() {
        let ranges = vec![
            PageRange::All,
            PageRange::Single(5),
            PageRange::Range(10, 20),
            PageRange::List(vec![1, 3, 5, 7, 9]),
        ];

        let options = SplitOptions {
            mode: SplitMode::Ranges(ranges),
            ..Default::default()
        };

        match options.mode {
            SplitMode::Ranges(r) => assert_eq!(r.len(), 4),
            _ => panic!("Expected Ranges mode"),
        }
    }

    /// `SplitAt` stores its split points unchanged.
    #[test]
    fn test_split_options_split_at() {
        let split_points = vec![3, 6, 9, 12]; // Split at these page numbers

        let options = SplitOptions {
            mode: SplitMode::SplitAt(split_points.clone()),
            output_pattern: "part_{}.pdf".to_string(),
            ..Default::default()
        };

        match options.mode {
            SplitMode::SplitAt(points) => {
                assert_eq!(points.len(), 4);
                assert_eq!(points, split_points);
            }
            _ => panic!("Expected SplitAt mode"),
        }
    }

    /// Patterns with a placeholder are stored as given.
    #[test]
    fn test_output_pattern_formatting() {
        // Test various output patterns
        let patterns = [
            "output_{}.pdf",
            "page_{}.pdf",
            "document_part_{}.pdf",
            "{}_split.pdf",
        ];

        for pattern in patterns {
            let options = SplitOptions {
                output_pattern: pattern.to_string(),
                ..Default::default()
            };
            assert!(options.output_pattern.contains('{')); // Just check for placeholder
        }
    }

    /// The `preserve_metadata` flag can be set either way.
    #[test]
    fn test_split_options_preserve_metadata() {
        let with_metadata = SplitOptions {
            preserve_metadata: true,
            ..Default::default()
        };
        assert!(with_metadata.preserve_metadata);

        let without_metadata = SplitOptions {
            preserve_metadata: false,
            ..Default::default()
        };
        assert!(!without_metadata.preserve_metadata);
    }

    /// Single-page mode with a custom pattern.
    ///
    /// Note: `format_output_path` only substitutes the literal tokens `{}`,
    /// `{n}`, `{page}`, `{start}` and `{end}`, so the `{:04}` used here is
    /// not interpreted as a width specifier. This test only checks storage.
    #[test]
    fn test_split_single_pages_mode() {
        let options = SplitOptions {
            mode: SplitMode::SinglePages,
            output_pattern: "page_{:04}.pdf".to_string(),
            ..Default::default()
        };

        assert!(matches!(options.mode, SplitMode::SinglePages));
        assert!(options.output_pattern.contains('{'));
    }

    /// Positive chunk sizes round-trip through `ChunkSize`.
    #[test]
    fn test_split_chunk_size_validation() {
        // Test various chunk sizes
        let chunk_sizes = [1, 5, 10, 50, 100];

        for size in chunk_sizes {
            let options = SplitOptions {
                mode: SplitMode::ChunkSize(size),
                ..Default::default()
            };

            match options.mode {
                SplitMode::ChunkSize(s) => {
                    assert_eq!(s, size);
                    assert!(s > 0); // Chunk size should be positive
                }
                // Fail loudly rather than passing on the wrong variant.
                _ => panic!("Expected ChunkSize mode"),
            }
        }
    }

    /// The `optimize` flag can be set either way.
    #[test]
    fn test_split_options_optimization() {
        let optimized = SplitOptions {
            optimize: true,
            ..Default::default()
        };
        assert!(optimized.optimize);

        let not_optimized = SplitOptions {
            optimize: false,
            ..Default::default()
        };
        assert!(!not_optimized.optimize);
    }

    /// A custom pattern overrides the default one.
    #[test]
    fn test_split_options_with_custom_pattern() {
        let options = SplitOptions {
            output_pattern: "document_part_{}.pdf".to_string(),
            ..Default::default()
        };
        assert_eq!(options.output_pattern, "document_part_{}.pdf");
    }

    /// `Ranges` keeps its entries in order.
    #[test]
    fn test_split_mode_ranges() {
        let ranges = vec![
            PageRange::Single(0),
            PageRange::Range(1, 3),
            PageRange::Single(5),
        ];
        let mode = SplitMode::Ranges(ranges);

        match mode {
            SplitMode::Ranges(r) => {
                assert_eq!(r.len(), 3);
                assert!(matches!(r[0], PageRange::Single(0)));
                assert!(matches!(r[1], PageRange::Range(1, 3)));
                assert!(matches!(r[2], PageRange::Single(5)));
            }
            _ => panic!("Wrong mode"),
        }
    }

    /// `SplitAt` keeps its points in order.
    #[test]
    fn test_split_mode_split_at() {
        let split_points = vec![5, 10, 15];
        let mode = SplitMode::SplitAt(split_points.clone());

        match mode {
            SplitMode::SplitAt(points) => assert_eq!(points, split_points),
            _ => panic!("Wrong mode"),
        }
    }

    /// Textual page ranges are 1-based and parse to 0-based values.
    #[test]
    fn test_page_range_parse() {
        // Test all pages
        let range = PageRange::parse("all").unwrap();
        assert!(matches!(range, PageRange::All));

        // Test single page
        let range = PageRange::parse("5").unwrap();
        assert!(matches!(range, PageRange::Single(4))); // 0-indexed

        // Test range
        let range = PageRange::parse("3-7").unwrap();
        assert!(matches!(range, PageRange::Range(2, 6))); // 0-indexed

        // Test list
        let range = PageRange::parse("1,3,5").unwrap();
        match range {
            PageRange::List(pages) => assert_eq!(pages, vec![0, 2, 4]),
            _ => panic!("Expected List"),
        }
    }

    /// Malformed page ranges are rejected.
    #[test]
    fn test_page_range_invalid_parse() {
        assert!(PageRange::parse("").is_err());
        assert!(PageRange::parse("abc").is_err());
        assert!(PageRange::parse("5-3").is_err()); // Invalid range
        assert!(PageRange::parse("0").is_err()); // Page numbers start at 1
    }

    /// All four fields can be set explicitly.
    #[test]
    fn test_split_options_all_fields() {
        let options = SplitOptions {
            mode: SplitMode::ChunkSize(5),
            output_pattern: "chunk_{}.pdf".to_string(),
            preserve_metadata: false,
            optimize: true,
        };

        match options.mode {
            SplitMode::ChunkSize(size) => assert_eq!(size, 5),
            _ => panic!("Wrong mode"),
        }
        assert_eq!(options.output_pattern, "chunk_{}.pdf");
        assert!(!options.preserve_metadata);
        assert!(options.optimize);
    }

    /// Smallest and large chunk sizes are stored as given.
    #[test]
    fn test_split_mode_chunk_size_edge_cases() {
        // Chunk size of 1 should be like single pages
        let mode = SplitMode::ChunkSize(1);
        match mode {
            SplitMode::ChunkSize(size) => assert_eq!(size, 1),
            _ => panic!("Wrong mode"),
        }

        // Large chunk size
        let mode = SplitMode::ChunkSize(1000);
        match mode {
            SplitMode::ChunkSize(size) => assert_eq!(size, 1000),
            _ => panic!("Wrong mode"),
        }
    }

    /// `Ranges` may be built from an empty list.
    #[test]
    fn test_split_mode_empty_ranges() {
        let ranges = Vec::new();
        let mode = SplitMode::Ranges(ranges);

        match mode {
            SplitMode::Ranges(r) => assert!(r.is_empty()),
            _ => panic!("Wrong mode"),
        }
    }

    /// `SplitAt` may be built from an empty list.
    #[test]
    fn test_split_mode_empty_split_points() {
        let split_points = Vec::new();
        let mode = SplitMode::SplitAt(split_points);

        match mode {
            SplitMode::SplitAt(points) => assert!(points.is_empty()),
            _ => panic!("Wrong mode"),
        }
    }
}
681
682#[cfg(test)]
683#[path = "split_tests.rs"]
684mod split_tests;