oxidize_pdf/operations/
split.rs

//! PDF splitting functionality
//!
//! This module provides functionality to split PDF documents into multiple files
//! based on page ranges or other criteria.
5
6use super::{OperationError, OperationResult, PageRange};
7use crate::parser::page_tree::ParsedPage;
8use crate::parser::{ContentOperation, ContentParser, PdfDocument, PdfReader};
9use crate::{Document, Page};
10use std::fs::File;
11use std::path::{Path, PathBuf};
12
13/// Options for PDF splitting
14#[derive(Debug, Clone)]
15pub struct SplitOptions {
16    /// How to split the document
17    pub mode: SplitMode,
18    /// Output file naming pattern
19    pub output_pattern: String,
20    /// Whether to preserve document metadata
21    pub preserve_metadata: bool,
22    /// Whether to optimize output files
23    pub optimize: bool,
24}
25
26impl Default for SplitOptions {
27    fn default() -> Self {
28        Self {
29            mode: SplitMode::SinglePages,
30            output_pattern: "page_{}.pdf".to_string(),
31            preserve_metadata: true,
32            optimize: false,
33        }
34    }
35}
36
37/// Split mode specification
38#[derive(Debug, Clone)]
39pub enum SplitMode {
40    /// Split into single pages
41    SinglePages,
42    /// Split by page ranges
43    Ranges(Vec<PageRange>),
44    /// Split into chunks of N pages
45    ChunkSize(usize),
46    /// Split at specific page numbers (creates files before each split point)
47    SplitAt(Vec<usize>),
48}
49
50/// PDF splitter
51pub struct PdfSplitter {
52    document: PdfDocument<File>,
53    options: SplitOptions,
54}
55
56impl PdfSplitter {
57    /// Create a new PDF splitter
58    pub fn new(document: PdfDocument<File>, options: SplitOptions) -> Self {
59        Self { document, options }
60    }
61
62    /// Split the PDF according to the options
63    pub fn split(&mut self) -> OperationResult<Vec<PathBuf>> {
64        let total_pages =
65            self.document
66                .page_count()
67                .map_err(|e| OperationError::ParseError(e.to_string()))? as usize;
68
69        if total_pages == 0 {
70            return Err(OperationError::NoPagesToProcess);
71        }
72
73        let ranges = match &self.options.mode {
74            SplitMode::SinglePages => {
75                // Create a range for each page
76                (0..total_pages).map(PageRange::Single).collect()
77            }
78            SplitMode::Ranges(ranges) => ranges.clone(),
79            SplitMode::ChunkSize(size) => {
80                // Create ranges for chunks
81                let mut ranges = Vec::new();
82                let mut start = 0;
83                while start < total_pages {
84                    let end = (start + size - 1).min(total_pages - 1);
85                    ranges.push(PageRange::Range(start, end));
86                    start += size;
87                }
88                ranges
89            }
90            SplitMode::SplitAt(split_points) => {
91                // Create ranges between split points
92                let mut ranges = Vec::new();
93                let mut start = 0;
94
95                for &split_point in split_points {
96                    if split_point > 0 && split_point < total_pages {
97                        ranges.push(PageRange::Range(start, split_point - 1));
98                        start = split_point;
99                    }
100                }
101
102                // Add the last range
103                if start < total_pages {
104                    ranges.push(PageRange::Range(start, total_pages - 1));
105                }
106
107                ranges
108            }
109        };
110
111        // Process each range
112        let mut output_files = Vec::new();
113
114        for (index, range) in ranges.iter().enumerate() {
115            let output_path = self.format_output_path(index, range);
116            self.extract_range(range, &output_path)?;
117            output_files.push(output_path);
118        }
119
120        Ok(output_files)
121    }
122
123    /// Extract a page range to a new PDF file
124    fn extract_range(&mut self, range: &PageRange, output_path: &Path) -> OperationResult<()> {
125        let total_pages =
126            self.document
127                .page_count()
128                .map_err(|e| OperationError::ParseError(e.to_string()))? as usize;
129
130        let indices = range.get_indices(total_pages)?;
131        if indices.is_empty() {
132            return Err(OperationError::NoPagesToProcess);
133        }
134
135        // Create new document
136        let mut doc = Document::new();
137
138        // Copy metadata if requested
139        if self.options.preserve_metadata {
140            if let Ok(metadata) = self.document.metadata() {
141                if let Some(title) = metadata.title {
142                    doc.set_title(&title);
143                }
144                if let Some(author) = metadata.author {
145                    doc.set_author(&author);
146                }
147                if let Some(subject) = metadata.subject {
148                    doc.set_subject(&subject);
149                }
150                if let Some(keywords) = metadata.keywords {
151                    doc.set_keywords(&keywords);
152                }
153            }
154        }
155
156        // Extract and add pages
157        for &page_idx in &indices {
158            let parsed_page = self
159                .document
160                .get_page(page_idx as u32)
161                .map_err(|e| OperationError::ParseError(e.to_string()))?;
162
163            let page = self.convert_page(&parsed_page)?;
164            doc.add_page(page);
165        }
166
167        // Save the document
168        doc.save(output_path)?;
169
170        Ok(())
171    }
172
173    /// Convert a parsed page to a new page
174    fn convert_page(&mut self, parsed_page: &ParsedPage) -> OperationResult<Page> {
175        // Create new page with same dimensions
176        let width = parsed_page.width();
177        let height = parsed_page.height();
178        let mut page = Page::new(width, height);
179
180        // Set rotation if needed
181        if parsed_page.rotation != 0 {
182            // TODO: Implement rotation in Page
183            // For now, we'll handle this when we implement the rotation feature
184        }
185
186        // Get content streams
187        let content_streams = self
188            .document
189            .get_page_content_streams(parsed_page)
190            .map_err(|e| OperationError::ParseError(e.to_string()))?;
191
192        // Parse and process content streams
193        let mut has_content = false;
194        for stream_data in &content_streams {
195            match ContentParser::parse_content(stream_data) {
196                Ok(operators) => {
197                    // Process the operators to recreate content
198                    self.process_operators(&mut page, &operators)?;
199                    has_content = true;
200                }
201                Err(e) => {
202                    // If parsing fails, fall back to placeholder
203                    eprintln!("Warning: Failed to parse content stream: {e}");
204                }
205            }
206        }
207
208        // If no content was successfully processed, add a placeholder
209        if !has_content {
210            page.text()
211                .set_font(crate::text::Font::Helvetica, 10.0)
212                .at(50.0, height - 50.0)
213                .write("[Page extracted - content reconstruction in progress]")
214                .map_err(OperationError::PdfError)?;
215        }
216
217        Ok(page)
218    }
219
220    /// Process content operators to recreate page content
221    fn process_operators(
222        &self,
223        page: &mut Page,
224        operators: &[ContentOperation],
225    ) -> OperationResult<()> {
226        // Track graphics state
227        let mut text_object = false;
228        let mut current_font = crate::text::Font::Helvetica;
229        let mut current_font_size = 12.0;
230        let mut current_x = 0.0;
231        let mut current_y = 0.0;
232
233        for operator in operators {
234            match operator {
235                ContentOperation::BeginText => {
236                    text_object = true;
237                }
238                ContentOperation::EndText => {
239                    text_object = false;
240                }
241                ContentOperation::SetFont(name, size) => {
242                    // Map PDF font names to our fonts
243                    current_font = match name.as_str() {
244                        "Times-Roman" => crate::text::Font::TimesRoman,
245                        "Times-Bold" => crate::text::Font::TimesBold,
246                        "Times-Italic" => crate::text::Font::TimesItalic,
247                        "Times-BoldItalic" => crate::text::Font::TimesBoldItalic,
248                        "Helvetica-Bold" => crate::text::Font::HelveticaBold,
249                        "Helvetica-Oblique" => crate::text::Font::HelveticaOblique,
250                        "Helvetica-BoldOblique" => crate::text::Font::HelveticaBoldOblique,
251                        "Courier" => crate::text::Font::Courier,
252                        "Courier-Bold" => crate::text::Font::CourierBold,
253                        "Courier-Oblique" => crate::text::Font::CourierOblique,
254                        "Courier-BoldOblique" => crate::text::Font::CourierBoldOblique,
255                        _ => crate::text::Font::Helvetica, // Default fallback
256                    };
257                    current_font_size = *size;
258                }
259                ContentOperation::MoveText(tx, ty) => {
260                    current_x += tx;
261                    current_y += ty;
262                }
263                ContentOperation::ShowText(text_bytes) => {
264                    if text_object {
265                        // Convert bytes to string (assuming ASCII/UTF-8 for now)
266                        if let Ok(text) = String::from_utf8(text_bytes.clone()) {
267                            page.text()
268                                .set_font(current_font, current_font_size as f64)
269                                .at(current_x as f64, current_y as f64)
270                                .write(&text)
271                                .map_err(OperationError::PdfError)?;
272                        }
273                    }
274                }
275                ContentOperation::Rectangle(x, y, width, height) => {
276                    page.graphics()
277                        .rect(*x as f64, *y as f64, *width as f64, *height as f64);
278                }
279                ContentOperation::MoveTo(x, y) => {
280                    page.graphics().move_to(*x as f64, *y as f64);
281                }
282                ContentOperation::LineTo(x, y) => {
283                    page.graphics().line_to(*x as f64, *y as f64);
284                }
285                ContentOperation::Stroke => {
286                    page.graphics().stroke();
287                }
288                ContentOperation::Fill => {
289                    page.graphics().fill();
290                }
291                ContentOperation::SetNonStrokingRGB(r, g, b) => {
292                    page.graphics().set_fill_color(crate::graphics::Color::Rgb(
293                        *r as f64, *g as f64, *b as f64,
294                    ));
295                }
296                ContentOperation::SetStrokingRGB(r, g, b) => {
297                    page.graphics()
298                        .set_stroke_color(crate::graphics::Color::Rgb(
299                            *r as f64, *g as f64, *b as f64,
300                        ));
301                }
302                ContentOperation::SetLineWidth(width) => {
303                    page.graphics().set_line_width(*width as f64);
304                }
305                // TODO: Implement more operators as needed
306                _ => {
307                    // Silently skip unimplemented operators for now
308                }
309            }
310        }
311
312        Ok(())
313    }
314
315    /// Format the output path based on the pattern
316    fn format_output_path(&self, index: usize, range: &PageRange) -> PathBuf {
317        let filename = match range {
318            PageRange::Single(page) => self
319                .options
320                .output_pattern
321                .replace("{}", &(page + 1).to_string())
322                .replace("{n}", &(index + 1).to_string())
323                .replace("{page}", &(page + 1).to_string()),
324            PageRange::Range(start, end) => self
325                .options
326                .output_pattern
327                .replace("{}", &format!("{}-{}", start + 1, end + 1))
328                .replace("{n}", &(index + 1).to_string())
329                .replace("{start}", &(start + 1).to_string())
330                .replace("{end}", &(end + 1).to_string()),
331            _ => self
332                .options
333                .output_pattern
334                .replace("{}", &(index + 1).to_string())
335                .replace("{n}", &(index + 1).to_string()),
336        };
337
338        PathBuf::from(filename)
339    }
340}
341
342/// Split a PDF file by page ranges
343pub fn split_pdf<P: AsRef<Path>>(
344    input_path: P,
345    options: SplitOptions,
346) -> OperationResult<Vec<PathBuf>> {
347    let document = PdfReader::open_document(input_path)
348        .map_err(|e| OperationError::ParseError(e.to_string()))?;
349
350    let mut splitter = PdfSplitter::new(document, options);
351    splitter.split()
352}
353
354/// Split a PDF file into single pages
355pub fn split_into_pages<P: AsRef<Path>>(
356    input_path: P,
357    output_pattern: &str,
358) -> OperationResult<Vec<PathBuf>> {
359    let options = SplitOptions {
360        mode: SplitMode::SinglePages,
361        output_pattern: output_pattern.to_string(),
362        ..Default::default()
363    };
364
365    split_pdf(input_path, options)
366}
367
#[cfg(test)]
mod tests {
    use super::*;

    /// The default options split into single pages named `page_{}.pdf`,
    /// keep metadata and do not optimize.
    #[test]
    fn test_split_options_default() {
        let options = SplitOptions::default();
        assert!(matches!(options.mode, SplitMode::SinglePages));
        assert_eq!(options.output_pattern, "page_{}.pdf");
        assert!(options.preserve_metadata);
        assert!(!options.optimize);
    }

    /// A custom output pattern is stored verbatim and leaves the remaining
    /// options at their defaults.
    ///
    /// TODO: exercising `PdfSplitter::format_output_path` itself requires a
    /// splitter, which in turn needs a parsed PDF document; add that coverage
    /// once a PDF fixture is available to the test suite.
    #[test]
    fn test_format_output_path() {
        let options = SplitOptions {
            output_pattern: "output_page_{}.pdf".to_string(),
            ..Default::default()
        };

        assert_eq!(options.output_pattern, "output_page_{}.pdf");
        assert!(matches!(options.mode, SplitMode::SinglePages));
        assert!(options.preserve_metadata);
        assert!(!options.optimize);
    }
}