1use super::{OperationError, OperationResult, PageRange};
7use crate::parser::page_tree::ParsedPage;
8use crate::parser::{ContentOperation, ContentParser, PdfDocument, PdfReader};
9use crate::{Document, Page};
10use std::fs::File;
11use std::path::{Path, PathBuf};
12
13#[derive(Debug, Clone)]
15pub struct SplitOptions {
16 pub mode: SplitMode,
18 pub output_pattern: String,
20 pub preserve_metadata: bool,
22 pub optimize: bool,
24}
25
26impl Default for SplitOptions {
27 fn default() -> Self {
28 Self {
29 mode: SplitMode::SinglePages,
30 output_pattern: "page_{}.pdf".to_string(),
31 preserve_metadata: true,
32 optimize: false,
33 }
34 }
35}
36
37#[derive(Debug, Clone)]
39pub enum SplitMode {
40 SinglePages,
42 Ranges(Vec<PageRange>),
44 ChunkSize(usize),
46 SplitAt(Vec<usize>),
48}
49
50pub struct PdfSplitter {
52 document: PdfDocument<File>,
53 options: SplitOptions,
54}
55
56impl PdfSplitter {
57 pub fn new(document: PdfDocument<File>, options: SplitOptions) -> Self {
59 Self { document, options }
60 }
61
62 pub fn split(&mut self) -> OperationResult<Vec<PathBuf>> {
64 let total_pages =
65 self.document
66 .page_count()
67 .map_err(|e| OperationError::ParseError(e.to_string()))? as usize;
68
69 if total_pages == 0 {
70 return Err(OperationError::NoPagesToProcess);
71 }
72
73 let ranges = match &self.options.mode {
74 SplitMode::SinglePages => {
75 (0..total_pages).map(PageRange::Single).collect()
77 }
78 SplitMode::Ranges(ranges) => ranges.clone(),
79 SplitMode::ChunkSize(size) => {
80 let mut ranges = Vec::new();
82 let mut start = 0;
83 while start < total_pages {
84 let end = (start + size - 1).min(total_pages - 1);
85 ranges.push(PageRange::Range(start, end));
86 start += size;
87 }
88 ranges
89 }
90 SplitMode::SplitAt(split_points) => {
91 let mut ranges = Vec::new();
93 let mut start = 0;
94
95 for &split_point in split_points {
96 if split_point > 0 && split_point < total_pages {
97 ranges.push(PageRange::Range(start, split_point - 1));
98 start = split_point;
99 }
100 }
101
102 if start < total_pages {
104 ranges.push(PageRange::Range(start, total_pages - 1));
105 }
106
107 ranges
108 }
109 };
110
111 let mut output_files = Vec::new();
113
114 for (index, range) in ranges.iter().enumerate() {
115 let output_path = self.format_output_path(index, range);
116 self.extract_range(range, &output_path)?;
117 output_files.push(output_path);
118 }
119
120 Ok(output_files)
121 }
122
123 fn extract_range(&mut self, range: &PageRange, output_path: &Path) -> OperationResult<()> {
125 let total_pages =
126 self.document
127 .page_count()
128 .map_err(|e| OperationError::ParseError(e.to_string()))? as usize;
129
130 let indices = range.get_indices(total_pages)?;
131 if indices.is_empty() {
132 return Err(OperationError::NoPagesToProcess);
133 }
134
135 let mut doc = Document::new();
137
138 if self.options.preserve_metadata {
140 if let Ok(metadata) = self.document.metadata() {
141 if let Some(title) = metadata.title {
142 doc.set_title(&title);
143 }
144 if let Some(author) = metadata.author {
145 doc.set_author(&author);
146 }
147 if let Some(subject) = metadata.subject {
148 doc.set_subject(&subject);
149 }
150 if let Some(keywords) = metadata.keywords {
151 doc.set_keywords(&keywords);
152 }
153 }
154 }
155
156 for &page_idx in &indices {
158 let parsed_page = self
159 .document
160 .get_page(page_idx as u32)
161 .map_err(|e| OperationError::ParseError(e.to_string()))?;
162
163 let page = self.convert_page(&parsed_page)?;
164 doc.add_page(page);
165 }
166
167 doc.save(output_path)?;
169
170 Ok(())
171 }
172
173 fn convert_page(&mut self, parsed_page: &ParsedPage) -> OperationResult<Page> {
175 let width = parsed_page.width();
177 let height = parsed_page.height();
178 let mut page = Page::new(width, height);
179
180 if parsed_page.rotation != 0 {
182 page.set_rotation(parsed_page.rotation);
183 }
184
185 let content_streams = self
187 .document
188 .get_page_content_streams(parsed_page)
189 .map_err(|e| OperationError::ParseError(e.to_string()))?;
190
191 let mut has_content = false;
193 for stream_data in &content_streams {
194 match ContentParser::parse_content(stream_data) {
195 Ok(operators) => {
196 self.process_operators(&mut page, &operators)?;
198 has_content = true;
199 }
200 Err(e) => {
201 eprintln!("Warning: Failed to parse content stream: {e}");
203 }
204 }
205 }
206
207 if !has_content {
209 page.text()
210 .set_font(crate::text::Font::Helvetica, 10.0)
211 .at(50.0, height - 50.0)
212 .write("[Page extracted - content reconstruction in progress]")
213 .map_err(OperationError::PdfError)?;
214 }
215
216 Ok(page)
217 }
218
219 fn process_operators(
221 &self,
222 page: &mut Page,
223 operators: &[ContentOperation],
224 ) -> OperationResult<()> {
225 let mut text_object = false;
227 let mut current_font = crate::text::Font::Helvetica;
228 let mut current_font_size = 12.0;
229 let mut current_x = 0.0;
230 let mut current_y = 0.0;
231
232 for operator in operators {
233 match operator {
234 ContentOperation::BeginText => {
235 text_object = true;
236 }
237 ContentOperation::EndText => {
238 text_object = false;
239 }
240 ContentOperation::SetFont(name, size) => {
241 current_font = match name.as_str() {
243 "Times-Roman" => crate::text::Font::TimesRoman,
244 "Times-Bold" => crate::text::Font::TimesBold,
245 "Times-Italic" => crate::text::Font::TimesItalic,
246 "Times-BoldItalic" => crate::text::Font::TimesBoldItalic,
247 "Helvetica-Bold" => crate::text::Font::HelveticaBold,
248 "Helvetica-Oblique" => crate::text::Font::HelveticaOblique,
249 "Helvetica-BoldOblique" => crate::text::Font::HelveticaBoldOblique,
250 "Courier" => crate::text::Font::Courier,
251 "Courier-Bold" => crate::text::Font::CourierBold,
252 "Courier-Oblique" => crate::text::Font::CourierOblique,
253 "Courier-BoldOblique" => crate::text::Font::CourierBoldOblique,
254 _ => crate::text::Font::Helvetica, };
256 current_font_size = *size;
257 }
258 ContentOperation::MoveText(tx, ty) => {
259 current_x += tx;
260 current_y += ty;
261 }
262 ContentOperation::ShowText(text_bytes) => {
263 if text_object {
264 if let Ok(text) = String::from_utf8(text_bytes.clone()) {
266 page.text()
267 .set_font(current_font.clone(), current_font_size as f64)
268 .at(current_x as f64, current_y as f64)
269 .write(&text)
270 .map_err(OperationError::PdfError)?;
271 }
272 }
273 }
274 ContentOperation::Rectangle(x, y, width, height) => {
275 page.graphics()
276 .rect(*x as f64, *y as f64, *width as f64, *height as f64);
277 }
278 ContentOperation::MoveTo(x, y) => {
279 page.graphics().move_to(*x as f64, *y as f64);
280 }
281 ContentOperation::LineTo(x, y) => {
282 page.graphics().line_to(*x as f64, *y as f64);
283 }
284 ContentOperation::Stroke => {
285 page.graphics().stroke();
286 }
287 ContentOperation::Fill => {
288 page.graphics().fill();
289 }
290 ContentOperation::SetNonStrokingRGB(r, g, b) => {
291 page.graphics().set_fill_color(crate::graphics::Color::Rgb(
292 *r as f64, *g as f64, *b as f64,
293 ));
294 }
295 ContentOperation::SetStrokingRGB(r, g, b) => {
296 page.graphics()
297 .set_stroke_color(crate::graphics::Color::Rgb(
298 *r as f64, *g as f64, *b as f64,
299 ));
300 }
301 ContentOperation::SetLineWidth(width) => {
302 page.graphics().set_line_width(*width as f64);
303 }
304 _ => {
306 }
308 }
309 }
310
311 Ok(())
312 }
313
314 fn format_output_path(&self, index: usize, range: &PageRange) -> PathBuf {
316 let filename = match range {
317 PageRange::Single(page) => self
318 .options
319 .output_pattern
320 .replace("{}", &(page + 1).to_string())
321 .replace("{n}", &(index + 1).to_string())
322 .replace("{page}", &(page + 1).to_string()),
323 PageRange::Range(start, end) => self
324 .options
325 .output_pattern
326 .replace("{}", &format!("{}-{}", start + 1, end + 1))
327 .replace("{n}", &(index + 1).to_string())
328 .replace("{start}", &(start + 1).to_string())
329 .replace("{end}", &(end + 1).to_string()),
330 _ => self
331 .options
332 .output_pattern
333 .replace("{}", &(index + 1).to_string())
334 .replace("{n}", &(index + 1).to_string()),
335 };
336
337 PathBuf::from(filename)
338 }
339}
340
341pub fn split_pdf<P: AsRef<Path>>(
343 input_path: P,
344 options: SplitOptions,
345) -> OperationResult<Vec<PathBuf>> {
346 let document = PdfReader::open_document(input_path)
347 .map_err(|e| OperationError::ParseError(e.to_string()))?;
348
349 let mut splitter = PdfSplitter::new(document, options);
350 splitter.split()
351}
352
353pub fn split_into_pages<P: AsRef<Path>>(
355 input_path: P,
356 output_pattern: &str,
357) -> OperationResult<Vec<PathBuf>> {
358 let options = SplitOptions {
359 mode: SplitMode::SinglePages,
360 output_pattern: output_pattern.to_string(),
361 ..Default::default()
362 };
363
364 split_pdf(input_path, options)
365}
366
#[cfg(test)]
mod tests {
    use super::*;

    /// Defaults: single pages, `page_{}.pdf`, metadata kept, no optimization.
    #[test]
    fn test_split_options_default() {
        let SplitOptions {
            mode,
            output_pattern,
            preserve_metadata,
            optimize,
        } = SplitOptions::default();

        assert!(matches!(mode, SplitMode::SinglePages));
        assert_eq!(output_pattern, "page_{}.pdf");
        assert!(preserve_metadata);
        assert!(!optimize);
    }

    /// Only constructs options and attempts to open a reader; nothing is asserted.
    #[test]
    fn test_format_output_path() {
        let _options = SplitOptions {
            output_pattern: String::from("output_page_{}.pdf"),
            ..SplitOptions::default()
        };

        let _reader = PdfReader::open("test.pdf");
    }

    /// Every `SplitMode` variant can be built and matched.
    #[test]
    fn test_split_mode_variants() {
        assert!(matches!(SplitMode::SinglePages, SplitMode::SinglePages));

        let explicit = SplitMode::Ranges(vec![PageRange::Single(0), PageRange::Range(5, 10)]);
        assert!(matches!(explicit, SplitMode::Ranges(_)));

        match SplitMode::ChunkSize(5) {
            SplitMode::ChunkSize(size) => assert_eq!(size, 5),
            _ => panic!("Expected ChunkSize"),
        }

        let at_points = SplitMode::SplitAt(vec![5, 10, 15]);
        assert!(matches!(at_points, SplitMode::SplitAt(_)));
    }

    /// Options built field by field keep the values they were given.
    #[test]
    fn test_split_options_with_modes() {
        let options = SplitOptions {
            mode: SplitMode::ChunkSize(10),
            output_pattern: String::from("chunk_{}.pdf"),
            preserve_metadata: true,
            optimize: true,
        };

        assert!(matches!(options.mode, SplitMode::ChunkSize(10)));
        assert_eq!(options.output_pattern, "chunk_{}.pdf");
        assert!(options.preserve_metadata);
        assert!(options.optimize);
    }

    /// `Ranges` mode stores every kind of `PageRange` it is given.
    #[test]
    fn test_split_options_page_range() {
        let options = SplitOptions {
            mode: SplitMode::Ranges(vec![
                PageRange::All,
                PageRange::Single(5),
                PageRange::Range(10, 20),
                PageRange::List(vec![1, 3, 5, 7, 9]),
            ]),
            ..SplitOptions::default()
        };

        match options.mode {
            SplitMode::Ranges(stored) => assert_eq!(stored.len(), 4),
            _ => panic!("Expected Ranges mode"),
        }
    }

    /// `SplitAt` mode stores the split points unchanged.
    #[test]
    fn test_split_options_split_at() {
        let split_points = vec![3, 6, 9, 12];
        let options = SplitOptions {
            mode: SplitMode::SplitAt(split_points.clone()),
            output_pattern: String::from("part_{}.pdf"),
            ..SplitOptions::default()
        };

        match options.mode {
            SplitMode::SplitAt(points) => {
                assert_eq!(points.len(), 4);
                assert_eq!(points, split_points);
            }
            _ => panic!("Expected SplitAt mode"),
        }
    }

    /// Patterns containing a placeholder are stored as given.
    #[test]
    fn test_output_pattern_formatting() {
        for pattern in [
            "output_{}.pdf",
            "page_{}.pdf",
            "document_part_{}.pdf",
            "{}_split.pdf",
        ] {
            let options = SplitOptions {
                output_pattern: pattern.to_string(),
                ..SplitOptions::default()
            };
            assert!(options.output_pattern.contains('{'));
        }
    }

    /// The metadata flag can be switched on and off.
    #[test]
    fn test_split_options_preserve_metadata() {
        let keeps = SplitOptions {
            preserve_metadata: true,
            ..SplitOptions::default()
        };
        assert!(keeps.preserve_metadata);

        let drops = SplitOptions {
            preserve_metadata: false,
            ..SplitOptions::default()
        };
        assert!(!drops.preserve_metadata);
    }

    /// Single-page mode combined with a custom pattern.
    #[test]
    fn test_split_single_pages_mode() {
        let options = SplitOptions {
            mode: SplitMode::SinglePages,
            output_pattern: String::from("page_{:04}.pdf"),
            ..SplitOptions::default()
        };

        assert!(matches!(options.mode, SplitMode::SinglePages));
        assert!(options.output_pattern.contains('{'));
    }

    /// Positive chunk sizes are stored unchanged.
    #[test]
    fn test_split_chunk_size_validation() {
        for size in [1, 5, 10, 50, 100] {
            let options = SplitOptions {
                mode: SplitMode::ChunkSize(size),
                ..SplitOptions::default()
            };

            if let SplitMode::ChunkSize(stored) = options.mode {
                assert_eq!(stored, size);
                assert!(stored > 0);
            }
        }
    }

    /// The optimize flag can be switched on and off.
    #[test]
    fn test_split_options_optimization() {
        let enabled = SplitOptions {
            optimize: true,
            ..SplitOptions::default()
        };
        assert!(enabled.optimize);

        let disabled = SplitOptions {
            optimize: false,
            ..SplitOptions::default()
        };
        assert!(!disabled.optimize);
    }

    /// A custom output pattern is stored verbatim.
    #[test]
    fn test_split_options_with_custom_pattern() {
        let options = SplitOptions {
            output_pattern: String::from("document_part_{}.pdf"),
            ..SplitOptions::default()
        };
        assert_eq!(options.output_pattern, "document_part_{}.pdf");
    }

    /// `Ranges` mode keeps the ranges in their original order.
    #[test]
    fn test_split_mode_ranges() {
        let ranges = vec![
            PageRange::Single(0),
            PageRange::Range(1, 3),
            PageRange::Single(5),
        ];
        let mode = SplitMode::Ranges(ranges.clone());

        if let SplitMode::Ranges(stored) = mode {
            assert_eq!(stored.len(), 3);
            assert!(matches!(stored[0], PageRange::Single(0)));
            assert!(matches!(stored[1], PageRange::Range(1, 3)));
            assert!(matches!(stored[2], PageRange::Single(5)));
        } else {
            panic!("Wrong mode");
        }
    }

    /// `SplitAt` mode keeps the points it was built from.
    #[test]
    fn test_split_mode_split_at() {
        let split_points = vec![5, 10, 15];
        let mode = SplitMode::SplitAt(split_points.clone());

        if let SplitMode::SplitAt(points) = mode {
            assert_eq!(points, split_points);
        } else {
            panic!("Wrong mode");
        }
    }

    /// 1-based textual page specifications parse into 0-based ranges.
    #[test]
    fn test_page_range_parse() {
        let everything = PageRange::parse("all").unwrap();
        assert!(matches!(everything, PageRange::All));

        let single = PageRange::parse("5").unwrap();
        assert!(matches!(single, PageRange::Single(4)));

        let span = PageRange::parse("3-7").unwrap();
        assert!(matches!(span, PageRange::Range(2, 6)));

        let listed = PageRange::parse("1,3,5").unwrap();
        if let PageRange::List(pages) = listed {
            assert_eq!(pages, vec![0, 2, 4]);
        } else {
            panic!("Expected List");
        }
    }

    /// Empty, non-numeric, inverted and zero page specifications are rejected.
    #[test]
    fn test_page_range_invalid_parse() {
        for input in ["", "abc", "5-3", "0"] {
            assert!(PageRange::parse(input).is_err());
        }
    }

    /// All four option fields can be set explicitly.
    #[test]
    fn test_split_options_all_fields() {
        let options = SplitOptions {
            mode: SplitMode::ChunkSize(5),
            output_pattern: String::from("chunk_{}.pdf"),
            preserve_metadata: false,
            optimize: true,
        };

        if let SplitMode::ChunkSize(size) = options.mode {
            assert_eq!(size, 5);
        } else {
            panic!("Wrong mode");
        }
        assert_eq!(options.output_pattern, "chunk_{}.pdf");
        assert!(!options.preserve_metadata);
        assert!(options.optimize);
    }

    /// Very small and very large chunk sizes are stored unchanged.
    #[test]
    fn test_split_mode_chunk_size_edge_cases() {
        if let SplitMode::ChunkSize(size) = SplitMode::ChunkSize(1) {
            assert_eq!(size, 1);
        } else {
            panic!("Wrong mode");
        }

        if let SplitMode::ChunkSize(size) = SplitMode::ChunkSize(1000) {
            assert_eq!(size, 1000);
        } else {
            panic!("Wrong mode");
        }
    }

    /// `Ranges` mode accepts an empty list.
    #[test]
    fn test_split_mode_empty_ranges() {
        let mode = SplitMode::Ranges(Vec::new());

        if let SplitMode::Ranges(stored) = mode {
            assert!(stored.is_empty());
        } else {
            panic!("Wrong mode");
        }
    }

    /// `SplitAt` mode accepts an empty list.
    #[test]
    fn test_split_mode_empty_split_points() {
        let mode = SplitMode::SplitAt(Vec::new());

        if let SplitMode::SplitAt(points) = mode {
            assert!(points.is_empty());
        } else {
            panic!("Wrong mode");
        }
    }
}
681
682#[cfg(test)]
683#[path = "split_tests.rs"]
684mod split_tests;