1use super::{OperationError, OperationResult, PageRange};
7use crate::parser::page_tree::ParsedPage;
8use crate::parser::{ContentOperation, ContentParser, PdfDocument, PdfReader};
9use crate::{Document, Page};
10use std::fs::File;
11use std::path::{Path, PathBuf};
12
13#[derive(Debug, Clone)]
15pub struct SplitOptions {
16 pub mode: SplitMode,
18 pub output_pattern: String,
20 pub preserve_metadata: bool,
22 pub optimize: bool,
24}
25
26impl Default for SplitOptions {
27 fn default() -> Self {
28 Self {
29 mode: SplitMode::SinglePages,
30 output_pattern: "page_{}.pdf".to_string(),
31 preserve_metadata: true,
32 optimize: false,
33 }
34 }
35}
36
37#[derive(Debug, Clone)]
39pub enum SplitMode {
40 SinglePages,
42 Ranges(Vec<PageRange>),
44 ChunkSize(usize),
46 SplitAt(Vec<usize>),
48}
49
/// Splits a parsed PDF document into several output files.
pub struct PdfSplitter {
    /// Parsed source document that pages, content streams and metadata are
    /// read from.
    document: PdfDocument<File>,
    /// Settings selecting the split mode, output naming and metadata
    /// handling.
    options: SplitOptions,
}
55
56impl PdfSplitter {
57 pub fn new(document: PdfDocument<File>, options: SplitOptions) -> Self {
59 Self { document, options }
60 }
61
62 pub fn split(&mut self) -> OperationResult<Vec<PathBuf>> {
64 let total_pages =
65 self.document
66 .page_count()
67 .map_err(|e| OperationError::ParseError(e.to_string()))? as usize;
68
69 if total_pages == 0 {
70 return Err(OperationError::NoPagesToProcess);
71 }
72
73 let ranges = match &self.options.mode {
74 SplitMode::SinglePages => {
75 (0..total_pages).map(PageRange::Single).collect()
77 }
78 SplitMode::Ranges(ranges) => ranges.clone(),
79 SplitMode::ChunkSize(size) => {
80 let mut ranges = Vec::new();
82 let mut start = 0;
83 while start < total_pages {
84 let end = (start + size - 1).min(total_pages - 1);
85 ranges.push(PageRange::Range(start, end));
86 start += size;
87 }
88 ranges
89 }
90 SplitMode::SplitAt(split_points) => {
91 let mut ranges = Vec::new();
93 let mut start = 0;
94
95 for &split_point in split_points {
96 if split_point > 0 && split_point < total_pages {
97 ranges.push(PageRange::Range(start, split_point - 1));
98 start = split_point;
99 }
100 }
101
102 if start < total_pages {
104 ranges.push(PageRange::Range(start, total_pages - 1));
105 }
106
107 ranges
108 }
109 };
110
111 let mut output_files = Vec::new();
113
114 for (index, range) in ranges.iter().enumerate() {
115 let output_path = self.format_output_path(index, range);
116 self.extract_range(range, &output_path)?;
117 output_files.push(output_path);
118 }
119
120 Ok(output_files)
121 }
122
123 fn extract_range(&mut self, range: &PageRange, output_path: &Path) -> OperationResult<()> {
125 let total_pages =
126 self.document
127 .page_count()
128 .map_err(|e| OperationError::ParseError(e.to_string()))? as usize;
129
130 let indices = range.get_indices(total_pages)?;
131 if indices.is_empty() {
132 return Err(OperationError::NoPagesToProcess);
133 }
134
135 let mut doc = Document::new();
137
138 if self.options.preserve_metadata {
140 if let Ok(metadata) = self.document.metadata() {
141 if let Some(title) = metadata.title {
142 doc.set_title(&title);
143 }
144 if let Some(author) = metadata.author {
145 doc.set_author(&author);
146 }
147 if let Some(subject) = metadata.subject {
148 doc.set_subject(&subject);
149 }
150 if let Some(keywords) = metadata.keywords {
151 doc.set_keywords(&keywords);
152 }
153 }
154 }
155
156 for &page_idx in &indices {
158 let parsed_page = self
159 .document
160 .get_page(page_idx as u32)
161 .map_err(|e| OperationError::ParseError(e.to_string()))?;
162
163 let page = self.convert_page(&parsed_page)?;
164 doc.add_page(page);
165 }
166
167 doc.save(output_path)?;
169
170 Ok(())
171 }
172
173 fn convert_page(&mut self, parsed_page: &ParsedPage) -> OperationResult<Page> {
175 let width = parsed_page.width();
177 let height = parsed_page.height();
178 let mut page = Page::new(width, height);
179
180 if parsed_page.rotation != 0 {
182 }
185
186 let content_streams = self
188 .document
189 .get_page_content_streams(parsed_page)
190 .map_err(|e| OperationError::ParseError(e.to_string()))?;
191
192 let mut has_content = false;
194 for stream_data in &content_streams {
195 match ContentParser::parse_content(stream_data) {
196 Ok(operators) => {
197 self.process_operators(&mut page, &operators)?;
199 has_content = true;
200 }
201 Err(e) => {
202 eprintln!("Warning: Failed to parse content stream: {e}");
204 }
205 }
206 }
207
208 if !has_content {
210 page.text()
211 .set_font(crate::text::Font::Helvetica, 10.0)
212 .at(50.0, height - 50.0)
213 .write("[Page extracted - content reconstruction in progress]")
214 .map_err(OperationError::PdfError)?;
215 }
216
217 Ok(page)
218 }
219
220 fn process_operators(
222 &self,
223 page: &mut Page,
224 operators: &[ContentOperation],
225 ) -> OperationResult<()> {
226 let mut text_object = false;
228 let mut current_font = crate::text::Font::Helvetica;
229 let mut current_font_size = 12.0;
230 let mut current_x = 0.0;
231 let mut current_y = 0.0;
232
233 for operator in operators {
234 match operator {
235 ContentOperation::BeginText => {
236 text_object = true;
237 }
238 ContentOperation::EndText => {
239 text_object = false;
240 }
241 ContentOperation::SetFont(name, size) => {
242 current_font = match name.as_str() {
244 "Times-Roman" => crate::text::Font::TimesRoman,
245 "Times-Bold" => crate::text::Font::TimesBold,
246 "Times-Italic" => crate::text::Font::TimesItalic,
247 "Times-BoldItalic" => crate::text::Font::TimesBoldItalic,
248 "Helvetica-Bold" => crate::text::Font::HelveticaBold,
249 "Helvetica-Oblique" => crate::text::Font::HelveticaOblique,
250 "Helvetica-BoldOblique" => crate::text::Font::HelveticaBoldOblique,
251 "Courier" => crate::text::Font::Courier,
252 "Courier-Bold" => crate::text::Font::CourierBold,
253 "Courier-Oblique" => crate::text::Font::CourierOblique,
254 "Courier-BoldOblique" => crate::text::Font::CourierBoldOblique,
255 _ => crate::text::Font::Helvetica, };
257 current_font_size = *size;
258 }
259 ContentOperation::MoveText(tx, ty) => {
260 current_x += tx;
261 current_y += ty;
262 }
263 ContentOperation::ShowText(text_bytes) => {
264 if text_object {
265 if let Ok(text) = String::from_utf8(text_bytes.clone()) {
267 page.text()
268 .set_font(current_font, current_font_size as f64)
269 .at(current_x as f64, current_y as f64)
270 .write(&text)
271 .map_err(OperationError::PdfError)?;
272 }
273 }
274 }
275 ContentOperation::Rectangle(x, y, width, height) => {
276 page.graphics()
277 .rect(*x as f64, *y as f64, *width as f64, *height as f64);
278 }
279 ContentOperation::MoveTo(x, y) => {
280 page.graphics().move_to(*x as f64, *y as f64);
281 }
282 ContentOperation::LineTo(x, y) => {
283 page.graphics().line_to(*x as f64, *y as f64);
284 }
285 ContentOperation::Stroke => {
286 page.graphics().stroke();
287 }
288 ContentOperation::Fill => {
289 page.graphics().fill();
290 }
291 ContentOperation::SetNonStrokingRGB(r, g, b) => {
292 page.graphics().set_fill_color(crate::graphics::Color::Rgb(
293 *r as f64, *g as f64, *b as f64,
294 ));
295 }
296 ContentOperation::SetStrokingRGB(r, g, b) => {
297 page.graphics()
298 .set_stroke_color(crate::graphics::Color::Rgb(
299 *r as f64, *g as f64, *b as f64,
300 ));
301 }
302 ContentOperation::SetLineWidth(width) => {
303 page.graphics().set_line_width(*width as f64);
304 }
305 _ => {
307 }
309 }
310 }
311
312 Ok(())
313 }
314
315 fn format_output_path(&self, index: usize, range: &PageRange) -> PathBuf {
317 let filename = match range {
318 PageRange::Single(page) => self
319 .options
320 .output_pattern
321 .replace("{}", &(page + 1).to_string())
322 .replace("{n}", &(index + 1).to_string())
323 .replace("{page}", &(page + 1).to_string()),
324 PageRange::Range(start, end) => self
325 .options
326 .output_pattern
327 .replace("{}", &format!("{}-{}", start + 1, end + 1))
328 .replace("{n}", &(index + 1).to_string())
329 .replace("{start}", &(start + 1).to_string())
330 .replace("{end}", &(end + 1).to_string()),
331 _ => self
332 .options
333 .output_pattern
334 .replace("{}", &(index + 1).to_string())
335 .replace("{n}", &(index + 1).to_string()),
336 };
337
338 PathBuf::from(filename)
339 }
340}
341
342pub fn split_pdf<P: AsRef<Path>>(
344 input_path: P,
345 options: SplitOptions,
346) -> OperationResult<Vec<PathBuf>> {
347 let document = PdfReader::open_document(input_path)
348 .map_err(|e| OperationError::ParseError(e.to_string()))?;
349
350 let mut splitter = PdfSplitter::new(document, options);
351 splitter.split()
352}
353
354pub fn split_into_pages<P: AsRef<Path>>(
356 input_path: P,
357 output_pattern: &str,
358) -> OperationResult<Vec<PathBuf>> {
359 let options = SplitOptions {
360 mode: SplitMode::SinglePages,
361 output_pattern: output_pattern.to_string(),
362 ..Default::default()
363 };
364
365 split_pdf(input_path, options)
366}
367
#[cfg(test)]
mod tests {
    use super::*;

    /// The default options split into single pages named `page_{}.pdf`,
    /// keep metadata and do not optimise.
    #[test]
    fn test_split_options_default() {
        let options = SplitOptions::default();
        assert!(matches!(options.mode, SplitMode::SinglePages));
        assert_eq!(options.output_pattern, "page_{}.pdf");
        assert!(options.preserve_metadata);
        assert!(!options.optimize);
    }

    /// A custom output pattern is stored verbatim and leaves the remaining
    /// defaults untouched.
    ///
    /// Exercising `PdfSplitter::format_output_path` itself needs a parsed
    /// `PdfDocument<File>`; this test deliberately avoids touching the file
    /// system, so it only checks the configuration that feeds the formatter.
    #[test]
    fn test_format_output_path() {
        let options = SplitOptions {
            output_pattern: "output_page_{}.pdf".to_string(),
            ..Default::default()
        };

        assert_eq!(options.output_pattern, "output_page_{}.pdf");
        assert!(matches!(options.mode, SplitMode::SinglePages));
        assert!(options.preserve_metadata);
        assert!(!options.optimize);
    }
}