1use crate::parser::content::{ContentOperation, ContentParser};
51use crate::parser::{ParseError, PdfDocument};
52use std::fmt;
53
/// Coarse direction class of a line segment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineOrientation {
    /// Both endpoints lie at (nearly) the same y coordinate.
    Horizontal,
    /// Both endpoints lie at (nearly) the same x coordinate.
    Vertical,
    /// Neither horizontal nor vertical.
    Diagonal,
}
64
65#[derive(Debug, Clone, PartialEq)]
67pub struct VectorLine {
68 pub x1: f64,
70 pub y1: f64,
72 pub x2: f64,
74 pub y2: f64,
76 pub orientation: LineOrientation,
78 pub stroke_width: f64,
80 pub is_stroked: bool,
82}
83
84impl VectorLine {
85 pub fn new(x1: f64, y1: f64, x2: f64, y2: f64, stroke_width: f64, is_stroked: bool) -> Self {
98 let orientation = Self::compute_orientation(x1, y1, x2, y2);
99 Self {
100 x1,
101 y1,
102 x2,
103 y2,
104 orientation,
105 stroke_width,
106 is_stroked,
107 }
108 }
109
110 fn compute_orientation(x1: f64, y1: f64, x2: f64, y2: f64) -> LineOrientation {
114 const TOLERANCE: f64 = 0.1;
115
116 let dx = (x2 - x1).abs();
117 let dy = (y2 - y1).abs();
118
119 if dy < TOLERANCE {
120 LineOrientation::Horizontal
121 } else if dx < TOLERANCE {
122 LineOrientation::Vertical
123 } else {
124 LineOrientation::Diagonal
125 }
126 }
127
128 pub fn length(&self) -> f64 {
130 let dx = self.x2 - self.x1;
131 let dy = self.y2 - self.y1;
132 (dx * dx + dy * dy).sqrt()
133 }
134
135 pub fn midpoint(&self) -> (f64, f64) {
137 ((self.x1 + self.x2) / 2.0, (self.y1 + self.y2) / 2.0)
138 }
139}
140
141#[derive(Debug, Clone, Default)]
143pub struct ExtractedGraphics {
144 pub lines: Vec<VectorLine>,
146 pub horizontal_count: usize,
148 pub vertical_count: usize,
150}
151
152impl ExtractedGraphics {
153 pub fn new() -> Self {
155 Self::default()
156 }
157
158 pub fn add_line(&mut self, line: VectorLine) {
160 match line.orientation {
161 LineOrientation::Horizontal => self.horizontal_count += 1,
162 LineOrientation::Vertical => self.vertical_count += 1,
163 LineOrientation::Diagonal => {} }
165 self.lines.push(line);
166 }
167
168 pub fn horizontal_lines(&self) -> impl Iterator<Item = &VectorLine> {
170 self.lines
171 .iter()
172 .filter(|l| l.orientation == LineOrientation::Horizontal)
173 }
174
175 pub fn vertical_lines(&self) -> impl Iterator<Item = &VectorLine> {
177 self.lines
178 .iter()
179 .filter(|l| l.orientation == LineOrientation::Vertical)
180 }
181
182 pub fn has_table_structure(&self) -> bool {
186 self.horizontal_count >= 2 && self.vertical_count >= 2
187 }
188}
189
/// Filters that decide which candidate lines an extractor keeps.
#[derive(Debug, Clone)]
pub struct ExtractionConfig {
    /// Path segments shorter than this length are discarded.
    pub min_line_length: f64,
    /// When `false`, diagonal path segments are discarded.
    pub extract_diagonals: bool,
    /// When `true`, paths painted by a fill operator are not extracted.
    pub stroked_only: bool,
}

impl Default for ExtractionConfig {
    /// Defaults: minimum length `1.0`, diagonals excluded, stroked paths only.
    fn default() -> Self {
        const DEFAULT_MIN_LINE_LENGTH: f64 = 1.0;

        Self {
            min_line_length: DEFAULT_MIN_LINE_LENGTH,
            extract_diagonals: false,
            stroked_only: true,
        }
    }
}
210
211pub struct GraphicsExtractor {
213 config: ExtractionConfig,
214}
215
216impl GraphicsExtractor {
217 pub fn new(config: ExtractionConfig) -> Self {
219 Self { config }
220 }
221
222 pub fn default() -> Self {
224 Self::new(ExtractionConfig::default())
225 }
226
227 pub fn config(&self) -> &ExtractionConfig {
229 &self.config
230 }
231
232 pub fn extract_from_page<R: std::io::Read + std::io::Seek>(
247 &mut self,
248 document: &PdfDocument<R>,
249 page_index: usize,
250 ) -> Result<ExtractedGraphics, ExtractionError> {
251 let page = document
253 .get_page(page_index as u32)
254 .map_err(|e| ExtractionError::ParseError(format!("Failed to get page: {}", e)))?;
255
256 let streams = document
258 .get_page_content_streams(&page)
259 .map_err(|e| ExtractionError::ParseError(format!("Failed to get content: {}", e)))?;
260
261 let mut graphics = ExtractedGraphics::new();
262 let mut state = GraphicsState::new();
263
264 for stream in streams {
266 let operations = ContentParser::parse(&stream).map_err(|e| {
267 ExtractionError::ParseError(format!("Failed to parse content: {}", e))
268 })?;
269
270 self.process_operations(&operations, &mut state, &mut graphics)?;
271 }
272
273 Ok(graphics)
274 }
275
276 fn process_operations(
278 &self,
279 operations: &[ContentOperation],
280 state: &mut GraphicsState,
281 graphics: &mut ExtractedGraphics,
282 ) -> Result<(), ExtractionError> {
283 for op in operations {
284 match op {
285 ContentOperation::SaveGraphicsState => state.save(),
287 ContentOperation::RestoreGraphicsState => state.restore(),
288 ContentOperation::SetLineWidth(w) => state.stroke_width = *w as f64,
289 ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
290 state.apply_transform(
291 *a as f64, *b as f64, *c as f64, *d as f64, *e as f64, *f as f64,
292 );
293 }
294
295 ContentOperation::MoveTo(x, y) => {
297 let (tx, ty) = state.transform_point(*x as f64, *y as f64);
298 state.move_to(tx, ty);
299 }
300 ContentOperation::LineTo(x, y) => {
301 let (tx, ty) = state.transform_point(*x as f64, *y as f64);
302 state.line_to(tx, ty);
303 }
304 ContentOperation::Rectangle(x, y, width, height) => {
305 self.extract_rectangle_lines(
306 *x as f64,
307 *y as f64,
308 *width as f64,
309 *height as f64,
310 state,
311 graphics,
312 );
313 }
314 ContentOperation::ClosePath => {
315 state.close_path();
316 }
317
318 ContentOperation::Stroke | ContentOperation::CloseStroke => {
320 self.extract_path_lines(state, graphics, true);
321 state.clear_path();
322 }
323 ContentOperation::Fill | ContentOperation::FillEvenOdd => {
324 if !self.config.stroked_only {
325 self.extract_path_lines(state, graphics, false);
326 }
327 state.clear_path();
328 }
329
330 _ => {} }
332 }
333
334 Ok(())
335 }
336
337 fn extract_rectangle_lines(
341 &self,
342 x: f64,
343 y: f64,
344 width: f64,
345 height: f64,
346 state: &GraphicsState,
347 graphics: &mut ExtractedGraphics,
348 ) {
349 let stroke_width = state.stroke_width;
350
351 let (x1, y1) = state.transform_point(x, y); let (x2, y2) = state.transform_point(x + width, y); let (x3, y3) = state.transform_point(x + width, y + height); let (x4, y4) = state.transform_point(x, y + height); graphics.add_line(VectorLine::new(x1, y1, x2, y2, stroke_width, true));
359
360 graphics.add_line(VectorLine::new(x2, y2, x3, y3, stroke_width, true));
362
363 graphics.add_line(VectorLine::new(x3, y3, x4, y4, stroke_width, true));
365
366 graphics.add_line(VectorLine::new(x4, y4, x1, y1, stroke_width, true));
368 }
369
370 fn extract_path_lines(
372 &self,
373 state: &GraphicsState,
374 graphics: &mut ExtractedGraphics,
375 is_stroked: bool,
376 ) {
377 let stroke_width = state.stroke_width;
378
379 for segment in &state.path {
380 let PathSegment::Line { x1, y1, x2, y2 } = segment;
381 let line = VectorLine::new(*x1, *y1, *x2, *y2, stroke_width, is_stroked);
382
383 if self.config.stroked_only && !is_stroked {
385 continue;
386 }
387
388 if line.length() < self.config.min_line_length {
389 continue;
390 }
391
392 if !self.config.extract_diagonals && line.orientation == LineOrientation::Diagonal {
393 continue;
394 }
395
396 graphics.add_line(line);
397 }
398 }
399}
400
/// Drawing state tracked while replaying a content stream.
struct GraphicsState {
    /// Current transformation matrix stored as `[a, b, c, d, e, f]`.
    ctm: [f64; 6],
    /// Current stroke width.
    stroke_width: f64,
    /// Line segments accumulated since the path was last cleared.
    path: Vec<PathSegment>,
    /// Point the next `line_to` starts from, if any.
    current_point: Option<(f64, f64)>,
    /// Start of the subpath currently being built (set by `move_to`).
    subpath_start: Option<(f64, f64)>,
    /// States pushed by `save` and not yet popped by `restore`.
    state_stack: Vec<SavedState>,
}

/// The part of [`GraphicsState`] that `save`/`restore` preserve.
#[derive(Clone)]
struct SavedState {
    ctm: [f64; 6],
    stroke_width: f64,
}

/// One element of the path under construction.
#[derive(Debug, Clone)]
enum PathSegment {
    /// Straight segment from `(x1, y1)` to `(x2, y2)`.
    Line { x1: f64, y1: f64, x2: f64, y2: f64 },
}

impl GraphicsState {
    /// Creates a state with an identity CTM, stroke width `1.0`, and no path.
    fn new() -> Self {
        Self {
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            stroke_width: 1.0,
            path: Vec::new(),
            current_point: None,
            subpath_start: None,
            state_stack: Vec::new(),
        }
    }

    /// Pushes the CTM and stroke width onto the state stack.
    ///
    /// The path under construction is not saved.
    fn save(&mut self) {
        self.state_stack.push(SavedState {
            ctm: self.ctm,
            stroke_width: self.stroke_width,
        });
    }

    /// Pops the most recently saved CTM and stroke width.
    ///
    /// An unbalanced restore (empty stack) is ignored.
    fn restore(&mut self) {
        if let Some(saved) = self.state_stack.pop() {
            self.ctm = saved.ctm;
            self.stroke_width = saved.stroke_width;
        }
    }

    /// Concatenates the matrix `[a, b, c, d, e, f]` with the CTM.
    ///
    /// The new matrix is multiplied on the left (`new_ctm = M x ctm`), so it
    /// is applied to a point before the previously accumulated transform.
    fn apply_transform(&mut self, a: f64, b: f64, c: f64, d: f64, e: f64, f: f64) {
        let [a0, b0, c0, d0, e0, f0] = self.ctm;
        self.ctm = [
            a * a0 + b * c0,
            a * b0 + b * d0,
            c * a0 + d * c0,
            c * b0 + d * d0,
            e * a0 + f * c0 + e0,
            e * b0 + f * d0 + f0,
        ];
    }

    /// Maps `(x, y)` through the CTM and returns the transformed point.
    fn transform_point(&self, x: f64, y: f64) -> (f64, f64) {
        let [a, b, c, d, e, f] = self.ctm;
        let tx = a * x + c * y + e;
        let ty = b * x + d * y + f;
        (tx, ty)
    }

    /// Starts a new subpath at `(x, y)`.
    fn move_to(&mut self, x: f64, y: f64) {
        self.current_point = Some((x, y));
        // Remember where this subpath began so `close_path` can return to it.
        self.subpath_start = Some((x, y));
    }

    /// Appends a segment from the current point to `(x, y)`.
    ///
    /// Does nothing when there is no current point.
    fn line_to(&mut self, x: f64, y: f64) {
        if let Some((x1, y1)) = self.current_point {
            self.path.push(PathSegment::Line {
                x1,
                y1,
                x2: x,
                y2: y,
            });
            self.current_point = Some((x, y));
        }
    }

    /// Closes the current subpath with a segment back to its starting point.
    ///
    /// No segment is added when the current point already coincides with the
    /// start (within a small epsilon), or when no subpath has been started.
    /// The target is the start of the *current* subpath, not the first
    /// segment of the whole path, so paths with several `move_to` calls close
    /// correctly.
    fn close_path(&mut self) {
        const EPSILON: f64 = 0.01;

        let (start, current) = match (self.subpath_start, self.current_point) {
            (Some(start), Some(current)) => (start, current),
            _ => return,
        };
        let (start_x, start_y) = start;
        let (x, y) = current;

        if (x - start_x).abs() > EPSILON || (y - start_y).abs() > EPSILON {
            self.path.push(PathSegment::Line {
                x1: x,
                y1: y,
                x2: start_x,
                y2: start_y,
            });
        }
        // After closing, drawing continues from the subpath's start.
        self.current_point = Some(start);
    }

    /// Discards the path under construction and forgets the current point.
    fn clear_path(&mut self) {
        self.path.clear();
        self.current_point = None;
        self.subpath_start = None;
    }
}
517
518#[derive(Debug)]
520pub enum ExtractionError {
521 InvalidOperator(String),
523 InvalidOperand(String),
525 IoError(std::io::Error),
527 ParseError(String),
529}
530
531impl fmt::Display for ExtractionError {
532 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
533 match self {
534 Self::InvalidOperator(op) => write!(f, "Invalid graphics operator: {}", op),
535 Self::InvalidOperand(msg) => write!(f, "Invalid operand: {}", msg),
536 Self::IoError(e) => write!(f, "I/O error: {}", e),
537 Self::ParseError(msg) => write!(f, "Parse error: {}", msg),
538 }
539 }
540}
541
542impl std::error::Error for ExtractionError {
543 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
544 match self {
545 Self::IoError(e) => Some(e),
546 _ => None,
547 }
548 }
549}
550
551impl From<std::io::Error> for ExtractionError {
552 fn from(err: std::io::Error) -> Self {
553 Self::IoError(err)
554 }
555}
556
557impl From<ParseError> for ExtractionError {
558 fn from(err: ParseError) -> Self {
559 Self::ParseError(format!("{}", err))
560 }
561}
562
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance used by most floating-point comparisons below.
    const TIGHT: f64 = 0.001;
    /// Looser tolerance used when inspecting path segments.
    const LOOSE: f64 = 0.01;

    /// Returns `true` when `actual` is within `tolerance` of `expected`.
    fn near(actual: f64, expected: f64, tolerance: f64) -> bool {
        (actual - expected).abs() < tolerance
    }

    /// Builds a stroked line of width 1.0 between the two points.
    fn stroked(x1: f64, y1: f64, x2: f64, y2: f64) -> VectorLine {
        VectorLine::new(x1, y1, x2, y2, 1.0, true)
    }

    /// Builds a graphics state whose CTM is the given matrix.
    fn state_with_ctm(a: f64, b: f64, c: f64, d: f64, e: f64, f: f64) -> GraphicsState {
        let mut state = GraphicsState::new();
        state.apply_transform(a, b, c, d, e, f);
        state
    }

    #[test]
    fn test_line_orientation_horizontal() {
        assert_eq!(
            stroked(100.0, 200.0, 300.0, 200.0).orientation,
            LineOrientation::Horizontal
        );
    }

    #[test]
    fn test_line_orientation_vertical() {
        assert_eq!(
            stroked(100.0, 200.0, 100.0, 400.0).orientation,
            LineOrientation::Vertical
        );
    }

    #[test]
    fn test_line_orientation_diagonal() {
        assert_eq!(
            stroked(100.0, 200.0, 300.0, 400.0).orientation,
            LineOrientation::Diagonal
        );
    }

    #[test]
    fn test_line_orientation_tolerance() {
        // A 0.05 drift stays inside the orientation tolerance.
        let nearly_horizontal = stroked(100.0, 200.0, 300.0, 200.05);
        assert_eq!(nearly_horizontal.orientation, LineOrientation::Horizontal);

        let nearly_vertical = stroked(100.0, 200.0, 100.05, 400.0);
        assert_eq!(nearly_vertical.orientation, LineOrientation::Vertical);
    }

    #[test]
    fn test_line_length() {
        // 3-4-5 right triangle.
        assert!(near(stroked(0.0, 0.0, 3.0, 4.0).length(), 5.0, TIGHT));
    }

    #[test]
    fn test_line_midpoint() {
        let (mx, my) = stroked(100.0, 200.0, 300.0, 400.0).midpoint();
        assert!(near(mx, 200.0, TIGHT));
        assert!(near(my, 300.0, TIGHT));
    }

    #[test]
    fn test_extracted_graphics_add_line() {
        let mut graphics = ExtractedGraphics::new();

        for line in [
            stroked(0.0, 0.0, 100.0, 0.0),   // horizontal
            stroked(0.0, 0.0, 0.0, 100.0),   // vertical
            stroked(0.0, 0.0, 100.0, 100.0), // diagonal
        ] {
            graphics.add_line(line);
        }

        assert_eq!(graphics.horizontal_count, 1);
        assert_eq!(graphics.vertical_count, 1);
        assert_eq!(graphics.lines.len(), 3);
    }

    #[test]
    fn test_extracted_graphics_iterators() {
        let mut graphics = ExtractedGraphics::new();

        for line in [
            stroked(0.0, 0.0, 100.0, 0.0),     // horizontal
            stroked(0.0, 0.0, 0.0, 100.0),     // vertical
            stroked(0.0, 100.0, 100.0, 100.0), // horizontal
        ] {
            graphics.add_line(line);
        }

        assert_eq!(graphics.horizontal_lines().count(), 2);
        assert_eq!(graphics.vertical_lines().count(), 1);
    }

    #[test]
    fn test_has_table_structure() {
        let mut graphics = ExtractedGraphics::new();

        // Empty: no structure.
        assert!(!graphics.has_table_structure());

        // Two horizontals and one vertical: still not enough.
        graphics.add_line(stroked(0.0, 0.0, 100.0, 0.0));
        graphics.add_line(stroked(0.0, 100.0, 100.0, 100.0));
        graphics.add_line(stroked(0.0, 0.0, 0.0, 100.0));
        assert!(!graphics.has_table_structure());

        // The second vertical completes the grid.
        graphics.add_line(stroked(100.0, 0.0, 100.0, 100.0));
        assert!(graphics.has_table_structure());
    }

    #[test]
    fn test_extraction_config_default() {
        let ExtractionConfig {
            min_line_length,
            extract_diagonals,
            stroked_only,
        } = ExtractionConfig::default();

        assert_eq!(min_line_length, 1.0);
        assert!(!extract_diagonals);
        assert!(stroked_only);
    }

    #[test]
    fn test_ctm_transform_point_identity() {
        let (tx, ty) = GraphicsState::new().transform_point(100.0, 200.0);
        assert!(near(tx, 100.0, TIGHT));
        assert!(near(ty, 200.0, TIGHT));
    }

    #[test]
    fn test_ctm_transform_point_translation() {
        let state = state_with_ctm(1.0, 0.0, 0.0, 1.0, 50.0, 75.0);

        let (tx, ty) = state.transform_point(100.0, 200.0);
        assert!(near(tx, 150.0, TIGHT));
        assert!(near(ty, 275.0, TIGHT));
    }

    #[test]
    fn test_ctm_transform_point_scale() {
        let state = state_with_ctm(2.0, 0.0, 0.0, 2.0, 0.0, 0.0);

        let (tx, ty) = state.transform_point(100.0, 200.0);
        assert!(near(tx, 200.0, TIGHT));
        assert!(near(ty, 400.0, TIGHT));
    }

    #[test]
    fn test_ctm_transform_point_combined() {
        let state = state_with_ctm(2.0, 0.0, 0.0, 2.0, 10.0, 20.0);

        let (tx, ty) = state.transform_point(5.0, 5.0);
        assert!(near(tx, 20.0, TIGHT));
        assert!(near(ty, 30.0, TIGHT));
    }

    #[test]
    fn test_graphics_state_save_restore() {
        let mut state = state_with_ctm(2.0, 0.0, 0.0, 2.0, 10.0, 20.0);
        state.stroke_width = 2.0;

        state.save();
        state.stroke_width = 5.0;
        state.apply_transform(1.0, 0.0, 0.0, 1.0, 50.0, 50.0);

        state.restore();
        assert_eq!(state.stroke_width, 2.0);

        // The CTM is back to scale-by-2 plus (10, 20).
        let (tx, ty) = state.transform_point(5.0, 5.0);
        assert!(near(tx, 20.0, TIGHT));
        assert!(near(ty, 30.0, TIGHT));
    }

    #[test]
    fn test_graphics_state_nested_save_restore() {
        let mut state = GraphicsState::new();
        state.stroke_width = 2.0;

        state.save();
        state.stroke_width = 5.0;

        state.save();
        state.stroke_width = 10.0;

        state.restore();
        assert_eq!(state.stroke_width, 5.0);

        state.restore();
        assert_eq!(state.stroke_width, 2.0);

        // One restore too many must leave the state untouched.
        state.restore();
        assert_eq!(state.stroke_width, 2.0);
    }

    #[test]
    fn test_close_path_creates_closing_line() {
        let mut state = GraphicsState::new();

        state.move_to(100.0, 100.0);
        state.line_to(200.0, 100.0);
        state.line_to(200.0, 200.0);
        state.close_path();

        // Two explicit segments plus the closing one.
        assert_eq!(state.path.len(), 3);

        let PathSegment::Line { x1, y1, x2, y2 } = &state.path[2];
        assert!(near(*x1, 200.0, LOOSE));
        assert!(near(*y1, 200.0, LOOSE));
        assert!(near(*x2, 100.0, LOOSE));
        assert!(near(*y2, 100.0, LOOSE));
    }

    #[test]
    fn test_close_path_no_duplicate_if_already_closed() {
        let mut state = GraphicsState::new();

        state.move_to(100.0, 100.0);
        for &(x, y) in &[
            (200.0, 100.0),
            (200.0, 200.0),
            (100.0, 200.0),
            (100.0, 100.0), // back at the start
        ] {
            state.line_to(x, y);
        }
        state.close_path();

        assert_eq!(state.path.len(), 4);
    }
}