use crate::parser::content::{ContentOperation, ContentParser};
use crate::parser::{ParseError, PdfDocument};
use std::fmt;
/// Orientation class assigned to a [`VectorLine`] from the offsets between its end points.
///
/// `VectorLine::new` derives the value; the classification checks the vertical
/// offset first, so a segment whose end points nearly coincide is `Horizontal`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineOrientation {
/// The end points differ in `y` by less than the classification tolerance.
Horizontal,
/// Not horizontal, and the end points differ in `x` by less than the tolerance.
Vertical,
/// Neither horizontal nor vertical.
Diagonal,
}
/// A straight line segment together with the paint attributes recorded for it.
#[derive(Debug, Clone, PartialEq)]
pub struct VectorLine {
/// X coordinate of the first end point.
pub x1: f64,
/// Y coordinate of the first end point.
pub y1: f64,
/// X coordinate of the second end point.
pub x2: f64,
/// Y coordinate of the second end point.
pub y2: f64,
/// Orientation derived from the end points when the line is built with `VectorLine::new`.
pub orientation: LineOrientation,
/// Line width supplied when the line was built (presumably PDF user-space units — TODO confirm).
pub stroke_width: f64,
/// Whether the line was recorded as produced by a stroking operation.
pub is_stroked: bool,
/// Colour recorded for the line, if one was supplied.
pub color: Option<crate::graphics::Color>,
}
impl VectorLine {
    /// Builds a line from `(x1, y1)` to `(x2, y2)` and classifies its orientation.
    ///
    /// `stroke_width`, `is_stroked` and `color` are stored unchanged.
    pub fn new(
        x1: f64,
        y1: f64,
        x2: f64,
        y2: f64,
        stroke_width: f64,
        is_stroked: bool,
        color: Option<crate::graphics::Color>,
    ) -> Self {
        Self {
            x1,
            y1,
            x2,
            y2,
            orientation: Self::compute_orientation(x1, y1, x2, y2),
            stroke_width,
            is_stroked,
            color,
        }
    }

    /// Classifies the segment by comparing its absolute x/y extents against a
    /// fixed tolerance of `0.1`.
    ///
    /// The vertical extent is tested first, so a near-degenerate segment
    /// (both extents below the tolerance) is reported as horizontal.
    fn compute_orientation(x1: f64, y1: f64, x2: f64, y2: f64) -> LineOrientation {
        const TOLERANCE: f64 = 0.1;

        let horizontal_extent = (x2 - x1).abs();
        let vertical_extent = (y2 - y1).abs();

        match (
            vertical_extent < TOLERANCE,
            horizontal_extent < TOLERANCE,
        ) {
            (true, _) => LineOrientation::Horizontal,
            (false, true) => LineOrientation::Vertical,
            (false, false) => LineOrientation::Diagonal,
        }
    }

    /// Euclidean distance between the two end points.
    pub fn length(&self) -> f64 {
        let (dx, dy) = (self.x2 - self.x1, self.y2 - self.y1);
        (dx * dx + dy * dy).sqrt()
    }

    /// Point halfway between the two end points, as `(x, y)`.
    pub fn midpoint(&self) -> (f64, f64) {
        let mid_x = (self.x1 + self.x2) / 2.0;
        let mid_y = (self.y1 + self.y2) / 2.0;
        (mid_x, mid_y)
    }
}
/// Lines collected from a page, with running counts of the horizontal and vertical ones.
///
/// All fields are public; the counts only stay in step with `lines` when lines
/// are added through `add_line`.
#[derive(Debug, Clone, Default)]
pub struct ExtractedGraphics {
/// Every line added so far, in insertion order.
pub lines: Vec<VectorLine>,
/// Number of lines added through `add_line` whose orientation was `Horizontal`.
pub horizontal_count: usize,
/// Number of lines added through `add_line` whose orientation was `Vertical`.
pub vertical_count: usize,
}
impl ExtractedGraphics {
pub fn new() -> Self {
Self::default()
}
pub fn add_line(&mut self, line: VectorLine) {
match line.orientation {
LineOrientation::Horizontal => self.horizontal_count += 1,
LineOrientation::Vertical => self.vertical_count += 1,
LineOrientation::Diagonal => {} }
self.lines.push(line);
}
pub fn horizontal_lines(&self) -> impl Iterator<Item = &VectorLine> {
self.lines
.iter()
.filter(|l| l.orientation == LineOrientation::Horizontal)
}
pub fn vertical_lines(&self) -> impl Iterator<Item = &VectorLine> {
self.lines
.iter()
.filter(|l| l.orientation == LineOrientation::Vertical)
}
pub fn has_table_structure(&self) -> bool {
self.horizontal_count >= 2 && self.vertical_count >= 2
}
}
/// Options that decide which line segments an extractor keeps.
#[derive(Debug, Clone)]
pub struct ExtractionConfig {
    /// Minimum length a path segment must have to be kept.
    pub min_line_length: f64,
    /// When `false`, path segments classified as diagonal are dropped.
    pub extract_diagonals: bool,
    /// When `true`, segments of filled (non-stroked) paths are dropped.
    pub stroked_only: bool,
}

impl Default for ExtractionConfig {
    /// Defaults: segments of at least `1.0`, no diagonals, stroked paths only.
    fn default() -> Self {
        const DEFAULT_MIN_LINE_LENGTH: f64 = 1.0;

        ExtractionConfig {
            min_line_length: DEFAULT_MIN_LINE_LENGTH,
            extract_diagonals: false,
            stroked_only: true,
        }
    }
}
/// Extracts straight line segments from the content streams of PDF pages.
///
/// The extractor only holds its [`ExtractionConfig`]; it derives `Debug` so it
/// can be embedded in other `Debug` types and shown in diagnostics.
#[derive(Debug)]
pub struct GraphicsExtractor {
    /// Filters applied to every candidate line.
    config: ExtractionConfig,
}
impl GraphicsExtractor {
pub fn new(config: ExtractionConfig) -> Self {
Self { config }
}
pub fn default() -> Self {
Self::new(ExtractionConfig::default())
}
pub fn config(&self) -> &ExtractionConfig {
&self.config
}
pub fn extract_from_page<R: std::io::Read + std::io::Seek>(
&mut self,
document: &PdfDocument<R>,
page_index: usize,
) -> Result<ExtractedGraphics, ExtractionError> {
let page = document
.get_page(page_index as u32)
.map_err(|e| ExtractionError::ParseError(format!("Failed to get page: {}", e)))?;
let streams = document
.get_page_content_streams(&page)
.map_err(|e| ExtractionError::ParseError(format!("Failed to get content: {}", e)))?;
let mut graphics = ExtractedGraphics::new();
let mut state = GraphicsState::new();
for stream in streams {
let operations = ContentParser::parse(&stream).map_err(|e| {
ExtractionError::ParseError(format!("Failed to parse content: {}", e))
})?;
self.process_operations(&operations, &mut state, &mut graphics)?;
}
Ok(graphics)
}
fn process_operations(
&self,
operations: &[ContentOperation],
state: &mut GraphicsState,
graphics: &mut ExtractedGraphics,
) -> Result<(), ExtractionError> {
for op in operations {
match op {
ContentOperation::SaveGraphicsState => state.save(),
ContentOperation::RestoreGraphicsState => state.restore(),
ContentOperation::SetLineWidth(w) => state.stroke_width = *w as f64,
ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
state.apply_transform(
*a as f64, *b as f64, *c as f64, *d as f64, *e as f64, *f as f64,
);
}
ContentOperation::SetStrokingGray(gray) => {
state.stroke_color = Some(crate::graphics::Color::gray(*gray as f64));
}
ContentOperation::SetStrokingRGB(r, g, b) => {
state.stroke_color =
Some(crate::graphics::Color::rgb(*r as f64, *g as f64, *b as f64));
}
ContentOperation::SetStrokingCMYK(c, m, y, k) => {
state.stroke_color = Some(crate::graphics::Color::cmyk(
*c as f64, *m as f64, *y as f64, *k as f64,
));
}
ContentOperation::MoveTo(x, y) => {
let (tx, ty) = state.transform_point(*x as f64, *y as f64);
state.move_to(tx, ty);
}
ContentOperation::LineTo(x, y) => {
let (tx, ty) = state.transform_point(*x as f64, *y as f64);
state.line_to(tx, ty);
}
ContentOperation::Rectangle(x, y, width, height) => {
self.extract_rectangle_lines(
*x as f64,
*y as f64,
*width as f64,
*height as f64,
state,
graphics,
);
}
ContentOperation::ClosePath => {
state.close_path();
}
ContentOperation::Stroke | ContentOperation::CloseStroke => {
self.extract_path_lines(state, graphics, true);
state.clear_path();
}
ContentOperation::Fill | ContentOperation::FillEvenOdd => {
if !self.config.stroked_only {
self.extract_path_lines(state, graphics, false);
}
state.clear_path();
}
_ => {} }
}
Ok(())
}
fn extract_rectangle_lines(
&self,
x: f64,
y: f64,
width: f64,
height: f64,
state: &GraphicsState,
graphics: &mut ExtractedGraphics,
) {
let stroke_width = state.stroke_width;
let (x1, y1) = state.transform_point(x, y); let (x2, y2) = state.transform_point(x + width, y); let (x3, y3) = state.transform_point(x + width, y + height); let (x4, y4) = state.transform_point(x, y + height);
graphics.add_line(VectorLine::new(x1, y1, x2, y2, stroke_width, true, None));
graphics.add_line(VectorLine::new(x2, y2, x3, y3, stroke_width, true, None));
graphics.add_line(VectorLine::new(x3, y3, x4, y4, stroke_width, true, None));
graphics.add_line(VectorLine::new(x4, y4, x1, y1, stroke_width, true, None));
}
fn extract_path_lines(
&self,
state: &GraphicsState,
graphics: &mut ExtractedGraphics,
is_stroked: bool,
) {
let stroke_width = state.stroke_width;
for segment in &state.path {
let PathSegment::Line { x1, y1, x2, y2 } = segment;
let line = VectorLine::new(
*x1,
*y1,
*x2,
*y2,
stroke_width,
is_stroked,
state.stroke_color,
);
if self.config.stroked_only && !is_stroked {
continue;
}
if line.length() < self.config.min_line_length {
continue;
}
if !self.config.extract_diagonals && line.orientation == LineOrientation::Diagonal {
continue;
}
graphics.add_line(line);
}
}
}
/// Interpreter state tracked while walking the content operations of a page.
struct GraphicsState {
    /// Current transformation matrix as `[a, b, c, d, e, f]`.
    ctm: [f64; 6],
    /// Current line width.
    stroke_width: f64,
    /// Current stroke colour, if one has been set.
    stroke_color: Option<crate::graphics::Color>,
    /// Segments of the path under construction, in the coordinates that were
    /// passed to `move_to` / `line_to`.
    path: Vec<PathSegment>,
    /// Point from which the next segment will start, if any.
    current_point: Option<(f64, f64)>,
    /// Starting point of the subpath currently being built (set by `move_to`).
    subpath_start: Option<(f64, f64)>,
    /// Snapshots pushed by `save` and popped by `restore`.
    state_stack: Vec<SavedState>,
}

/// The part of [`GraphicsState`] that `save` / `restore` preserve.
///
/// The path under construction and the current point are deliberately not
/// part of the snapshot.
#[derive(Clone)]
struct SavedState {
    ctm: [f64; 6],
    stroke_width: f64,
    stroke_color: Option<crate::graphics::Color>,
}

/// One element of the path under construction.
#[derive(Debug, Clone)]
enum PathSegment {
    /// Straight segment from `(x1, y1)` to `(x2, y2)`.
    Line { x1: f64, y1: f64, x2: f64, y2: f64 },
}

impl GraphicsState {
    /// Creates the initial state: identity matrix, line width `1.0`, no stroke
    /// colour, empty path and empty save stack.
    fn new() -> Self {
        Self {
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            stroke_width: 1.0,
            stroke_color: None,
            path: Vec::new(),
            current_point: None,
            subpath_start: None,
            state_stack: Vec::new(),
        }
    }

    /// Pushes the matrix, line width and stroke colour onto the save stack.
    fn save(&mut self) {
        self.state_stack.push(SavedState {
            ctm: self.ctm,
            stroke_width: self.stroke_width,
            stroke_color: self.stroke_color,
        });
    }

    /// Pops the most recent snapshot; does nothing when the stack is empty.
    fn restore(&mut self) {
        if let Some(saved) = self.state_stack.pop() {
            self.ctm = saved.ctm;
            self.stroke_width = saved.stroke_width;
            self.stroke_color = saved.stroke_color;
        }
    }

    /// Replaces the matrix with `M x CTM`, where `M = [a, b, c, d, e, f]`, so
    /// the newly supplied matrix is applied to points before the previous one.
    fn apply_transform(&mut self, a: f64, b: f64, c: f64, d: f64, e: f64, f: f64) {
        let [a0, b0, c0, d0, e0, f0] = self.ctm;
        self.ctm = [
            a * a0 + b * c0,
            a * b0 + b * d0,
            c * a0 + d * c0,
            c * b0 + d * d0,
            e * a0 + f * c0 + e0,
            e * b0 + f * d0 + f0,
        ];
    }

    /// Maps `(x, y)` through the current transformation matrix.
    fn transform_point(&self, x: f64, y: f64) -> (f64, f64) {
        let [a, b, c, d, e, f] = self.ctm;
        let tx = a * x + c * y + e;
        let ty = b * x + d * y + f;
        (tx, ty)
    }

    /// Begins a new subpath at `(x, y)` without adding a segment.
    fn move_to(&mut self, x: f64, y: f64) {
        self.current_point = Some((x, y));
        // Remember where this subpath starts so `close_path` can return here
        // even when earlier subpaths already contributed segments.
        self.subpath_start = Some((x, y));
    }

    /// Appends a segment from the current point to `(x, y)` and moves the
    /// current point there. Ignored when there is no current point.
    fn line_to(&mut self, x: f64, y: f64) {
        if let Some((x1, y1)) = self.current_point {
            self.path.push(PathSegment::Line {
                x1,
                y1,
                x2: x,
                y2: y,
            });
            self.current_point = Some((x, y));
        }
    }

    /// Closes the current subpath with a segment back to its starting point.
    ///
    /// The target is the start of the *current* subpath, not the first point
    /// of the whole path, so paths made of several subpaths close correctly.
    /// No segment is added when the current point is already within `0.01` of
    /// the start on both axes. Does nothing when no subpath has been started.
    fn close_path(&mut self) {
        if let (Some((start_x, start_y)), Some((x, y))) = (self.subpath_start, self.current_point)
        {
            const EPSILON: f64 = 0.01;
            if (x - start_x).abs() > EPSILON || (y - start_y).abs() > EPSILON {
                self.path.push(PathSegment::Line {
                    x1: x,
                    y1: y,
                    x2: start_x,
                    y2: start_y,
                });
            }
            // After closing, drawing continues from the start of the subpath.
            self.current_point = Some((start_x, start_y));
        }
    }

    /// Discards the path under construction together with the current point
    /// and the subpath start.
    fn clear_path(&mut self) {
        self.path.clear();
        self.current_point = None;
        self.subpath_start = None;
    }
}
/// Errors reported while extracting vector graphics.
#[derive(Debug)]
pub enum ExtractionError {
    /// A graphics operator was rejected; the payload is shown after the
    /// "Invalid graphics operator" prefix.
    InvalidOperator(String),
    /// An operand was rejected; the payload is the message to display.
    InvalidOperand(String),
    /// An underlying I/O failure, also exposed through `Error::source`.
    IoError(std::io::Error),
    /// A failure obtaining or parsing page content, carried as text.
    ParseError(String),
}

impl fmt::Display for ExtractionError {
    /// Writes a fixed per-variant prefix followed by the variant's payload.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            ExtractionError::InvalidOperator(operator) => {
                f.write_fmt(format_args!("Invalid graphics operator: {}", operator))
            }
            ExtractionError::InvalidOperand(detail) => {
                f.write_fmt(format_args!("Invalid operand: {}", detail))
            }
            ExtractionError::IoError(cause) => f.write_fmt(format_args!("I/O error: {}", cause)),
            ExtractionError::ParseError(detail) => {
                f.write_fmt(format_args!("Parse error: {}", detail))
            }
        }
    }
}

impl std::error::Error for ExtractionError {
    /// Only the I/O variant wraps another error; every other variant has no source.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        if let ExtractionError::IoError(cause) = self {
            Some(cause)
        } else {
            None
        }
    }
}

impl From<std::io::Error> for ExtractionError {
    /// Wraps an I/O error so it can be propagated with `?`.
    fn from(cause: std::io::Error) -> Self {
        ExtractionError::IoError(cause)
    }
}
impl From<ParseError> for ExtractionError {
fn from(err: ParseError) -> Self {
Self::ParseError(format!("{}", err))
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance used for most coordinate comparisons.
    const TIGHT: f64 = 0.001;
    /// Looser tolerance used by the path-closing checks.
    const LOOSE: f64 = 0.01;

    /// Shorthand for a stroked, colourless line of width `1.0`.
    fn unit_line(x1: f64, y1: f64, x2: f64, y2: f64) -> VectorLine {
        VectorLine::new(x1, y1, x2, y2, 1.0, true, None)
    }

    /// Asserts that `actual` lies strictly within `tolerance` of `expected`.
    fn assert_close(actual: f64, expected: f64, tolerance: f64) {
        assert!(
            (actual - expected).abs() < tolerance,
            "{} is not within {} of {}",
            actual,
            tolerance,
            expected
        );
    }

    /// Collects `lines` into a fresh `ExtractedGraphics` via `add_line`.
    fn graphics_from(lines: Vec<VectorLine>) -> ExtractedGraphics {
        let mut graphics = ExtractedGraphics::new();
        for line in lines {
            graphics.add_line(line);
        }
        graphics
    }

    /// Applies `matrix` to a fresh state and returns the image of `point`.
    fn transformed(matrix: [f64; 6], point: (f64, f64)) -> (f64, f64) {
        let [a, b, c, d, e, f] = matrix;
        let mut state = GraphicsState::new();
        state.apply_transform(a, b, c, d, e, f);
        state.transform_point(point.0, point.1)
    }

    #[test]
    fn test_line_orientation_horizontal() {
        assert_eq!(
            unit_line(100.0, 200.0, 300.0, 200.0).orientation,
            LineOrientation::Horizontal
        );
    }

    #[test]
    fn test_line_orientation_vertical() {
        assert_eq!(
            unit_line(100.0, 200.0, 100.0, 400.0).orientation,
            LineOrientation::Vertical
        );
    }

    #[test]
    fn test_line_orientation_diagonal() {
        assert_eq!(
            unit_line(100.0, 200.0, 300.0, 400.0).orientation,
            LineOrientation::Diagonal
        );
    }

    #[test]
    fn test_line_orientation_tolerance() {
        // Offsets of 0.05 stay below the classification tolerance.
        let nearly_flat = unit_line(100.0, 200.0, 300.0, 200.05);
        assert_eq!(nearly_flat.orientation, LineOrientation::Horizontal);

        let nearly_upright = unit_line(100.0, 200.0, 100.05, 400.0);
        assert_eq!(nearly_upright.orientation, LineOrientation::Vertical);
    }

    #[test]
    fn test_line_length() {
        // 3-4-5 right triangle.
        assert_close(unit_line(0.0, 0.0, 3.0, 4.0).length(), 5.0, TIGHT);
    }

    #[test]
    fn test_line_midpoint() {
        let (mid_x, mid_y) = unit_line(100.0, 200.0, 300.0, 400.0).midpoint();
        assert_close(mid_x, 200.0, TIGHT);
        assert_close(mid_y, 300.0, TIGHT);
    }

    #[test]
    fn test_extracted_graphics_add_line() {
        let graphics = graphics_from(vec![
            unit_line(0.0, 0.0, 100.0, 0.0),
            unit_line(0.0, 0.0, 0.0, 100.0),
            unit_line(0.0, 0.0, 100.0, 100.0),
        ]);

        assert_eq!(graphics.horizontal_count, 1);
        assert_eq!(graphics.vertical_count, 1);
        assert_eq!(graphics.lines.len(), 3);
    }

    #[test]
    fn test_extracted_graphics_iterators() {
        let graphics = graphics_from(vec![
            unit_line(0.0, 0.0, 100.0, 0.0),
            unit_line(0.0, 0.0, 0.0, 100.0),
            unit_line(0.0, 100.0, 100.0, 100.0),
        ]);

        assert_eq!(graphics.horizontal_lines().count(), 2);
        assert_eq!(graphics.vertical_lines().count(), 1);
    }

    #[test]
    fn test_has_table_structure() {
        let mut graphics = ExtractedGraphics::new();
        assert!(!graphics.has_table_structure());

        // Two horizontal lines and one vertical line are not enough.
        for line in vec![
            unit_line(0.0, 0.0, 100.0, 0.0),
            unit_line(0.0, 100.0, 100.0, 100.0),
            unit_line(0.0, 0.0, 0.0, 100.0),
        ] {
            graphics.add_line(line);
        }
        assert!(!graphics.has_table_structure());

        // The second vertical line completes the structure.
        graphics.add_line(unit_line(100.0, 0.0, 100.0, 100.0));
        assert!(graphics.has_table_structure());
    }

    #[test]
    fn test_extraction_config_default() {
        let ExtractionConfig {
            min_line_length,
            extract_diagonals,
            stroked_only,
        } = ExtractionConfig::default();

        assert_eq!(min_line_length, 1.0);
        assert!(!extract_diagonals);
        assert!(stroked_only);
    }

    #[test]
    fn test_ctm_transform_point_identity() {
        let (tx, ty) = GraphicsState::new().transform_point(100.0, 200.0);
        assert_close(tx, 100.0, TIGHT);
        assert_close(ty, 200.0, TIGHT);
    }

    #[test]
    fn test_ctm_transform_point_translation() {
        let (tx, ty) = transformed([1.0, 0.0, 0.0, 1.0, 50.0, 75.0], (100.0, 200.0));
        assert_close(tx, 150.0, TIGHT);
        assert_close(ty, 275.0, TIGHT);
    }

    #[test]
    fn test_ctm_transform_point_scale() {
        let (tx, ty) = transformed([2.0, 0.0, 0.0, 2.0, 0.0, 0.0], (100.0, 200.0));
        assert_close(tx, 200.0, TIGHT);
        assert_close(ty, 400.0, TIGHT);
    }

    #[test]
    fn test_ctm_transform_point_combined() {
        let (tx, ty) = transformed([2.0, 0.0, 0.0, 2.0, 10.0, 20.0], (5.0, 5.0));
        assert_close(tx, 20.0, TIGHT);
        assert_close(ty, 30.0, TIGHT);
    }

    #[test]
    fn test_graphics_state_save_restore() {
        let mut state = GraphicsState::new();
        state.stroke_width = 2.0;
        state.apply_transform(2.0, 0.0, 0.0, 2.0, 10.0, 20.0);

        state.save();
        state.stroke_width = 5.0;
        state.apply_transform(1.0, 0.0, 0.0, 1.0, 50.0, 50.0);
        state.restore();

        // Both the line width and the matrix are back to the saved values.
        assert_eq!(state.stroke_width, 2.0);
        let (tx, ty) = state.transform_point(5.0, 5.0);
        assert_close(tx, 20.0, TIGHT);
        assert_close(ty, 30.0, TIGHT);
    }

    #[test]
    fn test_graphics_state_nested_save_restore() {
        let mut state = GraphicsState::new();

        state.stroke_width = 2.0;
        state.save();
        state.stroke_width = 5.0;
        state.save();
        state.stroke_width = 10.0;

        state.restore();
        assert_eq!(state.stroke_width, 5.0);
        state.restore();
        assert_eq!(state.stroke_width, 2.0);

        // Restoring with an empty stack leaves the state untouched.
        state.restore();
        assert_eq!(state.stroke_width, 2.0);
    }

    #[test]
    fn test_close_path_creates_closing_line() {
        let mut state = GraphicsState::new();
        state.move_to(100.0, 100.0);
        state.line_to(200.0, 100.0);
        state.line_to(200.0, 200.0);
        state.close_path();

        assert_eq!(state.path.len(), 3);

        let PathSegment::Line { x1, y1, x2, y2 } = &state.path[2];
        assert_close(*x1, 200.0, LOOSE);
        assert_close(*y1, 200.0, LOOSE);
        assert_close(*x2, 100.0, LOOSE);
        assert_close(*y2, 100.0, LOOSE);
    }

    #[test]
    fn test_close_path_no_duplicate_if_already_closed() {
        let mut state = GraphicsState::new();
        state.move_to(100.0, 100.0);
        for &(x, y) in &[
            (200.0, 100.0),
            (200.0, 200.0),
            (100.0, 200.0),
            (100.0, 100.0),
        ] {
            state.line_to(x, y);
        }
        state.close_path();

        // The path already ends at its start, so no fifth segment is added.
        assert_eq!(state.path.len(), 4);
    }
}