Skip to main content

oxidize_pdf/pipeline/
element.rs

1use std::cmp::Ordering;
2
3#[cfg(feature = "semantic")]
4use serde::{Deserialize, Serialize};
5
/// A typed document element extracted from a PDF page.
///
/// Each variant carries its specific data plus shared [`ElementMetadata`]
/// for page number, bounding box, confidence, and optional font info.
///
/// When the `semantic` feature is enabled the enum derives serde's
/// `Serialize`/`Deserialize` with a `"type"` tag holding the snake_case
/// variant name (the same strings returned by `Element::type_name`).
#[derive(Debug, Clone)]
#[cfg_attr(
    feature = "semantic",
    derive(Serialize, Deserialize),
    serde(tag = "type", rename_all = "snake_case")
)]
pub enum Element {
    /// Title text; text-based variant backed by [`ElementData`].
    Title(ElementData),
    /// Paragraph text; text-based variant backed by [`ElementData`].
    Paragraph(ElementData),
    /// Tabular data stored as rows of cell strings.
    Table(TableElementData),
    /// Header text; text-based variant backed by [`ElementData`].
    Header(ElementData),
    /// Footer text; text-based variant backed by [`ElementData`].
    Footer(ElementData),
    /// List item text; text-based variant backed by [`ElementData`].
    ListItem(ElementData),
    /// Image, carrying only optional alt text plus metadata.
    Image(ImageElementData),
    /// Code block text; text-based variant backed by [`ElementData`].
    CodeBlock(ElementData),
    /// Key/value pair.
    KeyValue(KeyValueElementData),
}
27
28impl Element {
29    /// Returns the primary text content of this element.
30    pub fn text(&self) -> &str {
31        match self {
32            Self::Title(d)
33            | Self::Paragraph(d)
34            | Self::Header(d)
35            | Self::Footer(d)
36            | Self::ListItem(d)
37            | Self::CodeBlock(d) => &d.text,
38            Self::Table(t) => {
39                // Tables don't have a single text — return empty.
40                // Use row_count()/cell() for structured access.
41                let _ = t;
42                ""
43            }
44            Self::Image(img) => img.alt_text.as_deref().unwrap_or(""),
45            Self::KeyValue(kv) => &kv.value,
46        }
47    }
48
49    /// Returns a human-readable text representation of this element.
50    ///
51    /// Unlike [`text()`](Self::text) which returns raw content (empty for tables,
52    /// value-only for KV pairs), this method produces a complete display form:
53    /// - Tables: pipe-separated rows
54    /// - Key-Value: "key: value"
55    /// - All others: same as `text()`
56    pub fn display_text(&self) -> String {
57        match self {
58            Self::Table(t) => t
59                .rows
60                .iter()
61                .map(|row| row.join(" | "))
62                .collect::<Vec<_>>()
63                .join("\n"),
64            Self::Image(img) => img.alt_text.clone().unwrap_or_default(),
65            Self::KeyValue(kv) => format!("{}: {}", kv.key, kv.value),
66            _ => self.text().to_string(),
67        }
68    }
69
70    /// Returns the page number (0-indexed) where this element appears.
71    pub fn page(&self) -> u32 {
72        self.metadata().page
73    }
74
75    /// Returns the bounding box of this element on the page.
76    pub fn bbox(&self) -> &ElementBBox {
77        &self.metadata().bbox
78    }
79
80    /// Returns the full metadata for this element.
81    pub fn metadata(&self) -> &ElementMetadata {
82        match self {
83            Self::Title(d)
84            | Self::Paragraph(d)
85            | Self::Header(d)
86            | Self::Footer(d)
87            | Self::ListItem(d)
88            | Self::CodeBlock(d) => &d.metadata,
89            Self::Table(t) => &t.metadata,
90            Self::Image(img) => &img.metadata,
91            Self::KeyValue(kv) => &kv.metadata,
92        }
93    }
94
95    /// Returns a mutable reference to the metadata of this element.
96    pub fn metadata_mut(&mut self) -> &mut ElementMetadata {
97        match self {
98            Self::Title(d)
99            | Self::Paragraph(d)
100            | Self::Header(d)
101            | Self::Footer(d)
102            | Self::ListItem(d)
103            | Self::CodeBlock(d) => &mut d.metadata,
104            Self::Table(t) => &mut t.metadata,
105            Self::Image(img) => &mut img.metadata,
106            Self::KeyValue(kv) => &mut kv.metadata,
107        }
108    }
109
110    /// Returns the snake_case type name of this element variant.
111    ///
112    /// Useful for logging, serialization, and metadata tagging.
113    /// Returns one of: `"title"`, `"paragraph"`, `"table"`, `"header"`,
114    /// `"footer"`, `"list_item"`, `"image"`, `"code_block"`, `"key_value"`.
115    pub fn type_name(&self) -> &'static str {
116        match self {
117            Self::Title(_) => "title",
118            Self::Paragraph(_) => "paragraph",
119            Self::Table(_) => "table",
120            Self::Header(_) => "header",
121            Self::Footer(_) => "footer",
122            Self::ListItem(_) => "list_item",
123            Self::Image(_) => "image",
124            Self::CodeBlock(_) => "code_block",
125            Self::KeyValue(_) => "key_value",
126        }
127    }
128
129    /// Set the parent heading for this element.
130    pub fn set_parent_heading(&mut self, heading: Option<String>) {
131        self.metadata_mut().parent_heading = heading;
132    }
133
134    /// Returns the number of rows if this is a Table element.
135    pub fn row_count(&self) -> Option<usize> {
136        match self {
137            Self::Table(t) => Some(t.rows.len()),
138            _ => None,
139        }
140    }
141
142    /// Returns the number of columns if this is a Table element.
143    pub fn column_count(&self) -> Option<usize> {
144        match self {
145            Self::Table(t) => t.rows.first().map(|r| r.len()),
146            _ => None,
147        }
148    }
149
150    /// Returns the cell text at (row, col) if this is a Table element.
151    pub fn cell(&self, row: usize, col: usize) -> Option<&str> {
152        match self {
153            Self::Table(t) => t.rows.get(row).and_then(|r| r.get(col)).map(|s| s.as_str()),
154            _ => None,
155        }
156    }
157}
158
159/// Comparator for sorting elements in natural reading order:
160/// page ASC, then Y DESC (top-to-bottom in PDF coordinates), then X ASC (left-to-right).
161///
162/// Use with `elements.sort_by(element_reading_order)` instead of `elements.sort()`.
163pub fn element_reading_order(a: &Element, b: &Element) -> Ordering {
164    let page_cmp = a.page().cmp(&b.page());
165    if page_cmp != Ordering::Equal {
166        return page_cmp;
167    }
168    // Higher Y = higher on page in PDF coords → should come first
169    let y_cmp = b.bbox().y.total_cmp(&a.bbox().y);
170    if y_cmp != Ordering::Equal {
171        return y_cmp;
172    }
173    a.bbox().x.total_cmp(&b.bbox().x)
174}
175
176impl PartialEq for Element {
177    fn eq(&self, other: &Self) -> bool {
178        std::mem::discriminant(self) == std::mem::discriminant(other) && self.content_eq(other)
179    }
180}
181
// Marker impl. `eq` only compares `String`, `Vec<Vec<String>>` and
// `Option<String>` content (see `content_eq`) and never the floating-point
// metadata, so equality is reflexive and `Eq` is sound.
impl Eq for Element {}
183
184impl Element {
185    /// Content-based equality: compares text/data content, ignoring metadata (position, font, etc.).
186    fn content_eq(&self, other: &Self) -> bool {
187        match (self, other) {
188            (Self::Title(a), Self::Title(b))
189            | (Self::Paragraph(a), Self::Paragraph(b))
190            | (Self::Header(a), Self::Header(b))
191            | (Self::Footer(a), Self::Footer(b))
192            | (Self::ListItem(a), Self::ListItem(b))
193            | (Self::CodeBlock(a), Self::CodeBlock(b)) => a.text == b.text,
194            (Self::Table(a), Self::Table(b)) => a.rows == b.rows,
195            (Self::Image(a), Self::Image(b)) => a.alt_text == b.alt_text,
196            (Self::KeyValue(a), Self::KeyValue(b)) => a.key == b.key && a.value == b.value,
197            _ => false,
198        }
199    }
200}
201
/// Shared data for text-based element variants.
///
/// Used as the payload of the `Title`, `Paragraph`, `Header`, `Footer`,
/// `ListItem` and `CodeBlock` variants of [`Element`].
#[derive(Debug, Clone)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
pub struct ElementData {
    /// Text content of the element.
    pub text: String,
    /// Page, bounding box, confidence and font details for the element.
    pub metadata: ElementMetadata,
}
209
/// Data specific to table elements.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
pub struct TableElementData {
    /// Row-major cell data. Each inner Vec is one row.
    ///
    /// Nothing in this type forces rows to have equal length.
    pub rows: Vec<Vec<String>>,
    /// Page, bounding box, confidence and font details for the table.
    pub metadata: ElementMetadata,
}
218
/// Data specific to image elements.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
pub struct ImageElementData {
    /// Alternative text for the image, if any.
    ///
    /// `Element::text` falls back to an empty string when this is `None`.
    pub alt_text: Option<String>,
    /// Page, bounding box, confidence and font details for the image.
    pub metadata: ElementMetadata,
}
226
/// Data specific to key-value pair elements.
///
/// `Element::text` exposes only the value, while `Element::display_text`
/// renders the pair as `"key: value"`.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
pub struct KeyValueElementData {
    /// Key part of the pair.
    pub key: String,
    /// Value part of the pair.
    pub value: String,
    /// Page, bounding box, confidence and font details for the pair.
    pub metadata: ElementMetadata,
}
235
/// Metadata common to all element types.
///
/// The `Default` value is page 0, a zero-sized bounding box at the origin,
/// confidence `1.0`, no font information, neither bold nor italic, and no
/// parent heading.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
pub struct ElementMetadata {
    /// Page number (0-indexed).
    pub page: u32,
    /// Bounding box on the page.
    pub bbox: ElementBBox,
    /// Classification confidence (0.0–1.0).
    pub confidence: f64,
    /// Font name if detected.
    pub font_name: Option<String>,
    /// Font size in points if detected.
    pub font_size: Option<f64>,
    /// Whether the text is bold.
    pub is_bold: bool,
    /// Whether the text is italic.
    pub is_italic: bool,
    /// The text of the nearest preceding Title element, if any.
    ///
    /// Can be assigned through `Element::set_parent_heading`.
    pub parent_heading: Option<String>,
}
257
258impl Default for ElementMetadata {
259    fn default() -> Self {
260        Self {
261            page: 0,
262            bbox: ElementBBox::ZERO,
263            confidence: 1.0,
264            font_name: None,
265            font_size: None,
266            is_bold: false,
267            is_italic: false,
268            parent_heading: None,
269        }
270    }
271}
272
/// Axis-aligned bounding box for an element on a PDF page.
///
/// The box is anchored at its bottom-left corner (`x`, `y`) and extends
/// `width` to the right and `height` upward, following the PDF convention
/// in which a larger Y is higher on the page.
#[derive(Debug, Clone, Copy, PartialEq)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
pub struct ElementBBox {
    /// Left edge X coordinate.
    pub x: f64,
    /// Bottom edge Y coordinate (PDF coordinate system).
    pub y: f64,
    /// Width of the bounding box.
    pub width: f64,
    /// Height of the bounding box.
    pub height: f64,
}
286
287impl ElementBBox {
288    /// A zero-sized bounding box at the origin.
289    pub const ZERO: Self = Self {
290        x: 0.0,
291        y: 0.0,
292        width: 0.0,
293        height: 0.0,
294    };
295
296    /// Creates a new bounding box.
297    pub fn new(x: f64, y: f64, width: f64, height: f64) -> Self {
298        Self {
299            x,
300            y,
301            width,
302            height,
303        }
304    }
305
306    /// Right edge X coordinate (x + width).
307    pub fn right(&self) -> f64 {
308        self.x + self.width
309    }
310
311    /// Top edge Y coordinate (y + height) in PDF coordinate system.
312    pub fn top(&self) -> f64 {
313        self.y + self.height
314    }
315}