Skip to main content

oxidize_pdf/text/structured/
types.rs

//! Core data types for structured data extraction.
2
3use serde::{Deserialize, Serialize};
4
5/// Bounding box for spatial positioning.
6///
7/// Coordinates are in PDF user space units (typically 1/72 inch).
8#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
9pub struct BoundingBox {
10    /// X coordinate of bottom-left corner
11    pub x: f64,
12    /// Y coordinate of bottom-left corner
13    pub y: f64,
14    /// Width of the bounding box
15    pub width: f64,
16    /// Height of the bounding box
17    pub height: f64,
18}
19
20impl BoundingBox {
21    /// Creates a new bounding box.
22    pub fn new(x: f64, y: f64, width: f64, height: f64) -> Self {
23        Self {
24            x,
25            y,
26            width,
27            height,
28        }
29    }
30
31    /// Returns the right edge X coordinate.
32    pub fn right(&self) -> f64 {
33        self.x + self.width
34    }
35
36    /// Returns the top edge Y coordinate.
37    pub fn top(&self) -> f64 {
38        self.y + self.height
39    }
40
41    /// Checks if this bounding box contains a point.
42    pub fn contains(&self, x: f64, y: f64) -> bool {
43        x >= self.x && x <= self.right() && y >= self.y && y <= self.top()
44    }
45}
46
47/// A detected table structure.
48///
49/// Tables are detected by analyzing vertical and horizontal text alignment
50/// using clustering algorithms.
51#[derive(Debug, Clone, Serialize, Deserialize)]
52pub struct Table {
53    /// Table rows
54    pub rows: Vec<Row>,
55    /// Column definitions
56    pub columns: Vec<Column>,
57    /// Spatial extent of the entire table
58    pub bounding_box: BoundingBox,
59    /// Detection confidence score (0.0 to 1.0)
60    ///
61    /// Higher scores indicate more regular alignment and structure.
62    pub confidence: f64,
63}
64
65impl Table {
66    /// Creates a new table.
67    pub fn new(
68        rows: Vec<Row>,
69        columns: Vec<Column>,
70        bounding_box: BoundingBox,
71        confidence: f64,
72    ) -> Self {
73        Self {
74            rows,
75            columns,
76            bounding_box,
77            confidence,
78        }
79    }
80
81    /// Returns the number of rows.
82    pub fn row_count(&self) -> usize {
83        self.rows.len()
84    }
85
86    /// Returns the number of columns.
87    pub fn column_count(&self) -> usize {
88        self.columns.len()
89    }
90
91    /// Gets a cell at the specified row and column index.
92    pub fn get_cell(&self, row_idx: usize, col_idx: usize) -> Option<&Cell> {
93        self.rows.get(row_idx)?.cells.get(col_idx)
94    }
95}
96
97/// A single row in a table.
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct Row {
100    /// Cells in this row
101    pub cells: Vec<Cell>,
102    /// Y position of the row baseline
103    pub y_position: f64,
104    /// Height of the row
105    pub height: f64,
106}
107
108impl Row {
109    /// Creates a new row.
110    pub fn new(cells: Vec<Cell>, y_position: f64, height: f64) -> Self {
111        Self {
112            cells,
113            y_position,
114            height,
115        }
116    }
117}
118
119/// A single cell in a table.
120#[derive(Debug, Clone, Serialize, Deserialize)]
121pub struct Cell {
122    /// Text content of the cell
123    pub text: String,
124    /// Column index (0-based)
125    pub column_index: usize,
126    /// Spatial extent of the cell
127    pub bounding_box: BoundingBox,
128}
129
130impl Cell {
131    /// Creates a new empty cell.
132    pub fn new(column_index: usize, bounding_box: BoundingBox) -> Self {
133        Self {
134            text: String::new(),
135            column_index,
136            bounding_box,
137        }
138    }
139
140    /// Adds text to this cell.
141    pub fn add_text(&mut self, text: &str) {
142        if !self.text.is_empty() {
143            self.text.push(' ');
144        }
145        self.text.push_str(text);
146    }
147
148    /// Checks if the cell is empty.
149    pub fn is_empty(&self) -> bool {
150        self.text.trim().is_empty()
151    }
152}
153
154/// Column definition in a table.
155#[derive(Debug, Clone, Serialize, Deserialize)]
156pub struct Column {
157    /// X position of the column center
158    pub x_position: f64,
159    /// Width of the column
160    pub width: f64,
161    /// Detected text alignment
162    pub alignment: Alignment,
163}
164
165impl Column {
166    /// Creates a new column.
167    pub fn new(x_position: f64, width: f64, alignment: Alignment) -> Self {
168        Self {
169            x_position,
170            width,
171            alignment,
172        }
173    }
174
175    /// Returns the left edge of the column.
176    pub fn left(&self) -> f64 {
177        self.x_position - self.width / 2.0
178    }
179
180    /// Returns the right edge of the column.
181    pub fn right(&self) -> f64 {
182        self.x_position + self.width / 2.0
183    }
184}
185
186/// Text alignment within a column.
187#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
188pub enum Alignment {
189    /// Left-aligned text
190    Left,
191    /// Right-aligned text
192    Right,
193    /// Center-aligned text
194    Center,
195    /// Justified text
196    Justified,
197}
198
199impl Default for Alignment {
200    fn default() -> Self {
201        Alignment::Left
202    }
203}
204
205/// A detected key-value pair.
206///
207/// Key-value pairs are detected using multiple pattern matching strategies.
208#[derive(Debug, Clone, Serialize, Deserialize)]
209pub struct KeyValuePair {
210    /// The key (label) text
211    pub key: String,
212    /// The value text
213    pub value: String,
214    /// Detection confidence score (0.0 to 1.0)
215    pub confidence: f64,
216    /// The pattern used to detect this pair
217    pub pattern: KeyValuePattern,
218}
219
220impl KeyValuePair {
221    /// Creates a new key-value pair.
222    pub fn new(key: String, value: String, confidence: f64, pattern: KeyValuePattern) -> Self {
223        Self {
224            key,
225            value,
226            confidence,
227            pattern,
228        }
229    }
230}
231
232/// Pattern used to detect a key-value pair.
233#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
234pub enum KeyValuePattern {
235    /// Colon-separated format: "Label: Value"
236    ColonSeparated,
237    /// Spatially aligned format: "Label      Value"
238    SpatialAlignment,
239    /// Tab-separated format: "Label\tValue"
240    Tabular,
241}
242
/// Configuration for structured data detection.
///
/// Start from [`StructuredDataConfig::new`] (or `Default::default()`) and
/// adjust individual settings with the chainable `with_*` methods. Every
/// field has a matching setter.
#[derive(Debug, Clone)]
pub struct StructuredDataConfig {
    /// Minimum number of rows to consider something a table
    pub min_table_rows: usize,
    /// Minimum number of columns to consider something a table
    pub min_table_columns: usize,
    /// Tolerance for column alignment (in PDF units)
    ///
    /// Text fragments within this distance are considered aligned.
    pub column_alignment_tolerance: f64,
    /// Tolerance for row alignment (in PDF units)
    pub row_alignment_tolerance: f64,
    /// Enable table detection
    pub detect_tables: bool,
    /// Enable key-value pair detection
    pub detect_key_value: bool,
    /// Enable multi-column layout detection
    pub detect_multi_column: bool,
    /// Minimum horizontal gap to consider a column boundary (in PDF units)
    pub min_column_gap: f64,
}

impl Default for StructuredDataConfig {
    /// Returns the default configuration: tables need at least 2 rows and
    /// 2 columns, column/row tolerances are 5.0 / 3.0 PDF units, every
    /// detector is enabled, and the minimum column gap is 20.0 PDF units.
    fn default() -> Self {
        Self {
            min_table_rows: 2,
            min_table_columns: 2,
            column_alignment_tolerance: 5.0,
            row_alignment_tolerance: 3.0,
            detect_tables: true,
            detect_key_value: true,
            detect_multi_column: true,
            min_column_gap: 20.0,
        }
    }
}

impl StructuredDataConfig {
    /// Creates a new configuration with default values.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the minimum number of rows for table detection.
    #[must_use]
    pub fn with_min_table_rows(mut self, rows: usize) -> Self {
        self.min_table_rows = rows;
        self
    }

    /// Sets the minimum number of columns for table detection.
    #[must_use]
    pub fn with_min_table_columns(mut self, columns: usize) -> Self {
        self.min_table_columns = columns;
        self
    }

    /// Sets the column alignment tolerance (in PDF units).
    #[must_use]
    pub fn with_column_tolerance(mut self, tolerance: f64) -> Self {
        self.column_alignment_tolerance = tolerance;
        self
    }

    /// Sets the row alignment tolerance (in PDF units).
    #[must_use]
    pub fn with_row_tolerance(mut self, tolerance: f64) -> Self {
        self.row_alignment_tolerance = tolerance;
        self
    }

    /// Enables or disables table detection.
    #[must_use]
    pub fn with_table_detection(mut self, enabled: bool) -> Self {
        self.detect_tables = enabled;
        self
    }

    /// Enables or disables key-value pair detection.
    #[must_use]
    pub fn with_key_value_detection(mut self, enabled: bool) -> Self {
        self.detect_key_value = enabled;
        self
    }

    /// Enables or disables multi-column layout detection.
    #[must_use]
    pub fn with_multi_column_detection(mut self, enabled: bool) -> Self {
        self.detect_multi_column = enabled;
        self
    }

    /// Sets the minimum horizontal gap that counts as a column boundary
    /// (in PDF units).
    #[must_use]
    pub fn with_min_column_gap(mut self, gap: f64) -> Self {
        self.min_column_gap = gap;
        self
    }
}
329
330/// Multi-column layout boundary.
331#[derive(Debug, Clone, Serialize, Deserialize)]
332pub struct ColumnBoundary {
333    /// X position of the boundary
334    pub x_position: f64,
335    /// Width of the gap at this boundary
336    pub gap_width: f64,
337}
338
339impl ColumnBoundary {
340    /// Creates a new column boundary.
341    pub fn new(x_position: f64, gap_width: f64) -> Self {
342        Self {
343            x_position,
344            gap_width,
345        }
346    }
347}
348
349/// A section of text in a multi-column layout.
350#[derive(Debug, Clone, Serialize, Deserialize)]
351pub struct ColumnSection {
352    /// Column index (0-based, left to right)
353    pub column_index: usize,
354    /// Text content in reading order
355    pub text: String,
356    /// Spatial extent of this column section
357    pub bounding_box: BoundingBox,
358}
359
360impl ColumnSection {
361    /// Creates a new column section.
362    pub fn new(column_index: usize, text: String, bounding_box: BoundingBox) -> Self {
363        Self {
364            column_index,
365            text,
366            bounding_box,
367        }
368    }
369}
370
371/// Result of structured data detection.
372#[derive(Debug, Clone, Serialize, Deserialize)]
373pub struct StructuredDataResult {
374    /// Detected tables
375    pub tables: Vec<Table>,
376    /// Detected key-value pairs
377    pub key_value_pairs: Vec<KeyValuePair>,
378    /// Multi-column layout sections
379    pub column_sections: Vec<ColumnSection>,
380}
381
382impl StructuredDataResult {
383    /// Creates a new empty result.
384    pub fn new() -> Self {
385        Self {
386            tables: Vec::new(),
387            key_value_pairs: Vec::new(),
388            column_sections: Vec::new(),
389        }
390    }
391}
392
393impl Default for StructuredDataResult {
394    fn default() -> Self {
395        Self::new()
396    }
397}
398
#[cfg(test)]
mod tests {
    use super::*;

    /// Fields and derived edges reflect the constructor arguments.
    #[test]
    fn test_bounding_box_basic() {
        let rect = BoundingBox::new(10.0, 20.0, 100.0, 50.0);

        // Stored fields.
        assert_eq!(rect.x, 10.0);
        assert_eq!(rect.y, 20.0);
        assert_eq!(rect.width, 100.0);
        assert_eq!(rect.height, 50.0);

        // Derived edges.
        assert_eq!(rect.right(), 110.0);
        assert_eq!(rect.top(), 70.0);
    }

    /// Containment is inclusive on the border and false outside the box.
    #[test]
    fn test_bounding_box_contains() {
        let rect = BoundingBox::new(10.0, 20.0, 100.0, 50.0);

        // Interior point and both extreme corners are contained.
        assert!(rect.contains(50.0, 40.0));
        assert!(rect.contains(10.0, 20.0));
        assert!(rect.contains(110.0, 70.0));

        // Points left and right of the box are not.
        assert!(!rect.contains(5.0, 40.0));
        assert!(!rect.contains(120.0, 40.0));
    }

    /// A cell starts empty and joins added fragments with a single space.
    #[test]
    fn test_cell_operations() {
        let mut cell = Cell::new(0, BoundingBox::new(0.0, 0.0, 50.0, 20.0));
        assert!(cell.is_empty());

        cell.add_text("Hello");
        assert_eq!(cell.text, "Hello");
        assert!(!cell.is_empty());

        cell.add_text("World");
        assert_eq!(cell.text, "Hello World");
    }

    /// Column edges sit half a width either side of the centre.
    #[test]
    fn test_column_edges() {
        let col = Column::new(100.0, 50.0, Alignment::Left);
        assert_eq!(col.left(), 75.0);
        assert_eq!(col.right(), 125.0);
    }

    /// Counts and cell lookup work on a one-by-one table.
    #[test]
    fn test_table_accessors() {
        let only_cell = Cell::new(0, BoundingBox::new(0.0, 0.0, 50.0, 25.0));
        let rows = vec![Row::new(vec![only_cell], 0.0, 25.0)];
        let columns = vec![Column::new(25.0, 50.0, Alignment::Left)];
        let extent = BoundingBox::new(0.0, 0.0, 200.0, 100.0);

        let table = Table::new(rows, columns, extent, 0.95);

        assert_eq!(table.row_count(), 1);
        assert_eq!(table.column_count(), 1);
        assert!(table.get_cell(0, 0).is_some());
        assert!(table.get_cell(1, 0).is_none());
    }

    /// Chained setters override only the settings they name.
    #[test]
    fn test_config_builder() {
        let cfg = StructuredDataConfig::new()
            .with_min_table_rows(3)
            .with_min_table_columns(4)
            .with_column_tolerance(10.0)
            .with_table_detection(false);

        assert_eq!(cfg.min_table_rows, 3);
        assert_eq!(cfg.min_table_columns, 4);
        assert_eq!(cfg.column_alignment_tolerance, 10.0);
        assert!(!cfg.detect_tables);
    }

    /// The default alignment is left.
    #[test]
    fn test_alignment_default() {
        assert_eq!(Alignment::default(), Alignment::Left);
    }
}