Skip to main content

edgeparse_core/models/
table.rs

1//! Table structures — TableBorder, TableBorderRow, TableBorderCell.
2
3use serde::{Deserialize, Serialize};
4
5use super::bbox::BoundingBox;
6use super::chunks::TextChunk;
7use super::content::ContentElement;
8use super::enums::SemanticType;
9
/// Epsilon for table border coordinate comparisons.
///
/// NOTE(review): presumably the tolerance within which two border coordinates
/// are treated as equal, in the same units as the `f64` coordinates stored on
/// [`TableBorder`]; no use of it is visible here — confirm against callers.
pub const TABLE_BORDER_EPSILON: f64 = 0.5;
12
/// Minimum intersection for assigning content to cells.
///
/// NOTE(review): the name says "percent" but the value is `0.01`; it is not
/// clear from this file whether that means a fraction (1%) or 0.01 percent —
/// confirm against the code that applies this threshold.
pub const MIN_CELL_CONTENT_INTERSECTION_PERCENT: f64 = 0.01;
15
/// Grid-based table structure defined by row/column coordinates.
///
/// The grid lines are described by `x_coordinates` / `y_coordinates` (with
/// matching line widths), while the cell data lives in `rows`.
///
/// `previous_table` and `next_table` own boxed `TableBorder` values, so the
/// derived `Clone`, `Serialize` and `Deserialize` implementations recurse
/// through every table reachable via those links.
///
/// All fields are public and nothing in this type enforces consistency
/// between them (for example `num_rows` versus `rows.len()`); keeping them in
/// sync is up to the code that builds the value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableBorder {
    /// Bounding box of the table.
    pub bbox: BoundingBox,
    /// Global index, if one is set.
    pub index: Option<u32>,
    /// Nesting level (kept as a string), if one is set.
    pub level: Option<String>,
    /// X-coordinates of column boundaries (N+1 values for N columns).
    pub x_coordinates: Vec<f64>,
    /// Widths of column boundary lines.
    ///
    /// NOTE(review): presumably one entry per value in `x_coordinates` — confirm.
    pub x_widths: Vec<f64>,
    /// Y-coordinates of row boundaries (M+1 values for M rows).
    pub y_coordinates: Vec<f64>,
    /// Widths of row boundary lines.
    ///
    /// NOTE(review): presumably one entry per value in `y_coordinates` — confirm.
    pub y_widths: Vec<f64>,
    /// Table rows.
    pub rows: Vec<TableBorderRow>,
    /// Number of rows (stored separately from `rows`).
    pub num_rows: usize,
    /// Number of columns.
    pub num_columns: usize,
    /// Whether this table has structural problems.
    pub is_bad_table: bool,
    /// Whether this came from a transformer model.
    pub is_table_transformer: bool,
    /// Previous table in cross-page chain (an owned, boxed value).
    pub previous_table: Option<Box<TableBorder>>,
    /// Next table in cross-page chain (an owned, boxed value).
    pub next_table: Option<Box<TableBorder>>,
}
48
/// A row in a [`TableBorder`].
///
/// Holds the row's own bounding box plus the cells that belong to it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableBorderRow {
    /// Bounding box of the row.
    pub bbox: BoundingBox,
    /// Global index, if one is set.
    pub index: Option<u32>,
    /// Nesting level (kept as a string), if one is set.
    pub level: Option<String>,
    /// Row number (0-based).
    pub row_number: usize,
    /// Cells in this row.
    pub cells: Vec<TableBorderCell>,
    /// Optional semantic type (header, body, footer).
    pub semantic_type: Option<SemanticType>,
}
65
/// A cell in a [`TableBorderRow`].
///
/// A cell carries two content fields: `content` holds the raw
/// [`TableToken`]s, and `contents` holds the processed [`ContentElement`]s.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableBorderCell {
    /// Bounding box of the cell.
    pub bbox: BoundingBox,
    /// Global index, if one is set.
    pub index: Option<u32>,
    /// Nesting level (kept as a string), if one is set.
    pub level: Option<String>,
    /// Row number (0-based).
    pub row_number: usize,
    /// Column number (0-based).
    pub col_number: usize,
    /// Number of rows this cell spans.
    pub row_span: usize,
    /// Number of columns this cell spans.
    pub col_span: usize,
    /// Raw text content (table tokens).
    pub content: Vec<TableToken>,
    /// Processed content elements (after sub-pipeline).
    pub contents: Vec<ContentElement>,
    /// Optional semantic type.
    pub semantic_type: Option<SemanticType>,
}
90
/// A text chunk assigned to a table cell.
///
/// Pairs a [`TextChunk`] with a [`TableTokenType`] tag describing what kind
/// of cell content the token stands for.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableToken {
    /// Base text chunk.
    pub base: TextChunk,
    /// Token type.
    pub token_type: TableTokenType,
}
99
/// Type of content in a table cell.
///
/// A plain, field-less enum; it is `Copy` and comparable with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum TableTokenType {
    /// Text content.
    Text,
    /// Image content.
    Image,
    /// Nested table.
    Table,
}
110
/// Row of tokens in a table cell.
///
/// Plain alias for `Vec<TableToken>`; it adds no behavior of its own.
pub type TableTokenRow = Vec<TableToken>;
113
/// Collection of detected table borders, indexed by page.
///
/// The derived `Default` yields a collection with zero pages.
#[derive(Debug, Clone, Default)]
pub struct TableBordersCollection {
    /// Per-page table borders: the outer index is the page (0-based `Vec`
    /// index) and the inner vector holds the tables stored for that page.
    pub table_borders: Vec<Vec<TableBorder>>,
}
120
121impl TableBordersCollection {
122    /// Create a new collection for the given number of pages.
123    pub fn new(num_pages: usize) -> Self {
124        Self {
125            table_borders: vec![Vec::new(); num_pages],
126        }
127    }
128
129    /// Add a table border to a page.
130    pub fn add(&mut self, page: usize, border: TableBorder) {
131        if page < self.table_borders.len() {
132            self.table_borders[page].push(border);
133        }
134    }
135
136    /// Get table borders for a page.
137    pub fn get_page(&self, page: usize) -> &[TableBorder] {
138        if page < self.table_borders.len() {
139            &self.table_borders[page]
140        } else {
141            &[]
142        }
143    }
144}
145
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample table border with a 2x2 grid (three boundaries per axis) and no row data.
    fn sample_border() -> TableBorder {
        TableBorder {
            bbox: BoundingBox::new(Some(1), 10.0, 10.0, 200.0, 300.0),
            index: None,
            level: None,
            x_coordinates: vec![10.0, 100.0, 200.0],
            x_widths: vec![1.0, 1.0, 1.0],
            y_coordinates: vec![10.0, 150.0, 300.0],
            y_widths: vec![1.0, 1.0, 1.0],
            rows: vec![],
            num_rows: 2,
            num_columns: 2,
            is_bad_table: false,
            is_table_transformer: false,
            previous_table: None,
            next_table: None,
        }
    }

    /// A border added to a valid page is returned for that page only.
    #[test]
    fn test_table_borders_collection() {
        let mut collection = TableBordersCollection::new(5);
        collection.add(0, sample_border());
        assert_eq!(collection.get_page(0).len(), 1);
        assert_eq!(collection.get_page(1).len(), 0);
        assert_eq!(collection.get_page(10).len(), 0);
    }

    /// Adding to a page index past the end leaves the collection untouched.
    #[test]
    fn test_add_out_of_range_page_is_ignored() {
        let mut collection = TableBordersCollection::new(2);
        // Index 2 is the first invalid page for a two-page collection.
        collection.add(2, sample_border());
        collection.add(10, sample_border());
        assert_eq!(collection.table_borders.len(), 2);
        assert!(collection.table_borders.iter().all(Vec::is_empty));
        assert!(collection.get_page(2).is_empty());
        assert!(collection.get_page(10).is_empty());
    }

    /// A zero-page collection (explicit or via `Default`) accepts nothing and returns nothing.
    #[test]
    fn test_empty_collection() {
        let mut collection = TableBordersCollection::new(0);
        collection.add(0, sample_border());
        assert!(collection.table_borders.is_empty());
        assert!(collection.get_page(0).is_empty());

        let default_collection = TableBordersCollection::default();
        assert!(default_collection.table_borders.is_empty());
        assert!(default_collection.get_page(0).is_empty());
    }
}