Skip to main content

edgeparse_core/models/
semantic.rs

//! Semantic node types: text nodes, paragraphs, headings, captions,
//! headers/footers, figures, tables, formulas and pictures.
2
3use serde::{Deserialize, Serialize};
4
5use super::bbox::BoundingBox;
6use super::chunks::ImageChunk;
7use super::chunks::LineArtChunk;
8use super::content::ContentElement;
9use super::enums::{SemanticType, TextFormat};
10use super::table::TableBorder;
11use super::text::TextColumn;
12
13/// Base for all text-bearing semantic elements.
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct SemanticTextNode {
16    /// Bounding box
17    pub bbox: BoundingBox,
18    /// Global index
19    pub index: Option<u32>,
20    /// Nesting level
21    pub level: Option<String>,
22    /// Semantic classification
23    pub semantic_type: SemanticType,
24    /// Confidence score for semantic classification
25    pub correct_semantic_score: Option<f64>,
26    /// Text columns
27    pub columns: Vec<TextColumn>,
28    /// Dominant font weight
29    pub font_weight: Option<f64>,
30    /// Dominant font size
31    pub font_size: Option<f64>,
32    /// Dominant text color — original PDF color components (1=Gray, 3=RGB, 4=CMYK)
33    pub text_color: Option<Vec<f64>>,
34    /// Italic angle
35    pub italic_angle: Option<f64>,
36    /// Font name
37    pub font_name: Option<String>,
38    /// Text format
39    pub text_format: Option<TextFormat>,
40    /// Maximum font size in this node
41    pub max_font_size: Option<f64>,
42    /// Background color — original PDF color components (1=Gray, 3=RGB, 4=CMYK)
43    pub background_color: Option<Vec<f64>>,
44    /// Whether all text is hidden
45    pub is_hidden_text: bool,
46}
47
48impl SemanticTextNode {
49    /// Concatenated text value of all columns.
50    pub fn value(&self) -> String {
51        self.columns
52            .iter()
53            .map(|c| c.value())
54            .collect::<Vec<_>>()
55            .join("\n")
56    }
57
58    /// Number of text lines across all columns.
59    pub fn lines_number(&self) -> usize {
60        self.columns
61            .iter()
62            .flat_map(|c| &c.text_blocks)
63            .map(|b| b.text_lines.len())
64            .sum()
65    }
66
67    /// Number of columns.
68    pub fn columns_number(&self) -> usize {
69        self.columns.len()
70    }
71
72    /// Whether this node contains no text.
73    pub fn is_empty(&self) -> bool {
74        self.value().trim().is_empty()
75    }
76
77    /// Whether this node contains only whitespace.
78    pub fn is_space_node(&self) -> bool {
79        self.value().chars().all(|c| c.is_whitespace())
80    }
81
82    /// Whether the text starts with an Arabic (decimal) number.
83    pub fn starts_with_arabic_number(&self) -> bool {
84        let text = self.value();
85        let trimmed = text.trim_start();
86        trimmed.starts_with(|c: char| c.is_ascii_digit())
87    }
88}
89
90/// A semantic paragraph.
91#[derive(Debug, Clone, Serialize, Deserialize)]
92pub struct SemanticParagraph {
93    /// Base text node
94    pub base: SemanticTextNode,
95    /// Whether enclosed at top
96    pub enclosed_top: bool,
97    /// Whether enclosed at bottom
98    pub enclosed_bottom: bool,
99    /// Indentation level
100    pub indentation: i32,
101}
102
103/// A semantic heading.
104#[derive(Debug, Clone, Serialize, Deserialize)]
105pub struct SemanticHeading {
106    /// Base paragraph
107    pub base: SemanticParagraph,
108    /// Heading level (1-6, None if not yet assigned)
109    pub heading_level: Option<u32>,
110}
111
112/// A numbered heading (e.g., "1.2.3 Budget Overview").
113#[derive(Debug, Clone, Serialize, Deserialize)]
114pub struct SemanticNumberHeading {
115    /// Base heading
116    pub base: SemanticHeading,
117}
118
119/// A caption linked to an image or table.
120#[derive(Debug, Clone, Serialize, Deserialize)]
121pub struct SemanticCaption {
122    /// Base text node
123    pub base: SemanticTextNode,
124    /// ID of the linked content (image or table)
125    pub linked_content_id: Option<u64>,
126}
127
128/// Page header or footer.
129#[derive(Debug, Clone, Serialize, Deserialize)]
130pub struct SemanticHeaderOrFooter {
131    /// Bounding box
132    pub bbox: BoundingBox,
133    /// Global index
134    pub index: Option<u32>,
135    /// Nesting level
136    pub level: Option<String>,
137    /// Header or Footer
138    pub semantic_type: SemanticType,
139    /// Nested content elements
140    pub contents: Vec<ContentElement>,
141}
142
143/// A figure containing images and/or line art.
144#[derive(Debug, Clone, Serialize, Deserialize)]
145pub struct SemanticFigure {
146    /// Bounding box
147    pub bbox: BoundingBox,
148    /// Global index
149    pub index: Option<u32>,
150    /// Nesting level
151    pub level: Option<String>,
152    /// Semantic type
153    pub semantic_type: SemanticType,
154    /// Image chunks
155    pub images: Vec<ImageChunk>,
156    /// Line art chunks
157    pub line_arts: Vec<LineArtChunk>,
158}
159
160/// A semantic table wrapping a TableBorder.
161#[derive(Debug, Clone, Serialize, Deserialize)]
162pub struct SemanticTable {
163    /// Bounding box
164    pub bbox: BoundingBox,
165    /// Global index
166    pub index: Option<u32>,
167    /// Nesting level
168    pub level: Option<String>,
169    /// Semantic type
170    pub semantic_type: SemanticType,
171    /// Table border structure
172    pub table_border: TableBorder,
173}
174
175/// A LaTeX formula (from enrichment).
176#[derive(Debug, Clone, Serialize, Deserialize)]
177pub struct SemanticFormula {
178    /// Bounding box
179    pub bbox: BoundingBox,
180    /// Global index
181    pub index: Option<u32>,
182    /// Nesting level
183    pub level: Option<String>,
184    /// LaTeX representation
185    pub latex: String,
186}
187
188/// A described image (from enrichment).
189#[derive(Debug, Clone, Serialize, Deserialize)]
190pub struct SemanticPicture {
191    /// Bounding box
192    pub bbox: BoundingBox,
193    /// Global index
194    pub index: Option<u32>,
195    /// Nesting level
196    pub level: Option<String>,
197    /// Image index
198    pub image_index: u32,
199    /// Human-readable description
200    pub description: String,
201}
202
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a paragraph-typed text node with no columns and every optional
    /// attribute unset.
    fn make_empty_text_node() -> SemanticTextNode {
        SemanticTextNode {
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 100.0, 12.0),
            index: None,
            level: None,
            semantic_type: SemanticType::Paragraph,
            correct_semantic_score: None,
            columns: vec![],
            font_weight: None,
            font_size: None,
            text_color: None,
            italic_angle: None,
            font_name: None,
            text_format: None,
            max_font_size: None,
            background_color: None,
            is_hidden_text: false,
        }
    }

    /// A node without columns has no text, no lines and no columns.
    #[test]
    fn test_empty_text_node() {
        let node = make_empty_text_node();
        assert_eq!(node.value(), "");
        assert!(node.is_empty());
        assert!(node.is_space_node());
        assert!(!node.starts_with_arabic_number());
        assert_eq!(node.lines_number(), 0);
        assert_eq!(node.columns_number(), 0);
    }
}