1use serde::{Deserialize, Serialize};
4
5use super::bbox::BoundingBox;
6use super::chunks::ImageChunk;
7use super::chunks::LineArtChunk;
8use super::content::ContentElement;
9use super::enums::{SemanticType, TextFormat};
10use super::table::TableBorder;
11use super::text::TextColumn;
12
13#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct SemanticTextNode {
16 pub bbox: BoundingBox,
18 pub index: Option<u32>,
20 pub level: Option<String>,
22 pub semantic_type: SemanticType,
24 pub correct_semantic_score: Option<f64>,
26 pub columns: Vec<TextColumn>,
28 pub font_weight: Option<f64>,
30 pub font_size: Option<f64>,
32 pub text_color: Option<Vec<f64>>,
34 pub italic_angle: Option<f64>,
36 pub font_name: Option<String>,
38 pub text_format: Option<TextFormat>,
40 pub max_font_size: Option<f64>,
42 pub background_color: Option<Vec<f64>>,
44 pub is_hidden_text: bool,
46}
47
48impl SemanticTextNode {
49 pub fn value(&self) -> String {
51 self.columns
52 .iter()
53 .map(|c| c.value())
54 .collect::<Vec<_>>()
55 .join("\n")
56 }
57
58 pub fn lines_number(&self) -> usize {
60 self.columns
61 .iter()
62 .flat_map(|c| &c.text_blocks)
63 .map(|b| b.text_lines.len())
64 .sum()
65 }
66
67 pub fn columns_number(&self) -> usize {
69 self.columns.len()
70 }
71
72 pub fn is_empty(&self) -> bool {
74 self.value().trim().is_empty()
75 }
76
77 pub fn is_space_node(&self) -> bool {
79 self.value().chars().all(|c| c.is_whitespace())
80 }
81
82 pub fn starts_with_arabic_number(&self) -> bool {
84 let text = self.value();
85 let trimmed = text.trim_start();
86 trimmed.starts_with(|c: char| c.is_ascii_digit())
87 }
88}
89
90#[derive(Debug, Clone, Serialize, Deserialize)]
92pub struct SemanticParagraph {
93 pub base: SemanticTextNode,
95 pub enclosed_top: bool,
97 pub enclosed_bottom: bool,
99 pub indentation: i32,
101}
102
103#[derive(Debug, Clone, Serialize, Deserialize)]
105pub struct SemanticHeading {
106 pub base: SemanticParagraph,
108 pub heading_level: Option<u32>,
110}
111
112#[derive(Debug, Clone, Serialize, Deserialize)]
114pub struct SemanticNumberHeading {
115 pub base: SemanticHeading,
117}
118
119#[derive(Debug, Clone, Serialize, Deserialize)]
121pub struct SemanticCaption {
122 pub base: SemanticTextNode,
124 pub linked_content_id: Option<u64>,
126}
127
128#[derive(Debug, Clone, Serialize, Deserialize)]
130pub struct SemanticHeaderOrFooter {
131 pub bbox: BoundingBox,
133 pub index: Option<u32>,
135 pub level: Option<String>,
137 pub semantic_type: SemanticType,
139 pub contents: Vec<ContentElement>,
141}
142
143#[derive(Debug, Clone, Serialize, Deserialize)]
145pub struct SemanticFigure {
146 pub bbox: BoundingBox,
148 pub index: Option<u32>,
150 pub level: Option<String>,
152 pub semantic_type: SemanticType,
154 pub images: Vec<ImageChunk>,
156 pub line_arts: Vec<LineArtChunk>,
158}
159
160#[derive(Debug, Clone, Serialize, Deserialize)]
162pub struct SemanticTable {
163 pub bbox: BoundingBox,
165 pub index: Option<u32>,
167 pub level: Option<String>,
169 pub semantic_type: SemanticType,
171 pub table_border: TableBorder,
173}
174
175#[derive(Debug, Clone, Serialize, Deserialize)]
177pub struct SemanticFormula {
178 pub bbox: BoundingBox,
180 pub index: Option<u32>,
182 pub level: Option<String>,
184 pub latex: String,
186}
187
188#[derive(Debug, Clone, Serialize, Deserialize)]
190pub struct SemanticPicture {
191 pub bbox: BoundingBox,
193 pub index: Option<u32>,
195 pub level: Option<String>,
197 pub image_index: u32,
199 pub description: String,
201}
202
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a paragraph-typed text node with no columns and every optional
    /// attribute unset.
    fn make_empty_text_node() -> SemanticTextNode {
        SemanticTextNode {
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 100.0, 12.0),
            index: None,
            level: None,
            semantic_type: SemanticType::Paragraph,
            correct_semantic_score: None,
            columns: vec![],
            font_weight: None,
            font_size: None,
            text_color: None,
            italic_angle: None,
            font_name: None,
            text_format: None,
            max_font_size: None,
            background_color: None,
            is_hidden_text: false,
        }
    }

    /// A node without columns has empty text, counts as both empty and
    /// whitespace-only, and reports zero lines and columns.
    #[test]
    fn test_empty_text_node() {
        let node = make_empty_text_node();
        assert_eq!(node.value(), "");
        assert!(node.is_empty());
        assert!(node.is_space_node());
        assert!(!node.starts_with_arabic_number());
        assert_eq!(node.lines_number(), 0);
        assert_eq!(node.columns_number(), 0);
    }
}