1use std::cmp::Ordering;
2
3#[cfg(feature = "semantic")]
4use serde::{Deserialize, Serialize};
5
6#[derive(Debug, Clone)]
11#[cfg_attr(
12 feature = "semantic",
13 derive(Serialize, Deserialize),
14 serde(tag = "type", rename_all = "snake_case")
15)]
16pub enum Element {
17 Title(ElementData),
18 Paragraph(ElementData),
19 Table(TableElementData),
20 Header(ElementData),
21 Footer(ElementData),
22 ListItem(ElementData),
23 Image(ImageElementData),
24 CodeBlock(ElementData),
25 KeyValue(KeyValueElementData),
26}
27
28impl Element {
29 pub fn text(&self) -> &str {
31 match self {
32 Self::Title(d)
33 | Self::Paragraph(d)
34 | Self::Header(d)
35 | Self::Footer(d)
36 | Self::ListItem(d)
37 | Self::CodeBlock(d) => &d.text,
38 Self::Table(t) => {
39 let _ = t;
42 ""
43 }
44 Self::Image(img) => img.alt_text.as_deref().unwrap_or(""),
45 Self::KeyValue(kv) => &kv.value,
46 }
47 }
48
49 pub fn display_text(&self) -> String {
57 match self {
58 Self::Table(t) => t
59 .rows
60 .iter()
61 .map(|row| row.join(" | "))
62 .collect::<Vec<_>>()
63 .join("\n"),
64 Self::Image(img) => img.alt_text.clone().unwrap_or_default(),
65 Self::KeyValue(kv) => format!("{}: {}", kv.key, kv.value),
66 _ => self.text().to_string(),
67 }
68 }
69
70 pub fn page(&self) -> u32 {
72 self.metadata().page
73 }
74
75 pub fn bbox(&self) -> &ElementBBox {
77 &self.metadata().bbox
78 }
79
80 pub fn metadata(&self) -> &ElementMetadata {
82 match self {
83 Self::Title(d)
84 | Self::Paragraph(d)
85 | Self::Header(d)
86 | Self::Footer(d)
87 | Self::ListItem(d)
88 | Self::CodeBlock(d) => &d.metadata,
89 Self::Table(t) => &t.metadata,
90 Self::Image(img) => &img.metadata,
91 Self::KeyValue(kv) => &kv.metadata,
92 }
93 }
94
95 pub fn metadata_mut(&mut self) -> &mut ElementMetadata {
97 match self {
98 Self::Title(d)
99 | Self::Paragraph(d)
100 | Self::Header(d)
101 | Self::Footer(d)
102 | Self::ListItem(d)
103 | Self::CodeBlock(d) => &mut d.metadata,
104 Self::Table(t) => &mut t.metadata,
105 Self::Image(img) => &mut img.metadata,
106 Self::KeyValue(kv) => &mut kv.metadata,
107 }
108 }
109
110 pub fn type_name(&self) -> &'static str {
116 match self {
117 Self::Title(_) => "title",
118 Self::Paragraph(_) => "paragraph",
119 Self::Table(_) => "table",
120 Self::Header(_) => "header",
121 Self::Footer(_) => "footer",
122 Self::ListItem(_) => "list_item",
123 Self::Image(_) => "image",
124 Self::CodeBlock(_) => "code_block",
125 Self::KeyValue(_) => "key_value",
126 }
127 }
128
129 pub fn set_parent_heading(&mut self, heading: Option<String>) {
131 self.metadata_mut().parent_heading = heading;
132 }
133
134 pub fn row_count(&self) -> Option<usize> {
136 match self {
137 Self::Table(t) => Some(t.rows.len()),
138 _ => None,
139 }
140 }
141
142 pub fn column_count(&self) -> Option<usize> {
144 match self {
145 Self::Table(t) => t.rows.first().map(|r| r.len()),
146 _ => None,
147 }
148 }
149
150 pub fn cell(&self, row: usize, col: usize) -> Option<&str> {
152 match self {
153 Self::Table(t) => t.rows.get(row).and_then(|r| r.get(col)).map(|s| s.as_str()),
154 _ => None,
155 }
156 }
157}
158
159pub fn element_reading_order(a: &Element, b: &Element) -> Ordering {
164 let page_cmp = a.page().cmp(&b.page());
165 if page_cmp != Ordering::Equal {
166 return page_cmp;
167 }
168 let y_cmp = b.bbox().y.total_cmp(&a.bbox().y);
170 if y_cmp != Ordering::Equal {
171 return y_cmp;
172 }
173 a.bbox().x.total_cmp(&b.bbox().x)
174}
175
176impl PartialEq for Element {
177 fn eq(&self, other: &Self) -> bool {
178 std::mem::discriminant(self) == std::mem::discriminant(other) && self.content_eq(other)
179 }
180}
181
182impl Eq for Element {}
183
184impl Element {
185 fn content_eq(&self, other: &Self) -> bool {
187 match (self, other) {
188 (Self::Title(a), Self::Title(b))
189 | (Self::Paragraph(a), Self::Paragraph(b))
190 | (Self::Header(a), Self::Header(b))
191 | (Self::Footer(a), Self::Footer(b))
192 | (Self::ListItem(a), Self::ListItem(b))
193 | (Self::CodeBlock(a), Self::CodeBlock(b)) => a.text == b.text,
194 (Self::Table(a), Self::Table(b)) => a.rows == b.rows,
195 (Self::Image(a), Self::Image(b)) => a.alt_text == b.alt_text,
196 (Self::KeyValue(a), Self::KeyValue(b)) => a.key == b.key && a.value == b.value,
197 _ => false,
198 }
199 }
200}
201
202#[derive(Debug, Clone)]
204#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
205pub struct ElementData {
206 pub text: String,
207 pub metadata: ElementMetadata,
208}
209
210#[derive(Debug, Clone)]
212#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
213pub struct TableElementData {
214 pub rows: Vec<Vec<String>>,
216 pub metadata: ElementMetadata,
217}
218
219#[derive(Debug, Clone)]
221#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
222pub struct ImageElementData {
223 pub alt_text: Option<String>,
224 pub metadata: ElementMetadata,
225}
226
227#[derive(Debug, Clone)]
229#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
230pub struct KeyValueElementData {
231 pub key: String,
232 pub value: String,
233 pub metadata: ElementMetadata,
234}
235
236#[derive(Debug, Clone)]
238#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
239pub struct ElementMetadata {
240 pub page: u32,
242 pub bbox: ElementBBox,
244 pub confidence: f64,
246 pub font_name: Option<String>,
248 pub font_size: Option<f64>,
250 pub is_bold: bool,
252 pub is_italic: bool,
254 pub parent_heading: Option<String>,
256}
257
258impl Default for ElementMetadata {
259 fn default() -> Self {
260 Self {
261 page: 0,
262 bbox: ElementBBox::ZERO,
263 confidence: 1.0,
264 font_name: None,
265 font_size: None,
266 is_bold: false,
267 is_italic: false,
268 parent_heading: None,
269 }
270 }
271}
272
/// Axis-aligned bounding box described by an origin point and a size.
///
/// Derives serde's `Serialize`/`Deserialize` when the `semantic` feature is
/// enabled.
#[derive(Clone, Copy, Debug, PartialEq)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
pub struct ElementBBox {
    /// Horizontal coordinate of the box origin.
    pub x: f64,
    /// Vertical coordinate of the box origin.
    pub y: f64,
    /// Horizontal extent of the box.
    pub width: f64,
    /// Vertical extent of the box.
    pub height: f64,
}
286
287impl ElementBBox {
288 pub const ZERO: Self = Self {
290 x: 0.0,
291 y: 0.0,
292 width: 0.0,
293 height: 0.0,
294 };
295
296 pub fn new(x: f64, y: f64, width: f64, height: f64) -> Self {
298 Self {
299 x,
300 y,
301 width,
302 height,
303 }
304 }
305
306 pub fn right(&self) -> f64 {
308 self.x + self.width
309 }
310
311 pub fn top(&self) -> f64 {
313 self.y + self.height
314 }
315}