edgeparse_core/models/
chunks.rs1use serde::{Deserialize, Serialize};
4
5use super::bbox::{BoundingBox, Vertex};
6use super::enums::{PdfLayer, TextFormat, TextType};
7
8#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct TextChunk {
11 pub value: String,
13 pub bbox: BoundingBox,
15 pub font_name: String,
17 pub font_size: f64,
19 pub font_weight: f64,
21 pub italic_angle: f64,
23 pub font_color: String,
25 pub contrast_ratio: f64,
27 pub symbol_ends: Vec<f64>,
29 pub text_format: TextFormat,
31 pub text_type: TextType,
33 pub pdf_layer: PdfLayer,
35 pub ocg_visible: bool,
37 pub index: Option<usize>,
39 pub page_number: Option<u32>,
41 pub level: Option<String>,
43 #[serde(skip_serializing_if = "Option::is_none")]
46 pub mcid: Option<i64>,
47}
48
49impl TextChunk {
50 pub fn is_white_space_chunk(&self) -> bool {
52 self.value.chars().all(|c| c.is_whitespace())
53 }
54
55 pub fn compress_spaces(&mut self) {
57 let mut result = String::with_capacity(self.value.len());
58 let mut last_was_space = false;
59 for ch in self.value.chars() {
60 if ch == ' ' {
61 if !last_was_space {
62 result.push(' ');
63 }
64 last_was_space = true;
65 } else {
66 result.push(ch);
67 last_was_space = false;
68 }
69 }
70 self.value = result;
71 }
72
73 pub fn text_length(&self) -> usize {
75 self.value.chars().count()
76 }
77
78 pub fn average_symbol_width(&self) -> f64 {
80 let len = self.text_length();
81 if len == 0 {
82 return 0.0;
83 }
84 self.bbox.width() / len as f64
85 }
86
87 pub fn symbol_start_coordinate(&self, idx: usize) -> f64 {
89 if idx == 0 {
90 self.bbox.left_x
91 } else if idx <= self.symbol_ends.len() {
92 self.symbol_ends[idx - 1]
93 } else {
94 self.bbox.right_x
95 }
96 }
97
98 pub fn symbol_end_coordinate(&self, idx: usize) -> f64 {
100 if idx < self.symbol_ends.len() {
101 self.symbol_ends[idx]
102 } else {
103 self.bbox.right_x
104 }
105 }
106}
107
108#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct ImageChunk {
111 pub bbox: BoundingBox,
113 pub index: Option<u32>,
115 pub level: Option<String>,
117}
118
119#[derive(Debug, Clone, Serialize, Deserialize)]
121pub struct LineChunk {
122 pub bbox: BoundingBox,
124 pub index: Option<u32>,
126 pub level: Option<String>,
128 pub start: Vertex,
130 pub end: Vertex,
132 pub width: f64,
134 pub is_horizontal_line: bool,
136 pub is_vertical_line: bool,
138 pub is_square: bool,
140}
141
142#[derive(Debug, Clone, Serialize, Deserialize)]
144pub struct LineArtChunk {
145 pub bbox: BoundingBox,
147 pub index: Option<u32>,
149 pub level: Option<String>,
151 pub line_chunks: Vec<LineChunk>,
153}
154
/// Size tolerance associated with line art.
///
/// NOTE(review): the code that consumes this constant is not in this module;
/// the value is presumably expressed in the same units as bounding-box
/// coordinates — confirm against the callers.
pub const LINE_ART_SIZE_EPSILON: f64 = 1.0;
157
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `TextChunk` holding `value`, with fixed font attributes, no
    /// per-symbol end coordinates and the bounding box
    /// `BoundingBox::new(Some(1), 0.0, 0.0, 100.0, 12.0)`.
    fn make_text_chunk(value: &str) -> TextChunk {
        TextChunk {
            value: value.to_string(),
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 100.0, 12.0),
            font_name: "Helvetica".to_string(),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: "#000000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: vec![],
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        }
    }

    #[test]
    fn test_is_white_space_chunk() {
        assert!(make_text_chunk(" ").is_white_space_chunk());
        assert!(make_text_chunk(" \t\n ").is_white_space_chunk());
        assert!(!make_text_chunk("hello").is_white_space_chunk());
        assert!(!make_text_chunk("  hello  ").is_white_space_chunk());
        assert!(make_text_chunk("").is_white_space_chunk());
    }

    #[test]
    fn test_compress_spaces() {
        // The input must contain runs of spaces; otherwise the assertion would
        // also hold if `compress_spaces` did nothing at all.
        let mut chunk = make_text_chunk("hello   world  test");
        chunk.compress_spaces();
        assert_eq!(chunk.value, "hello world test");

        // Leading and trailing runs collapse as well.
        let mut chunk = make_text_chunk("   padded   ");
        chunk.compress_spaces();
        assert_eq!(chunk.value, " padded ");

        // Only `' '` is compressed: a tab is kept and ends the current run.
        let mut chunk = make_text_chunk("a \t  b");
        chunk.compress_spaces();
        assert_eq!(chunk.value, "a \t b");
    }

    #[test]
    fn test_text_length() {
        assert_eq!(make_text_chunk("hello").text_length(), 5);
        assert_eq!(make_text_chunk("").text_length(), 0);
        // Characters are counted, not bytes.
        assert_eq!(make_text_chunk("héllo").text_length(), 5);
    }

    #[test]
    fn test_average_symbol_width() {
        let chunk = make_text_chunk("hello");
        assert!((chunk.average_symbol_width() - 20.0).abs() < 0.01);
        // An empty chunk must not divide by zero.
        assert_eq!(make_text_chunk("").average_symbol_width(), 0.0);
    }

    #[test]
    fn test_symbol_coordinates() {
        let mut chunk = make_text_chunk("abc");
        chunk.symbol_ends = vec![30.0, 60.0, 100.0];

        assert_eq!(chunk.symbol_start_coordinate(0), chunk.bbox.left_x);
        assert_eq!(chunk.symbol_start_coordinate(1), 30.0);
        assert_eq!(chunk.symbol_start_coordinate(3), 100.0);
        // Past the recorded symbols the right edge of the box is used.
        assert_eq!(chunk.symbol_start_coordinate(4), chunk.bbox.right_x);

        assert_eq!(chunk.symbol_end_coordinate(0), 30.0);
        assert_eq!(chunk.symbol_end_coordinate(2), 100.0);
        assert_eq!(chunk.symbol_end_coordinate(3), chunk.bbox.right_x);
    }

    #[test]
    fn test_symbol_coordinates_without_symbol_ends() {
        let chunk = make_text_chunk("abc");
        assert_eq!(chunk.symbol_start_coordinate(0), chunk.bbox.left_x);
        assert_eq!(chunk.symbol_start_coordinate(1), chunk.bbox.right_x);
        assert_eq!(chunk.symbol_end_coordinate(0), chunk.bbox.right_x);
    }
}