1use serde::{Deserialize, Serialize};
4
5use super::bbox::BoundingBox;
6use super::chunks::{LineArtChunk, TextChunk};
7use super::enums::TextAlignment;
8
/// A single line of text together with the chunks it is built from.
///
/// The textual content is not stored directly; it is derived from
/// `text_chunks` on demand (see [`TextLine::value`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextLine {
    /// Bounding box of the line.
    pub bbox: BoundingBox,
    /// Optional index assigned to the line; `None` when unset.
    pub index: Option<u32>,
    /// Optional level label; its meaning is defined by the producer and is
    /// not visible in this file.
    pub level: Option<String>,
    /// Font size recorded for the line.
    pub font_size: f64,
    /// Baseline value recorded for the line (coordinate convention not
    /// visible in this file).
    pub base_line: f64,
    /// Slant of the line; presumably in degrees, going by the field name.
    pub slant_degree: f64,
    /// Flag marking the line as hidden text.
    pub is_hidden_text: bool,
    /// Chunks that make up the line, in the order they are concatenated.
    pub text_chunks: Vec<TextChunk>,
    /// Flag; presumably set when this line starts a logical run of lines
    /// (not read anywhere in this file — confirm against the producer).
    pub is_line_start: bool,
    /// Flag; presumably set when this line ends a logical run of lines
    /// (not read anywhere in this file — confirm against the producer).
    pub is_line_end: bool,
    /// Flag marking the line as belonging to a list (set elsewhere).
    pub is_list_line: bool,
    /// Optional line-art chunk attached to this line as a label (set elsewhere).
    pub connected_line_art_label: Option<LineArtChunk>,
}
37
38impl TextLine {
39 pub fn value(&self) -> String {
51 let real_chunks: Vec<&TextChunk> = self
53 .text_chunks
54 .iter()
55 .filter(|c| !c.value.is_empty() && !c.is_white_space_chunk())
56 .collect();
57
58 Self::concatenate_chunk_refs(&real_chunks)
59 }
60
61 pub fn concatenate_chunks(chunks: &[TextChunk]) -> String {
67 let filtered: Vec<&TextChunk> = chunks
68 .iter()
69 .filter(|c| !c.value.is_empty() && !c.is_white_space_chunk())
70 .collect();
71
72 if filtered.len() < 2 {
73 return Self::concatenate_chunk_refs(&filtered);
74 }
75
76 let mut groups: Vec<Vec<&TextChunk>> = Vec::new();
79 let mut current_group: Vec<&TextChunk> = vec![filtered[0]];
80
81 for i in 1..filtered.len() {
82 let prev = filtered[i - 1];
83 let curr = filtered[i];
84 let y_diff = (curr.bbox.top_y - prev.bbox.top_y).abs();
85 let font_size = prev.font_size.max(curr.font_size).max(1.0);
86 if y_diff > font_size * 0.5 {
88 groups.push(std::mem::take(&mut current_group));
89 current_group = vec![curr];
90 } else {
91 current_group.push(curr);
92 }
93 }
94 groups.push(current_group);
95
96 if groups.len() == 1 {
97 return Self::concatenate_chunk_refs(&groups[0]);
98 }
99
100 groups
102 .iter()
103 .map(|g| Self::concatenate_chunk_refs(g))
104 .filter(|s| !s.is_empty())
105 .collect::<Vec<_>>()
106 .join(" ")
107 }
108
109 fn concatenate_chunk_refs(real_chunks: &[&TextChunk]) -> String {
111 if real_chunks.is_empty() {
112 return String::new();
113 }
114 if real_chunks.len() == 1 {
115 return Self::collapse_letter_spaced(&real_chunks[0].value);
116 }
117
118 let adaptive_threshold = if real_chunks.len() >= 5 {
121 let single_char_count = real_chunks
122 .iter()
123 .filter(|c| c.value.chars().count() == 1)
124 .count();
125 if single_char_count * 10 >= real_chunks.len() * 7 {
126 let mut gaps: Vec<f64> = Vec::new();
128 for i in 1..real_chunks.len() {
129 let gap = real_chunks[i].bbox.left_x - real_chunks[i - 1].bbox.right_x;
130 if gap > 0.0 {
131 gaps.push(gap);
132 }
133 }
134 if gaps.len() >= 3 {
135 gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
136 let median = gaps[gaps.len() / 2];
137 Some(median * 1.8)
138 } else {
139 Some(f64::MAX)
141 }
142 } else {
143 None
144 }
145 } else {
146 None
147 };
148
149 let mut result = String::with_capacity(
150 real_chunks.iter().map(|c| c.value.len()).sum::<usize>() + real_chunks.len(),
151 );
152 result.push_str(&real_chunks[0].value);
153
154 for i in 1..real_chunks.len() {
155 let prev = real_chunks[i - 1];
156 let curr = real_chunks[i];
157
158 if let Some(threshold) = adaptive_threshold {
159 let gap = curr.bbox.left_x - prev.bbox.right_x;
162 if gap > threshold {
163 result.push(' ');
164 }
165 } else if Self::needs_space(prev, curr) {
166 result.push(' ');
167 }
168 result.push_str(&curr.value);
169 }
170 result
171 }
172
173 fn needs_space(prev: &super::chunks::TextChunk, curr: &super::chunks::TextChunk) -> bool {
176 if prev.value.ends_with(' ') || curr.value.starts_with(' ') {
178 return false;
179 }
180 if prev.value.is_empty() || curr.value.is_empty() {
182 return false;
183 }
184
185 let gap = curr.bbox.left_x - prev.bbox.right_x;
186
187 if gap <= 0.0 {
189 return false;
190 }
191
192 let font_size = prev.font_size.max(curr.font_size).max(1.0);
198 let threshold = font_size * 0.17;
199
200 gap > threshold
201 }
202
203 fn collapse_letter_spaced(text: &str) -> String {
209 let tokens: Vec<&str> = text.split(' ').collect();
210 if tokens.len() < 5 {
211 return text.to_string();
212 }
213
214 let non_empty: Vec<&str> = tokens.iter().copied().filter(|t| !t.is_empty()).collect();
215 if non_empty.len() < 4 {
216 return text.to_string();
217 }
218
219 let single_alpha = non_empty
220 .iter()
221 .filter(|t| {
222 let mut chars = t.chars();
223 matches!(chars.next(), Some(c) if c.is_alphabetic()) && chars.next().is_none()
224 })
225 .count();
226
227 if single_alpha < 4 || single_alpha * 10 < non_empty.len() * 6 {
228 return text.to_string();
229 }
230
231 let mut result = String::new();
232 for token in &tokens {
233 if token.is_empty() {
234 if !result.is_empty() && !result.ends_with(' ') {
236 result.push(' ');
237 }
238 continue;
239 }
240 let is_single_alpha = {
241 let mut chars = token.chars();
242 matches!(chars.next(), Some(c) if c.is_alphabetic()) && chars.next().is_none()
243 };
244 if is_single_alpha {
245 result.push_str(token);
246 } else {
247 if !result.is_empty() && !result.ends_with(' ') {
248 result.push(' ');
249 }
250 result.push_str(token);
251 }
252 }
253 result.trim().to_string()
254 }
255
    /// Returns the number of chunks stored in `text_chunks`.
    ///
    /// This is the raw length of the vector; unlike [`TextLine::value`], no
    /// empty or whitespace chunks are filtered out.
    pub fn chunk_count(&self) -> usize {
        self.text_chunks.len()
    }
260}
261
/// A block of text made up of consecutive [`TextLine`]s.
///
/// The textual content is derived from `text_lines` on demand
/// (see [`TextBlock::value`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextBlock {
    /// Bounding box of the block.
    pub bbox: BoundingBox,
    /// Optional index assigned to the block; `None` when unset.
    pub index: Option<u32>,
    /// Optional level label; its meaning is defined by the producer and is
    /// not visible in this file.
    pub level: Option<String>,
    /// Font size recorded for the block.
    pub font_size: f64,
    /// Baseline value recorded for the block (coordinate convention not
    /// visible in this file).
    pub base_line: f64,
    /// Slant of the block; presumably in degrees, going by the field name.
    pub slant_degree: f64,
    /// Flag marking the block as hidden text.
    pub is_hidden_text: bool,
    /// Lines that make up the block, in the order they are joined.
    pub text_lines: Vec<TextLine>,
    /// Flag; presumably indicates the block contains its starting line
    /// (not read anywhere in this file — confirm against the producer).
    pub has_start_line: bool,
    /// Flag; presumably indicates the block contains its ending line
    /// (not read anywhere in this file — confirm against the producer).
    pub has_end_line: bool,
    /// Optional alignment of the block's text; `None` when unset.
    pub text_alignment: Option<TextAlignment>,
}
288
289impl TextBlock {
290 pub fn value(&self) -> String {
295 let line_values: Vec<String> = self.text_lines.iter().map(|l| l.value()).collect();
296 if line_values.is_empty() {
297 return String::new();
298 }
299
300 let mut result = String::new();
301 for (i, line) in line_values.iter().enumerate() {
302 let trimmed = line.trim_end();
303 if i > 0 {
304 if result.ends_with('-') {
306 let before_hyphen = result[..result.len() - 1].chars().last();
308 if before_hyphen.is_some_and(|c| c.is_alphabetic()) {
309 result.pop(); } else {
312 result.push(' ');
313 }
314 } else {
315 result.push(' ');
316 }
317 }
318 result.push_str(trimmed);
319 }
320 result
321 }
322
    /// Returns the number of lines stored in `text_lines`, including lines
    /// whose text value is empty.
    pub fn lines_count(&self) -> usize {
        self.text_lines.len()
    }
327}
328
/// A column of text made up of [`TextBlock`]s.
///
/// The textual content is derived from `text_blocks` on demand
/// (see [`TextColumn::value`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TextColumn {
    /// Bounding box of the column.
    pub bbox: BoundingBox,
    /// Optional index assigned to the column; `None` when unset.
    pub index: Option<u32>,
    /// Optional level label; its meaning is defined by the producer and is
    /// not visible in this file.
    pub level: Option<String>,
    /// Font size recorded for the column.
    pub font_size: f64,
    /// Baseline value recorded for the column (coordinate convention not
    /// visible in this file).
    pub base_line: f64,
    /// Slant of the column; presumably in degrees, going by the field name.
    pub slant_degree: f64,
    /// Flag marking the column as hidden text.
    pub is_hidden_text: bool,
    /// Blocks that make up the column, in the order they are joined.
    pub text_blocks: Vec<TextBlock>,
}
349
350impl TextColumn {
351 pub fn value(&self) -> String {
353 self.text_blocks
354 .iter()
355 .map(|b| b.value())
356 .collect::<Vec<_>>()
357 .join("\n")
358 }
359}
360
// Unit tests for the text model types defined in this file.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::chunks::TextChunk;
    use crate::models::enums::{PdfLayer, TextFormat, TextType};

    /// Builds a `TextLine` holding exactly one chunk whose value is `text`.
    ///
    /// The line and its chunk share the same bounding box and a 12.0 font
    /// size; every optional field is left as `None` and every flag on the
    /// line is `false`.
    fn make_text_line(text: &str) -> TextLine {
        TextLine {
            bbox: BoundingBox::new(Some(1), 0.0, 0.0, 100.0, 12.0),
            index: None,
            level: None,
            font_size: 12.0,
            base_line: 2.0,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_chunks: vec![TextChunk {
                value: text.to_string(),
                bbox: BoundingBox::new(Some(1), 0.0, 0.0, 100.0, 12.0),
                font_name: "Helvetica".to_string(),
                font_size: 12.0,
                font_weight: 400.0,
                italic_angle: 0.0,
                font_color: "#000000".to_string(),
                contrast_ratio: 21.0,
                symbol_ends: vec![],
                text_format: TextFormat::Normal,
                text_type: TextType::Regular,
                pdf_layer: PdfLayer::Main,
                ocg_visible: true,
                index: None,
                page_number: Some(1),
                level: None,
                mcid: None,
            }],
            is_line_start: false,
            is_line_end: false,
            is_list_line: false,
            connected_line_art_label: None,
        }
    }

    /// A single ordinary chunk is returned verbatim: the text has too few
    /// tokens to be treated as letter-spaced, so no collapsing occurs.
    #[test]
    fn test_text_line_value() {
        let line = make_text_line("Hello World");
        assert_eq!(line.value(), "Hello World");
        assert_eq!(line.chunk_count(), 1);
    }
}