edgeparse_core/models/
text.rs

1//! Text grouping types — TextLine, TextBlock, TextColumn.
2
3use serde::{Deserialize, Serialize};
4
5use super::bbox::BoundingBox;
6use super::chunks::{LineArtChunk, TextChunk};
7use super::enums::TextAlignment;
8
9/// A horizontal group of TextChunks sharing a baseline; `value()` rebuilds its text.
10#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct TextLine {
12    /// Bounding box of the whole line.
13    pub bbox: BoundingBox,
14    /// Global index; `None` when no index is set.
15    pub index: Option<u32>,
16    /// Nesting level; `None` when no level is set.
17    pub level: Option<String>,
18    /// Dominant font size of the line.
19    pub font_size: f64,
20    /// Baseline Y coordinate of the line.
21    pub base_line: f64,
22    /// Slant degree (unit and sign convention are not shown in this file).
23    pub slant_degree: f64,
24    /// Whether all text in the line is hidden.
25    pub is_hidden_text: bool,
26    /// Component text chunks, in the order `value()` concatenates them.
27    pub text_chunks: Vec<TextChunk>,
28    /// Whether this line starts a new paragraph.
29    pub is_line_start: bool,
30    /// Whether this line ends a paragraph.
31    pub is_line_end: bool,
32    /// Whether this line is part of a list.
33    pub is_list_line: bool,
34    /// Line art connected to this line (bullet marker), if any.
35    pub connected_line_art_label: Option<LineArtChunk>,
36}
37
38impl TextLine {
39    /// Concatenated text value of all chunks, inserting spaces between
40    /// chunks when a horizontal gap indicates a word boundary.
41    ///
42    /// Whitespace-only chunks are skipped (matching the reference processTextLines
43    /// which skips `isWhiteSpaceChunk()` chunks); word spaces are re-detected
44    /// from bounding-box gaps via `needs_space()`.
45    ///
46    /// For letter-spaced text (≥70% of chunks are single-character), an adaptive
47    /// gap threshold based on the median inter-chunk gap is used instead of the
48    /// fixed `fontSize * 0.17` rule. This correctly collapses text like
49    /// `"H O W  C A N"` into `"HOW CAN"`.
50    pub fn value(&self) -> String {
51        // Filter to non-whitespace, non-empty chunks (reference behaviour).
52        let real_chunks: Vec<&TextChunk> = self
53            .text_chunks
54            .iter()
55            .filter(|c| !c.value.is_empty() && !c.is_white_space_chunk())
56            .collect();
57
58        Self::concatenate_chunk_refs(&real_chunks)
59    }
60
61    /// Concatenate a slice of owned TextChunks using gap-based word boundary
62    /// detection.  Handles letter-spaced text with adaptive threshold.
63    ///
64    /// For multi-line content (e.g. table cells), chunks on different visual
65    /// lines are separated by spaces — detected via Y-position change.
66    pub fn concatenate_chunks(chunks: &[TextChunk]) -> String {
67        let filtered: Vec<&TextChunk> = chunks
68            .iter()
69            .filter(|c| !c.value.is_empty() && !c.is_white_space_chunk())
70            .collect();
71
72        if filtered.len() < 2 {
73            return Self::concatenate_chunk_refs(&filtered);
74        }
75
76        // Split into same-line groups based on Y position, then concatenate
77        // each group with gap-based logic and join groups with spaces.
78        let mut groups: Vec<Vec<&TextChunk>> = Vec::new();
79        let mut current_group: Vec<&TextChunk> = vec![filtered[0]];
80
81        for i in 1..filtered.len() {
82            let prev = filtered[i - 1];
83            let curr = filtered[i];
84            let y_diff = (curr.bbox.top_y - prev.bbox.top_y).abs();
85            let font_size = prev.font_size.max(curr.font_size).max(1.0);
86            // If Y changes by more than half the font size, it's a new visual line.
87            if y_diff > font_size * 0.5 {
88                groups.push(std::mem::take(&mut current_group));
89                current_group = vec![curr];
90            } else {
91                current_group.push(curr);
92            }
93        }
94        groups.push(current_group);
95
96        if groups.len() == 1 {
97            return Self::concatenate_chunk_refs(&groups[0]);
98        }
99
100        // Concatenate each group separately and join with spaces.
101        groups
102            .iter()
103            .map(|g| Self::concatenate_chunk_refs(g))
104            .filter(|s| !s.is_empty())
105            .collect::<Vec<_>>()
106            .join(" ")
107    }
108
109    /// Core gap-based concatenation logic for a pre-ordered slice of chunk refs.
110    fn concatenate_chunk_refs(real_chunks: &[&TextChunk]) -> String {
111        if real_chunks.is_empty() {
112            return String::new();
113        }
114        if real_chunks.len() == 1 {
115            return Self::collapse_letter_spaced(&real_chunks[0].value);
116        }
117
118        // Detect letter-spaced lines: ≥70% of chunks are single characters
119        // and there are at least 5 chunks.
120        let adaptive_threshold = if real_chunks.len() >= 5 {
121            let single_char_count = real_chunks
122                .iter()
123                .filter(|c| c.value.chars().count() == 1)
124                .count();
125            if single_char_count * 10 >= real_chunks.len() * 7 {
126                // Compute median positive gap to determine the typical letter-spacing.
127                let mut gaps: Vec<f64> = Vec::new();
128                for i in 1..real_chunks.len() {
129                    let gap = real_chunks[i].bbox.left_x - real_chunks[i - 1].bbox.right_x;
130                    if gap > 0.0 {
131                        gaps.push(gap);
132                    }
133                }
134                if gaps.len() >= 3 {
135                    gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
136                    let median = gaps[gaps.len() / 2];
137                    Some(median * 1.8)
138                } else {
139                    // Too few gaps to compute median; fall back to collapsing all
140                    Some(f64::MAX)
141                }
142            } else {
143                None
144            }
145        } else {
146            None
147        };
148
149        let mut result = String::with_capacity(
150            real_chunks.iter().map(|c| c.value.len()).sum::<usize>() + real_chunks.len(),
151        );
152        result.push_str(&real_chunks[0].value);
153
154        for i in 1..real_chunks.len() {
155            let prev = real_chunks[i - 1];
156            let curr = real_chunks[i];
157
158            if let Some(threshold) = adaptive_threshold {
159                // For letter-spaced lines, only insert a space when the gap
160                // is significantly larger than the typical letter spacing.
161                let gap = curr.bbox.left_x - prev.bbox.right_x;
162                if gap > threshold {
163                    result.push(' ');
164                }
165            } else if Self::needs_space(prev, curr) {
166                result.push(' ');
167            }
168            result.push_str(&curr.value);
169        }
170        result
171    }
172
173    /// Determine if a space is needed between two adjacent chunks.
174    /// Uses `fontSize * 0.17` threshold (TEXT_LINE_SPACE_RATIO constant).
175    fn needs_space(prev: &super::chunks::TextChunk, curr: &super::chunks::TextChunk) -> bool {
176        // If either already has boundary whitespace, skip
177        if prev.value.ends_with(' ') || curr.value.starts_with(' ') {
178            return false;
179        }
180        // If either is empty, no space needed
181        if prev.value.is_empty() || curr.value.is_empty() {
182            return false;
183        }
184
185        let gap = curr.bbox.left_x - prev.bbox.right_x;
186
187        // If overlapping or touching, no space
188        if gap <= 0.0 {
189            return false;
190        }
191
192        // TEXT_LINE_SPACE_RATIO = 0.17.  After the pre-merge step
193        // (merge_close_text_chunks), adjacent same-style fragments with small
194        // gaps have been unified.  Remaining gaps represent actual word
195        // boundaries or style changes, so 0.17 works correctly on bounding-box
196        // coordinates.
197        let font_size = prev.font_size.max(curr.font_size).max(1.0);
198        let threshold = font_size * 0.17;
199
200        gap > threshold
201    }
202
203    /// Collapse letter-spaced text within a single string.
204    ///
205    /// Detects strings where ≥60% of space-separated tokens are single
206    /// alphabetic characters (min 4). Consecutive single-char tokens are
207    /// joined; double spaces and multi-char tokens act as word boundaries.
208    fn collapse_letter_spaced(text: &str) -> String {
209        let tokens: Vec<&str> = text.split(' ').collect();
210        if tokens.len() < 5 {
211            return text.to_string();
212        }
213
214        let non_empty: Vec<&str> = tokens.iter().copied().filter(|t| !t.is_empty()).collect();
215        if non_empty.len() < 4 {
216            return text.to_string();
217        }
218
219        let single_alpha = non_empty
220            .iter()
221            .filter(|t| {
222                let mut chars = t.chars();
223                matches!(chars.next(), Some(c) if c.is_alphabetic()) && chars.next().is_none()
224            })
225            .count();
226
227        if single_alpha < 4 || single_alpha * 10 < non_empty.len() * 6 {
228            return text.to_string();
229        }
230
231        let mut result = String::new();
232        for token in &tokens {
233            if token.is_empty() {
234                // Double space → word boundary.
235                if !result.is_empty() && !result.ends_with(' ') {
236                    result.push(' ');
237                }
238                continue;
239            }
240            let is_single_alpha = {
241                let mut chars = token.chars();
242                matches!(chars.next(), Some(c) if c.is_alphabetic()) && chars.next().is_none()
243            };
244            if is_single_alpha {
245                result.push_str(token);
246            } else {
247                if !result.is_empty() && !result.ends_with(' ') {
248                    result.push(' ');
249                }
250                result.push_str(token);
251            }
252        }
253        result.trim().to_string()
254    }
255
256    /// Number of text chunks in this line.
257    pub fn chunk_count(&self) -> usize {
258        self.text_chunks.len()
259    }
260}
261
262/// A vertical group of TextLines forming a text block; `value()` joins their text.
263#[derive(Debug, Clone, Serialize, Deserialize)]
264pub struct TextBlock {
265    /// Bounding box of the whole block.
266    pub bbox: BoundingBox,
267    /// Global index; `None` when no index is set.
268    pub index: Option<u32>,
269    /// Nesting level; `None` when no level is set.
270    pub level: Option<String>,
271    /// Dominant font size of the block.
272    pub font_size: f64,
273    /// Baseline Y coordinate.
274    pub base_line: f64,
275    /// Slant degree (unit and sign convention are not shown in this file).
276    pub slant_degree: f64,
277    /// Whether all text in the block is hidden.
278    pub is_hidden_text: bool,
279    /// Component text lines, in the order `value()` joins them.
280    pub text_lines: Vec<TextLine>,
281    /// Whether the block starts with a new paragraph.
282    pub has_start_line: bool,
283    /// Whether the block ends a paragraph.
284    pub has_end_line: bool,
285    /// Detected text alignment, if any.
286    pub text_alignment: Option<TextAlignment>,
287}
288
289impl TextBlock {
290    /// Concatenated text value of all lines.
291    ///
292    /// Joins lines with spaces, handling end-of-line hyphenation by removing
293    /// the trailing hyphen and joining the word directly.
294    pub fn value(&self) -> String {
295        let line_values: Vec<String> = self.text_lines.iter().map(|l| l.value()).collect();
296        if line_values.is_empty() {
297            return String::new();
298        }
299
300        let mut result = String::new();
301        for (i, line) in line_values.iter().enumerate() {
302            let trimmed = line.trim_end();
303            if i > 0 {
304                // If the previous line ended with a hyphen, remove it and join directly
305                if result.ends_with('-') {
306                    // Check it's a real hyphenation (lowercase letter before hyphen)
307                    let before_hyphen = result[..result.len() - 1].chars().last();
308                    if before_hyphen.is_some_and(|c| c.is_alphabetic()) {
309                        result.pop(); // Remove the hyphen
310                                      // Don't add a space — the word continues
311                    } else {
312                        result.push(' ');
313                    }
314                } else {
315                    result.push(' ');
316                }
317            }
318            result.push_str(trimmed);
319        }
320        result
321    }
322
323    /// Total number of lines.
324    pub fn lines_count(&self) -> usize {
325        self.text_lines.len()
326    }
327}
328
329/// A vertical group of TextBlocks; `value()` joins their text with newlines.
330#[derive(Debug, Clone, Serialize, Deserialize)]
331pub struct TextColumn {
332    /// Bounding box of the whole column.
333    pub bbox: BoundingBox,
334    /// Global index; `None` when no index is set.
335    pub index: Option<u32>,
336    /// Nesting level; `None` when no level is set.
337    pub level: Option<String>,
338    /// Dominant font size of the column.
339    pub font_size: f64,
340    /// Baseline Y coordinate.
341    pub base_line: f64,
342    /// Slant degree (unit and sign convention are not shown in this file).
343    pub slant_degree: f64,
344    /// Whether all text in the column is hidden.
345    pub is_hidden_text: bool,
346    /// Component text blocks, in the order `value()` joins them.
347    pub text_blocks: Vec<TextBlock>,
348}
349
350impl TextColumn {
351    /// Concatenated text value of all blocks.
352    pub fn value(&self) -> String {
353        self.text_blocks
354            .iter()
355            .map(|b| b.value())
356            .collect::<Vec<_>>()
357            .join("\n")
358    }
359}
360
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::chunks::TextChunk;
    use crate::models::enums::{PdfLayer, TextFormat, TextType};

    /// Bounding box used for both the fixture line and its only chunk.
    fn fixture_bbox() -> BoundingBox {
        BoundingBox::new(Some(1), 0.0, 0.0, 100.0, 12.0)
    }

    /// Builds a plain 12pt Helvetica chunk on page 1 holding `text`.
    fn make_chunk(text: &str) -> TextChunk {
        TextChunk {
            value: text.to_string(),
            bbox: fixture_bbox(),
            font_name: "Helvetica".to_string(),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: "#000000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: vec![],
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        }
    }

    /// Builds a line whose single chunk holds `text`.
    fn make_text_line(text: &str) -> TextLine {
        TextLine {
            bbox: fixture_bbox(),
            index: None,
            level: None,
            font_size: 12.0,
            base_line: 2.0,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_chunks: vec![make_chunk(text)],
            is_line_start: false,
            is_line_end: false,
            is_list_line: false,
            connected_line_art_label: None,
        }
    }

    /// A single ordinary chunk is returned verbatim and counted once.
    #[test]
    fn test_text_line_value() {
        let line = make_text_line("Hello World");
        let text = line.value();
        assert_eq!(text, "Hello World");
        assert_eq!(line.chunk_count(), 1);
    }
}
edgeparse_core/models/text.rs

edgeparse_core/models/
text.rs