html_to_markdown_rs/hocr/
spatial.rs

1//! Spatial table reconstruction from hOCR bounding box coordinates
2//!
3//! This module provides functions to detect and reconstruct tabular data from OCR'd text
4//! by analyzing the spatial positions of words using their bounding box (bbox) coordinates.
5
/// A single OCR'd word together with its bounding box and recognition confidence.
///
/// The box is stored as an origin (`left`, `top`) plus an extent (`width`, `height`),
/// all in the pixel coordinate space used by the hOCR `bbox` property.
#[derive(Debug, Clone)]
pub struct HocrWord {
    /// Text content of the word.
    pub text: String,
    /// X coordinate of the left edge.
    pub left: u32,
    /// Y coordinate of the top edge.
    pub top: u32,
    /// Horizontal extent of the box.
    pub width: u32,
    /// Vertical extent of the box.
    pub height: u32,
    /// Recognition confidence for the word.
    pub confidence: f64,
}

impl HocrWord {
    /// X coordinate of the right edge (`left + width`).
    ///
    /// The fields are public, so the sum is saturating: an out-of-range box clamps to
    /// `u32::MAX` instead of panicking (debug) or wrapping around (release).
    pub fn right(&self) -> u32 {
        self.left.saturating_add(self.width)
    }

    /// Y coordinate of the bottom edge (`top + height`), saturating at `u32::MAX`.
    pub fn bottom(&self) -> u32 {
        self.top.saturating_add(self.height)
    }

    /// Vertical midpoint of the box.
    pub fn y_center(&self) -> f64 {
        f64::from(self.top) + f64::from(self.height) / 2.0
    }

    /// Horizontal midpoint of the box.
    pub fn x_center(&self) -> f64 {
        f64::from(self.left) + f64::from(self.width) / 2.0
    }
}
38
/// Parse the `bbox` property from an hOCR `title` attribute.
///
/// The title is a `;`-separated list of properties, each a name followed by
/// whitespace-separated values. The first `bbox` property carrying exactly four
/// unsigned integers `x1 y1 x2 y2` is returned as `(left, top, width, height)`.
/// Inverted boxes (`x2 < x1` or `y2 < y1`) yield a zero extent rather than an error.
///
/// Example: "bbox 100 50 180 80; x_wconf 95" -> (100, 50, 80, 30)
///
/// When `debug` is set, every property whose name is not in the known list is
/// reported on stderr. Returns `None` if no well-formed `bbox` property is present.
fn parse_bbox(title: &str, debug: bool) -> Option<(u32, u32, u32, u32)> {
    const KNOWN_ATTRIBUTES: [&str; 12] = [
        "bbox",
        "x_wconf",
        "baseline",
        "x_size",
        "x_descenders",
        "x_ascenders",
        "textangle",
        "poly",
        "order",
        "x_font",
        "x_fsize",
        "x_confs",
    ];

    for part in title.split(';') {
        // Tokenise on any whitespace so tabs or line breaks inside the attribute
        // are treated the same as plain spaces.
        let mut tokens = part.split_whitespace();
        let Some(attr_name) = tokens.next() else {
            continue;
        };

        // Compare the whole property name: a prefix test would silently accept
        // names such as `bboxes` or `polygon`.
        if debug && !KNOWN_ATTRIBUTES.contains(&attr_name) {
            eprintln!("[hOCR] Info: Found unknown title attribute: '{}'", attr_name);
        }

        if attr_name != "bbox" {
            continue;
        }

        let Ok(coords) = tokens.map(str::parse::<u32>).collect::<Result<Vec<_>, _>>() else {
            continue;
        };

        if let &[x1, y1, x2, y2] = coords.as_slice() {
            return Some((x1, y1, x2.saturating_sub(x1), y2.saturating_sub(y1)));
        }
    }

    None
}
86
/// Parse the word confidence (`x_wconf`) from an hOCR `title` attribute.
///
/// The title is a `;`-separated list of properties. The first `x_wconf` property
/// whose single value parses as a finite number is returned.
///
/// Example: "bbox 100 50 180 80; x_wconf 95" -> 95.0
///
/// Returns `0.0` when no usable `x_wconf` property is present. Non-finite values
/// (`NaN`, `inf`) are treated as unusable, since they would make any later
/// comparison against a confidence threshold meaningless.
fn parse_confidence(title: &str) -> f64 {
    for part in title.split(';') {
        // Tokenise on any whitespace so a tab after the property name is accepted.
        let mut tokens = part.split_whitespace();
        if tokens.next() != Some("x_wconf") {
            continue;
        }

        // Exactly one value is expected; anything else is skipped.
        let (Some(value), None) = (tokens.next(), tokens.next()) else {
            continue;
        };

        match value.parse::<f64>() {
            Ok(conf) if conf.is_finite() => return conf,
            _ => {}
        }
    }

    0.0
}
101
102/// Extract text content from a node
103fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
104    let mut text = String::new();
105
106    if let Some(node) = node_handle.get(parser) {
107        match node {
108            tl::Node::Raw(bytes) => {
109                text.push_str(&bytes.as_utf8_str());
110            }
111            tl::Node::Tag(tag) => {
112                let children = tag.children();
113                for child_handle in children.top().iter() {
114                    text.push_str(&get_text_content(child_handle, parser));
115                }
116            }
117            tl::Node::Comment(_) => {}
118        }
119    }
120
121    text
122}
123
124/// Extract hOCR words from a DOM tree
125///
126/// Walks the DOM and extracts all elements with `ocrx_word` class,
127/// parsing their bbox and confidence information.
128pub fn extract_hocr_words(
129    node_handle: &tl::NodeHandle,
130    parser: &tl::Parser,
131    min_confidence: f64,
132    debug: bool,
133) -> Vec<HocrWord> {
134    let mut words = Vec::new();
135
136    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
137        let tag_name = tag.name().as_utf8_str();
138        let attrs = tag.attributes();
139
140        let class_attr = attrs.get("class").flatten().map(|v| v.as_utf8_str().to_string());
141
142        if let Some(ref classes) = class_attr {
143            let known_classes = [
144                "ocr_page",
145                "ocr_carea",
146                "ocr_par",
147                "ocr_line",
148                "ocrx_word",
149                "ocr_header",
150                "ocr_footer",
151                "ocr_table",
152                "ocr_caption",
153                "ocr_textfloat",
154                "ocr_separator",
155                "ocr_noise",
156            ];
157
158            let class_list: Vec<&str> = classes.split_whitespace().collect();
159            let has_ocr_class = class_list.iter().any(|c| c.starts_with("ocr"));
160
161            if has_ocr_class && debug {
162                for class in &class_list {
163                    if class.starts_with("ocr") && !known_classes.contains(class) {
164                        eprintln!("[hOCR] Info: Found unhandled hOCR class '{}' on <{}>", class, tag_name);
165                    }
166                }
167            }
168        }
169
170        if tag_name == "span" {
171            let is_word = class_attr.as_ref().is_some_and(|c| c.contains("ocrx_word"));
172            let title = attrs.get("title").flatten().map(|v| v.as_utf8_str());
173
174            if is_word {
175                let title_str = title.as_deref().unwrap_or("");
176                if let Some((left, top, width, height)) = parse_bbox(title_str, debug) {
177                    let confidence = parse_confidence(title_str);
178
179                    if confidence >= min_confidence {
180                        let text = get_text_content(node_handle, parser).trim().to_string();
181
182                        if !text.is_empty() {
183                            words.push(HocrWord {
184                                text,
185                                left,
186                                top,
187                                width,
188                                height,
189                                confidence,
190                            });
191                        } else if debug {
192                            eprintln!(
193                                "[hOCR] Warning: ocrx_word element has no text content (bbox: {})",
194                                title_str
195                            );
196                        }
197                    } else if debug {
198                        eprintln!(
199                            "[hOCR] Warning: Word confidence ({:.1}) below threshold ({:.1}): {}",
200                            confidence,
201                            min_confidence,
202                            get_text_content(node_handle, parser).trim()
203                        );
204                    }
205                } else if debug {
206                    let text = get_text_content(node_handle, parser);
207                    let trimmed = text.trim();
208                    eprintln!(
209                        "[hOCR] Warning: Failed to parse bbox for ocrx_word element: {} (title: {})",
210                        if trimmed.is_empty() { "<empty>" } else { trimmed },
211                        title_str
212                    );
213                }
214            }
215        }
216
217        let children = tag.children();
218        for child_handle in children.top().iter() {
219            words.extend(extract_hocr_words(child_handle, parser, min_confidence, debug));
220        }
221    }
222
223    words
224}
225
226/// Detect column positions from word positions
227///
228/// Groups words by their x-position and returns the median x-position
229/// for each detected column.
230///
231/// Optimized with O(n log n) complexity using sorted insertion.
232pub fn detect_columns(words: &[HocrWord], column_threshold: u32) -> Vec<u32> {
233    if words.is_empty() {
234        return Vec::new();
235    }
236
237    let mut x_positions: Vec<u32> = words.iter().map(|w| w.left).collect();
238    x_positions.sort_unstable();
239
240    let mut position_groups: Vec<Vec<u32>> = Vec::new();
241    let mut current_group = vec![x_positions[0]];
242
243    for &x_pos in &x_positions[1..] {
244        let matches_group = current_group.iter().any(|&pos| x_pos.abs_diff(pos) <= column_threshold);
245
246        if matches_group {
247            current_group.push(x_pos);
248        } else {
249            position_groups.push(std::mem::replace(&mut current_group, vec![x_pos]));
250        }
251    }
252
253    if !current_group.is_empty() {
254        position_groups.push(current_group);
255    }
256
257    let mut columns: Vec<u32> = position_groups
258        .iter()
259        .map(|group| {
260            let mid = group.len() / 2;
261            group[mid]
262        })
263        .collect();
264
265    columns.sort_unstable();
266    columns
267}
268
269/// Detect row positions from word positions
270///
271/// Groups words by their vertical center position and returns the median
272/// y-position for each detected row.
273///
274/// Optimized with O(n log n) complexity using sorted insertion.
275pub fn detect_rows(words: &[HocrWord], row_threshold_ratio: f64) -> Vec<u32> {
276    if words.is_empty() {
277        return Vec::new();
278    }
279
280    let mut heights: Vec<u32> = words.iter().map(|w| w.height).collect();
281    heights.sort_unstable();
282    let median_height = heights[heights.len() / 2];
283    let row_threshold = median_height as f64 * row_threshold_ratio;
284
285    let mut y_centers: Vec<f64> = words.iter().map(|w| w.y_center()).collect();
286    y_centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
287
288    let mut position_groups: Vec<Vec<f64>> = Vec::new();
289    let mut current_group = vec![y_centers[0]];
290
291    for &y_center in &y_centers[1..] {
292        let matches_group = current_group.iter().any(|&pos| (y_center - pos).abs() <= row_threshold);
293
294        if matches_group {
295            current_group.push(y_center);
296        } else {
297            position_groups.push(std::mem::replace(&mut current_group, vec![y_center]));
298        }
299    }
300
301    if !current_group.is_empty() {
302        position_groups.push(current_group);
303    }
304
305    let mut rows: Vec<u32> = position_groups
306        .iter()
307        .map(|group| {
308            let mid = group.len() / 2;
309            group[mid] as u32
310        })
311        .collect();
312
313    rows.sort_unstable();
314    rows
315}
316
317/// Reconstruct table structure from words
318///
319/// Takes detected words and reconstructs a 2D table by:
320/// 1. Detecting column and row positions
321/// 2. Assigning words to cells based on position
322/// 3. Combining words within the same cell
323pub fn reconstruct_table(
324    words: &[HocrWord],
325    column_threshold: u32,
326    row_threshold_ratio: f64,
327    debug: bool,
328) -> Vec<Vec<String>> {
329    if words.is_empty() {
330        if debug {
331            eprintln!("[hOCR] Warning: No words to reconstruct table from");
332        }
333        return Vec::new();
334    }
335
336    let col_positions = detect_columns(words, column_threshold);
337    let row_positions = detect_rows(words, row_threshold_ratio);
338
339    if col_positions.is_empty() || row_positions.is_empty() {
340        if debug {
341            eprintln!(
342                "[hOCR] Warning: Could not detect table structure (columns: {}, rows: {})",
343                col_positions.len(),
344                row_positions.len()
345            );
346        }
347        return Vec::new();
348    }
349
350    if debug {
351        eprintln!(
352            "[hOCR] Detected table structure: {} rows × {} columns",
353            row_positions.len(),
354            col_positions.len()
355        );
356    }
357
358    let num_rows = row_positions.len();
359    let num_cols = col_positions.len();
360    let mut table: Vec<Vec<Vec<String>>> = vec![vec![vec![]; num_cols]; num_rows];
361    let mut unassigned_words = 0;
362
363    for word in words {
364        if let (Some(r), Some(c)) = (
365            find_row_index(&row_positions, word),
366            find_column_index(&col_positions, word),
367        ) {
368            if r < num_rows && c < num_cols {
369                table[r][c].push(word.text.clone());
370            } else {
371                unassigned_words += 1;
372                if debug {
373                    eprintln!(
374                        "[hOCR] Warning: Word '{}' assigned to out-of-bounds cell ({}, {})",
375                        word.text, r, c
376                    );
377                }
378            }
379        } else {
380            unassigned_words += 1;
381            if debug {
382                eprintln!(
383                    "[hOCR] Warning: Could not assign word '{}' to any cell (position: {}, {})",
384                    word.text, word.left, word.top
385                );
386            }
387        }
388    }
389
390    if debug && unassigned_words > 0 {
391        eprintln!(
392            "[hOCR] Warning: {} out of {} words could not be assigned to table cells",
393            unassigned_words,
394            words.len()
395        );
396    }
397
398    let result: Vec<Vec<String>> = table
399        .into_iter()
400        .map(|row| {
401            row.into_iter()
402                .map(|cell_words| {
403                    if cell_words.is_empty() {
404                        String::new()
405                    } else {
406                        cell_words.join(" ")
407                    }
408                })
409                .collect()
410        })
411        .collect();
412
413    remove_empty_rows_and_columns(result)
414}
415
416/// Find which row a word belongs to based on its y-center
417fn find_row_index(row_positions: &[u32], word: &HocrWord) -> Option<usize> {
418    let y_center = word.y_center() as u32;
419
420    row_positions
421        .iter()
422        .enumerate()
423        .min_by_key(|&(_, row_y)| row_y.abs_diff(y_center))
424        .map(|(idx, _)| idx)
425}
426
427/// Find which column a word belongs to based on its x-position
428fn find_column_index(col_positions: &[u32], word: &HocrWord) -> Option<usize> {
429    let x_pos = word.left;
430
431    col_positions
432        .iter()
433        .enumerate()
434        .min_by_key(|&(_, col_x)| col_x.abs_diff(x_pos))
435        .map(|(idx, _)| idx)
436}
437
/// Drop rows and columns whose cells are all empty or whitespace-only.
///
/// A column is kept if any row has a non-blank cell at that index; a row is kept if
/// any of its cells is non-blank. The column mask is sized from the widest row, so a
/// ragged table (rows of differing length) is handled instead of panicking on an
/// out-of-range index.
fn remove_empty_rows_and_columns(table: Vec<Vec<String>>) -> Vec<Vec<String>> {
    /// A cell counts as blank when nothing but whitespace is left in it.
    fn is_blank(cell: &str) -> bool {
        cell.trim().is_empty()
    }

    let widest = table.iter().map(Vec::len).max().unwrap_or(0);
    let mut keep_column = vec![false; widest];

    for row in &table {
        for (keep, cell) in keep_column.iter_mut().zip(row) {
            if !is_blank(cell) {
                *keep = true;
            }
        }
    }

    table
        .into_iter()
        .filter(|row| row.iter().any(|cell| !is_blank(cell)))
        .map(|row| {
            row.into_iter()
                .zip(&keep_column)
                .filter(|(_, keep)| **keep)
                .map(|(cell, _)| cell)
                .collect()
        })
        .collect()
}
467
/// Render a table as a Markdown pipe table.
///
/// The first row is treated as the header and is followed by a `| --- |` separator
/// with one entry per header cell. Every row ends with a newline.
///
/// Cell text is made safe for a single table line: `|` is escaped as `\|`, and each
/// line-break character (`\n`, `\r`) is replaced by a space, because a raw line
/// break inside a cell would split the row and corrupt the table.
///
/// Returns an empty string when the table has no rows or the first row has no cells.
pub fn table_to_markdown(table: &[Vec<String>]) -> String {
    /// Append `cell` to `out`, neutralising characters that would break the row.
    fn push_escaped_cell(out: &mut String, cell: &str) {
        for ch in cell.chars() {
            match ch {
                '|' => out.push_str("\\|"),
                '\n' | '\r' => out.push(' '),
                other => out.push(other),
            }
        }
    }

    let Some(header) = table.first() else {
        return String::new();
    };
    let num_cols = header.len();
    if num_cols == 0 {
        return String::new();
    }

    let mut markdown = String::new();

    for (row_idx, row) in table.iter().enumerate() {
        markdown.push('|');
        for cell in row {
            markdown.push(' ');
            push_escaped_cell(&mut markdown, cell);
            markdown.push_str(" |");
        }
        markdown.push('\n');

        // The separator goes directly below the header row.
        if row_idx == 0 {
            markdown.push('|');
            markdown.push_str(&" --- |".repeat(num_cols));
            markdown.push('\n');
        }
    }

    markdown
}
501
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a word fixture with the confidence (95.0) shared by most tests.
    fn word(text: &str, left: u32, top: u32, width: u32, height: u32) -> HocrWord {
        HocrWord {
            text: text.to_string(),
            left,
            top,
            width,
            height,
            confidence: 95.0,
        }
    }

    /// Parse an hOCR snippet and collect the words found under every top-level node.
    fn words_from_hocr(hocr: &str, min_confidence: f64) -> Vec<HocrWord> {
        let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
        let parser = dom.parser();

        let mut words = Vec::new();
        for child_handle in dom.children().iter() {
            words.extend(extract_hocr_words(child_handle, parser, min_confidence, false));
        }
        words
    }

    #[test]
    fn test_parse_bbox() {
        let cases = [
            ("bbox 100 50 180 80", Some((100, 50, 80, 30))),
            ("bbox 0 0 100 200", Some((0, 0, 100, 200))),
            ("bbox 100 50 180 80; x_wconf 95", Some((100, 50, 80, 30))),
            ("invalid", None),
            ("bbox 100 50", None),
        ];

        for (title, expected) in cases {
            assert_eq!(parse_bbox(title, false), expected, "title: {:?}", title);
        }
    }

    #[test]
    fn test_parse_confidence() {
        let cases = [
            ("x_wconf 95.5", 95.5),
            ("bbox 100 50 180 80; x_wconf 92", 92.0),
            ("invalid", 0.0),
        ];

        for (title, expected) in cases {
            assert_eq!(parse_confidence(title), expected, "title: {:?}", title);
        }
    }

    #[test]
    fn test_hocr_word_methods() {
        let hello = HocrWord {
            confidence: 95.5,
            ..word("Hello", 100, 50, 80, 30)
        };

        assert_eq!(hello.right(), 180);
        assert_eq!(hello.bottom(), 80);
        assert_eq!(hello.y_center(), 65.0);
        assert_eq!(hello.x_center(), 140.0);
    }

    #[test]
    fn test_detect_columns() {
        let words = vec![
            word("A", 100, 50, 20, 30),
            word("B", 200, 50, 20, 30),
            word("C", 105, 100, 20, 30),
        ];

        let columns = detect_columns(&words, 50);
        assert_eq!(columns.len(), 2);
        assert!(columns.contains(&100) || columns.contains(&105));
        assert!(columns.contains(&200));
    }

    #[test]
    fn test_table_to_markdown() {
        let table = vec![
            vec!["Header1".to_string(), "Header2".to_string()],
            vec!["Cell1".to_string(), "Cell2".to_string()],
        ];

        let markdown = table_to_markdown(&table);
        for expected_line in ["| Header1 | Header2 |", "| --- | --- |", "| Cell1 | Cell2 |"] {
            assert!(markdown.contains(expected_line), "missing line: {}", expected_line);
        }
    }

    #[test]
    fn test_table_to_markdown_escape_pipes() {
        let table = vec![vec!["A|B".to_string(), "C".to_string()]];

        let markdown = table_to_markdown(&table);
        assert!(markdown.contains("A\\|B"));
    }

    #[test]
    fn test_extract_hocr_words() {
        let words = words_from_hocr(
            r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">Hello</span>
                <span class="ocrx_word" title="bbox 160 50 210 80; x_wconf 92">World</span>
            </div>
        "#,
            0.0,
        );

        assert_eq!(words.len(), 2);

        let (hello, world) = (&words[0], &words[1]);
        assert_eq!(hello.text, "Hello");
        assert_eq!(hello.left, 100);
        assert_eq!(hello.confidence, 95.0);

        assert_eq!(world.text, "World");
        assert_eq!(world.left, 160);
        assert_eq!(world.confidence, 92.0);
    }

    #[test]
    fn test_extract_hocr_words_confidence_filter() {
        let words = words_from_hocr(
            r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">HighConf</span>
                <span class="ocrx_word" title="bbox 160 50 210 80; x_wconf 50">LowConf</span>
                <span class="ocrx_word" title="bbox 220 50 270 80; x_wconf 98">VeryHigh</span>
            </div>
        "#,
            90.0,
        );

        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "HighConf");
        assert_eq!(words[1].text, "VeryHigh");
    }

    #[test]
    fn test_reconstruct_simple_table() {
        let words = vec![
            word("Name", 100, 50, 50, 20),
            word("Age", 200, 50, 50, 20),
            word("Alice", 100, 100, 50, 20),
            word("30", 200, 100, 50, 20),
        ];

        let table = reconstruct_table(&words, 50, 0.5, false);

        assert_eq!(table.len(), 2);
        assert_eq!(table[0].len(), 2);
        assert_eq!(table[0][0], "Name");
        assert_eq!(table[0][1], "Age");
        assert_eq!(table[1][0], "Alice");
        assert_eq!(table[1][1], "30");
    }

    #[test]
    fn test_reconstruct_table_with_multi_word_cells() {
        let words = vec![
            word("First", 100, 50, 30, 20),
            word("Name", 135, 50, 30, 20),
            word("Last", 200, 50, 30, 20),
            word("Name", 235, 50, 30, 20),
        ];

        let table = reconstruct_table(&words, 50, 0.5, false);

        assert_eq!(table.len(), 1);
        assert_eq!(table[0].len(), 2);
        assert_eq!(table[0][0], "First Name");
        assert_eq!(table[0][1], "Last Name");
    }

    #[test]
    fn test_end_to_end_hocr_table_extraction() {
        let words = words_from_hocr(
            r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 140 70; x_wconf 95">Product</span>
                <span class="ocrx_word" title="bbox 200 50 240 70; x_wconf 95">Price</span>
                <span class="ocrx_word" title="bbox 100 100 140 120; x_wconf 95">Apple</span>
                <span class="ocrx_word" title="bbox 200 100 240 120; x_wconf 95">$1.50</span>
                <span class="ocrx_word" title="bbox 100 150 140 170; x_wconf 95">Orange</span>
                <span class="ocrx_word" title="bbox 200 150 240 170; x_wconf 95">$2.00</span>
            </div>
        "#,
            0.0,
        );

        let table = reconstruct_table(&words, 50, 0.5, false);
        let markdown = table_to_markdown(&table);

        assert_eq!(table.len(), 3);
        assert_eq!(table[0][0], "Product");
        assert_eq!(table[0][1], "Price");
        assert_eq!(table[1][0], "Apple");
        assert_eq!(table[1][1], "$1.50");
        assert_eq!(table[2][0], "Orange");
        assert_eq!(table[2][1], "$2.00");

        for expected_line in ["| Product | Price |", "| Apple | $1.50 |", "| Orange | $2.00 |"] {
            assert!(markdown.contains(expected_line), "missing line: {}", expected_line);
        }
    }
}