/// A single recognised word taken from an hOCR `ocrx_word` element.
///
/// The box is stored as its top-left corner plus a size, in the same
/// coordinate units as the hOCR `bbox` title attribute it was parsed from.
#[derive(Debug, Clone, PartialEq)]
pub struct HocrWord {
    /// Text content of the word.
    pub text: String,
    /// X coordinate of the left edge of the bounding box.
    pub left: u32,
    /// Y coordinate of the top edge of the bounding box.
    pub top: u32,
    /// Horizontal extent of the bounding box.
    pub width: u32,
    /// Vertical extent of the bounding box.
    pub height: u32,
    /// Recognition confidence as reported by the `x_wconf` title attribute.
    pub confidence: f64,
}

impl HocrWord {
    /// X coordinate of the right edge (`left + width`).
    ///
    /// The fields are public, so arbitrary values can be stored; the sum
    /// saturates at `u32::MAX` instead of panicking (debug) or wrapping (release).
    pub fn right(&self) -> u32 {
        self.left.saturating_add(self.width)
    }

    /// Y coordinate of the bottom edge (`top + height`), saturating at `u32::MAX`.
    pub fn bottom(&self) -> u32 {
        self.top.saturating_add(self.height)
    }

    /// Vertical midpoint of the bounding box.
    pub fn y_center(&self) -> f64 {
        f64::from(self.top) + f64::from(self.height) / 2.0
    }

    /// Horizontal midpoint of the bounding box.
    pub fn x_center(&self) -> f64 {
        f64::from(self.left) + f64::from(self.width) / 2.0
    }
}
38
/// Extracts the bounding box from an hOCR `title` attribute.
///
/// `title` is split on `;` into properties, each of the form `name value...`.
/// The first `bbox` property carrying exactly four unsigned integers
/// `x1 y1 x2 y2` wins and is returned as `(left, top, width, height)`, with
/// the width and height clamped to zero when `x2 < x1` or `y2 < y1`.
/// Returns `None` when no such property exists.
///
/// When `debug` is set, every property whose name is not in the known list is
/// reported on stderr. The name is compared exactly, so e.g. `bboxes` is
/// reported rather than being mistaken for `bbox`.
fn parse_bbox(title: &str, debug: bool) -> Option<(u32, u32, u32, u32)> {
    const KNOWN_ATTRIBUTES: [&str; 12] = [
        "bbox",
        "x_wconf",
        "baseline",
        "x_size",
        "x_descenders",
        "x_ascenders",
        "textangle",
        "poly",
        "order",
        "x_font",
        "x_fsize",
        "x_confs",
    ];

    for part in title.split(';') {
        // Tokenising on any whitespace tolerates tabs and repeated spaces
        // between the property name and its values.
        let mut tokens = part.split_whitespace();
        let Some(attr_name) = tokens.next() else {
            continue; // empty property, e.g. after a trailing ';'
        };

        if debug && !KNOWN_ATTRIBUTES.contains(&attr_name) {
            eprintln!("[hOCR] Info: Found unknown title attribute: '{}'", attr_name);
        }

        if attr_name != "bbox" {
            continue;
        }

        // A malformed bbox is skipped so a later, well-formed one can still match.
        let Ok(coords) = tokens.map(str::parse::<u32>).collect::<Result<Vec<u32>, _>>() else {
            continue;
        };
        if let &[x1, y1, x2, y2] = coords.as_slice() {
            return Some((x1, y1, x2.saturating_sub(x1), y2.saturating_sub(y1)));
        }
    }

    None
}
86
/// Extracts the word confidence from an hOCR `title` attribute.
///
/// `title` is split on `;`; the first `x_wconf <number>` property whose value
/// parses as a finite `f64` is returned. Falls back to `0.0` when there is no
/// usable `x_wconf` property.
///
/// Non-finite values are rejected on purpose: `"NaN"` and `"inf"` are valid
/// `f64` literals, but a NaN confidence compares false against every
/// threshold, so it is treated like a missing value instead.
fn parse_confidence(title: &str) -> f64 {
    title
        .split(';')
        .find_map(|part| {
            let mut tokens = part.split_whitespace();
            if tokens.next()? != "x_wconf" {
                return None;
            }
            let value = tokens.next()?;
            // Exactly one value is expected; anything else is malformed.
            if tokens.next().is_some() {
                return None;
            }
            value.parse::<f64>().ok().filter(|conf| conf.is_finite())
        })
        .unwrap_or(0.0)
}
101
102fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
104 let mut text = String::new();
105
106 if let Some(node) = node_handle.get(parser) {
107 match node {
108 tl::Node::Raw(bytes) => {
109 text.push_str(&bytes.as_utf8_str());
110 }
111 tl::Node::Tag(tag) => {
112 let children = tag.children();
113 for child_handle in children.top().iter() {
114 text.push_str(&get_text_content(child_handle, parser));
115 }
116 }
117 tl::Node::Comment(_) => {}
118 }
119 }
120
121 text
122}
123
124pub fn extract_hocr_words(
129 node_handle: &tl::NodeHandle,
130 parser: &tl::Parser,
131 min_confidence: f64,
132 debug: bool,
133) -> Vec<HocrWord> {
134 let mut words = Vec::new();
135
136 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
137 let tag_name = tag.name().as_utf8_str();
138 let attrs = tag.attributes();
139
140 let class_attr = attrs.get("class").flatten().map(|v| v.as_utf8_str().to_string());
141
142 if let Some(ref classes) = class_attr {
143 let known_classes = [
144 "ocr_page",
145 "ocr_carea",
146 "ocr_par",
147 "ocr_line",
148 "ocrx_word",
149 "ocr_header",
150 "ocr_footer",
151 "ocr_table",
152 "ocr_caption",
153 "ocr_textfloat",
154 "ocr_separator",
155 "ocr_noise",
156 ];
157
158 let class_list: Vec<&str> = classes.split_whitespace().collect();
159 let has_ocr_class = class_list.iter().any(|c| c.starts_with("ocr"));
160
161 if has_ocr_class && debug {
162 for class in &class_list {
163 if class.starts_with("ocr") && !known_classes.contains(class) {
164 eprintln!("[hOCR] Info: Found unhandled hOCR class '{}' on <{}>", class, tag_name);
165 }
166 }
167 }
168 }
169
170 if tag_name == "span" {
171 let is_word = class_attr.as_ref().is_some_and(|c| c.contains("ocrx_word"));
172 let title = attrs.get("title").flatten().map(|v| v.as_utf8_str());
173
174 if is_word {
175 let title_str = title.as_deref().unwrap_or("");
176 if let Some((left, top, width, height)) = parse_bbox(title_str, debug) {
177 let confidence = parse_confidence(title_str);
178
179 if confidence >= min_confidence {
180 let text = get_text_content(node_handle, parser).trim().to_string();
181
182 if !text.is_empty() {
183 words.push(HocrWord {
184 text,
185 left,
186 top,
187 width,
188 height,
189 confidence,
190 });
191 } else if debug {
192 eprintln!(
193 "[hOCR] Warning: ocrx_word element has no text content (bbox: {})",
194 title_str
195 );
196 }
197 } else if debug {
198 eprintln!(
199 "[hOCR] Warning: Word confidence ({:.1}) below threshold ({:.1}): {}",
200 confidence,
201 min_confidence,
202 get_text_content(node_handle, parser).trim()
203 );
204 }
205 } else if debug {
206 let text = get_text_content(node_handle, parser);
207 let trimmed = text.trim();
208 eprintln!(
209 "[hOCR] Warning: Failed to parse bbox for ocrx_word element: {} (title: {})",
210 if trimmed.is_empty() { "<empty>" } else { trimmed },
211 title_str
212 );
213 }
214 }
215 }
216
217 let children = tag.children();
218 for child_handle in children.top().iter() {
219 words.extend(extract_hocr_words(child_handle, parser, min_confidence, debug));
220 }
221 }
222
223 words
224}
225
226pub fn detect_columns(words: &[HocrWord], column_threshold: u32) -> Vec<u32> {
233 if words.is_empty() {
234 return Vec::new();
235 }
236
237 let mut x_positions: Vec<u32> = words.iter().map(|w| w.left).collect();
238 x_positions.sort_unstable();
239
240 let mut position_groups: Vec<Vec<u32>> = Vec::new();
241 let mut current_group = vec![x_positions[0]];
242
243 for &x_pos in &x_positions[1..] {
244 let matches_group = current_group.iter().any(|&pos| x_pos.abs_diff(pos) <= column_threshold);
245
246 if matches_group {
247 current_group.push(x_pos);
248 } else {
249 position_groups.push(std::mem::replace(&mut current_group, vec![x_pos]));
250 }
251 }
252
253 if !current_group.is_empty() {
254 position_groups.push(current_group);
255 }
256
257 let mut columns: Vec<u32> = position_groups
258 .iter()
259 .map(|group| {
260 let mid = group.len() / 2;
261 group[mid]
262 })
263 .collect();
264
265 columns.sort_unstable();
266 columns
267}
268
269pub fn detect_rows(words: &[HocrWord], row_threshold_ratio: f64) -> Vec<u32> {
276 if words.is_empty() {
277 return Vec::new();
278 }
279
280 let mut heights: Vec<u32> = words.iter().map(|w| w.height).collect();
281 heights.sort_unstable();
282 let median_height = heights[heights.len() / 2];
283 let row_threshold = median_height as f64 * row_threshold_ratio;
284
285 let mut y_centers: Vec<f64> = words.iter().map(|w| w.y_center()).collect();
286 y_centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
287
288 let mut position_groups: Vec<Vec<f64>> = Vec::new();
289 let mut current_group = vec![y_centers[0]];
290
291 for &y_center in &y_centers[1..] {
292 let matches_group = current_group.iter().any(|&pos| (y_center - pos).abs() <= row_threshold);
293
294 if matches_group {
295 current_group.push(y_center);
296 } else {
297 position_groups.push(std::mem::replace(&mut current_group, vec![y_center]));
298 }
299 }
300
301 if !current_group.is_empty() {
302 position_groups.push(current_group);
303 }
304
305 let mut rows: Vec<u32> = position_groups
306 .iter()
307 .map(|group| {
308 let mid = group.len() / 2;
309 group[mid] as u32
310 })
311 .collect();
312
313 rows.sort_unstable();
314 rows
315}
316
317pub fn reconstruct_table(
324 words: &[HocrWord],
325 column_threshold: u32,
326 row_threshold_ratio: f64,
327 debug: bool,
328) -> Vec<Vec<String>> {
329 if words.is_empty() {
330 if debug {
331 eprintln!("[hOCR] Warning: No words to reconstruct table from");
332 }
333 return Vec::new();
334 }
335
336 let col_positions = detect_columns(words, column_threshold);
337 let row_positions = detect_rows(words, row_threshold_ratio);
338
339 if col_positions.is_empty() || row_positions.is_empty() {
340 if debug {
341 eprintln!(
342 "[hOCR] Warning: Could not detect table structure (columns: {}, rows: {})",
343 col_positions.len(),
344 row_positions.len()
345 );
346 }
347 return Vec::new();
348 }
349
350 if debug {
351 eprintln!(
352 "[hOCR] Detected table structure: {} rows × {} columns",
353 row_positions.len(),
354 col_positions.len()
355 );
356 }
357
358 let num_rows = row_positions.len();
359 let num_cols = col_positions.len();
360 let mut table: Vec<Vec<Vec<String>>> = vec![vec![vec![]; num_cols]; num_rows];
361 let mut unassigned_words = 0;
362
363 for word in words {
364 if let (Some(r), Some(c)) = (
365 find_row_index(&row_positions, word),
366 find_column_index(&col_positions, word),
367 ) {
368 if r < num_rows && c < num_cols {
369 table[r][c].push(word.text.clone());
370 } else {
371 unassigned_words += 1;
372 if debug {
373 eprintln!(
374 "[hOCR] Warning: Word '{}' assigned to out-of-bounds cell ({}, {})",
375 word.text, r, c
376 );
377 }
378 }
379 } else {
380 unassigned_words += 1;
381 if debug {
382 eprintln!(
383 "[hOCR] Warning: Could not assign word '{}' to any cell (position: {}, {})",
384 word.text, word.left, word.top
385 );
386 }
387 }
388 }
389
390 if debug && unassigned_words > 0 {
391 eprintln!(
392 "[hOCR] Warning: {} out of {} words could not be assigned to table cells",
393 unassigned_words,
394 words.len()
395 );
396 }
397
398 let result: Vec<Vec<String>> = table
399 .into_iter()
400 .map(|row| {
401 row.into_iter()
402 .map(|cell_words| {
403 if cell_words.is_empty() {
404 String::new()
405 } else {
406 cell_words.join(" ")
407 }
408 })
409 .collect()
410 })
411 .collect();
412
413 remove_empty_rows_and_columns(result)
414}
415
416fn find_row_index(row_positions: &[u32], word: &HocrWord) -> Option<usize> {
418 let y_center = word.y_center() as u32;
419
420 row_positions
421 .iter()
422 .enumerate()
423 .min_by_key(|&(_, row_y)| row_y.abs_diff(y_center))
424 .map(|(idx, _)| idx)
425}
426
427fn find_column_index(col_positions: &[u32], word: &HocrWord) -> Option<usize> {
429 let x_pos = word.left;
430
431 col_positions
432 .iter()
433 .enumerate()
434 .min_by_key(|&(_, col_x)| col_x.abs_diff(x_pos))
435 .map(|(idx, _)| idx)
436}
437
/// Drops every row and every column that contains only blank cells.
///
/// A cell is blank when it is empty after trimming whitespace. The column
/// mask is sized from the widest row, so rows of differing lengths are
/// handled without indexing out of bounds.
fn remove_empty_rows_and_columns(table: Vec<Vec<String>>) -> Vec<Vec<String>> {
    let num_cols = table.iter().map(Vec::len).max().unwrap_or(0);

    // keep_col[i] becomes true once any row has a non-blank cell in column i.
    let mut keep_col = vec![false; num_cols];
    for row in &table {
        for (keep, cell) in keep_col.iter_mut().zip(row) {
            if !cell.trim().is_empty() {
                *keep = true;
            }
        }
    }

    table
        .into_iter()
        .filter(|row| row.iter().any(|cell| !cell.trim().is_empty()))
        .map(|row| {
            row.into_iter()
                .zip(&keep_col)
                .filter(|(_, keep)| **keep)
                .map(|(cell, _)| cell)
                .collect()
        })
        .collect()
}
467
/// Renders `table` as a Markdown pipe table.
///
/// The first row is used as the header and is followed by a `---` separator
/// with one entry per header cell. Inside cells, `|` is escaped as `\|` and
/// line breaks are replaced by a single space, because a table row has to
/// stay on one line. Returns an empty string when the table has no rows or
/// the first row has no cells.
pub fn table_to_markdown(table: &[Vec<String>]) -> String {
    let Some(header) = table.first() else {
        return String::new();
    };
    let num_cols = header.len();
    if num_cols == 0 {
        return String::new();
    }

    let mut markdown = String::new();

    for (row_idx, row) in table.iter().enumerate() {
        markdown.push('|');
        for cell in row {
            markdown.push(' ');
            push_markdown_cell(&mut markdown, cell);
            markdown.push_str(" |");
        }
        markdown.push('\n');

        if row_idx == 0 {
            markdown.push('|');
            markdown.push_str(&" --- |".repeat(num_cols));
            markdown.push('\n');
        }
    }

    markdown
}

/// Appends `cell` to `out`, escaping `|` and turning each line break
/// (`\n`, `\r`, or `\r\n`) into one space.
fn push_markdown_cell(out: &mut String, cell: &str) {
    let mut chars = cell.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            '|' => out.push_str("\\|"),
            '\r' => {
                // Treat "\r\n" as a single line break.
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
                out.push(' ');
            }
            '\n' => out.push(' '),
            other => out.push(other),
        }
    }
}
501
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a word with the given text and box and a confidence of 95.0.
    fn word(text: &str, left: u32, top: u32, width: u32, height: u32) -> HocrWord {
        HocrWord {
            text: text.to_string(),
            left,
            top,
            width,
            height,
            confidence: 95.0,
        }
    }

    /// Parses `hocr` and extracts the words of every top-level node, with
    /// debug output disabled.
    fn words_from_hocr(hocr: &str, min_confidence: f64) -> Vec<HocrWord> {
        let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
        let parser = dom.parser();

        let mut words = Vec::new();
        for child_handle in dom.children().iter() {
            words.extend(extract_hocr_words(child_handle, parser, min_confidence, false));
        }
        words
    }

    #[test]
    fn test_parse_bbox() {
        let cases = [
            ("bbox 100 50 180 80", Some((100, 50, 80, 30))),
            ("bbox 0 0 100 200", Some((0, 0, 100, 200))),
            ("bbox 100 50 180 80; x_wconf 95", Some((100, 50, 80, 30))),
            ("invalid", None),
            ("bbox 100 50", None),
        ];

        for (title, expected) in cases {
            assert_eq!(parse_bbox(title, false), expected);
        }
    }

    #[test]
    fn test_parse_confidence() {
        let cases = [
            ("x_wconf 95.5", 95.5),
            ("bbox 100 50 180 80; x_wconf 92", 92.0),
            ("invalid", 0.0),
        ];

        for (title, expected) in cases {
            assert_eq!(parse_confidence(title), expected);
        }
    }

    #[test]
    fn test_hocr_word_methods() {
        let hello = HocrWord {
            confidence: 95.5,
            ..word("Hello", 100, 50, 80, 30)
        };

        assert_eq!(hello.right(), 180);
        assert_eq!(hello.bottom(), 80);
        assert_eq!(hello.y_center(), 65.0);
        assert_eq!(hello.x_center(), 140.0);
    }

    #[test]
    fn test_detect_columns() {
        let words = vec![
            word("A", 100, 50, 20, 30),
            word("B", 200, 50, 20, 30),
            word("C", 105, 100, 20, 30),
        ];

        let columns = detect_columns(&words, 50);

        assert_eq!(columns.len(), 2);
        assert!(columns.contains(&100) || columns.contains(&105));
        assert!(columns.contains(&200));
    }

    #[test]
    fn test_table_to_markdown() {
        let table = vec![
            vec!["Header1".to_string(), "Header2".to_string()],
            vec!["Cell1".to_string(), "Cell2".to_string()],
        ];

        let markdown = table_to_markdown(&table);

        for expected_line in [
            "| Header1 | Header2 |",
            "| --- | --- |",
            "| Cell1 | Cell2 |",
        ] {
            assert!(markdown.contains(expected_line));
        }
    }

    #[test]
    fn test_table_to_markdown_escape_pipes() {
        let table = vec![vec!["A|B".to_string(), "C".to_string()]];

        let markdown = table_to_markdown(&table);

        assert!(markdown.contains("A\\|B"));
    }

    #[test]
    fn test_extract_hocr_words() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">Hello</span>
                <span class="ocrx_word" title="bbox 160 50 210 80; x_wconf 92">World</span>
            </div>
        "#;

        let words = words_from_hocr(hocr, 0.0);

        assert_eq!(words.len(), 2);

        let (first, second) = (&words[0], &words[1]);
        assert_eq!(first.text, "Hello");
        assert_eq!(first.left, 100);
        assert_eq!(first.confidence, 95.0);

        assert_eq!(second.text, "World");
        assert_eq!(second.left, 160);
        assert_eq!(second.confidence, 92.0);
    }

    #[test]
    fn test_extract_hocr_words_confidence_filter() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">HighConf</span>
                <span class="ocrx_word" title="bbox 160 50 210 80; x_wconf 50">LowConf</span>
                <span class="ocrx_word" title="bbox 220 50 270 80; x_wconf 98">VeryHigh</span>
            </div>
        "#;

        let words = words_from_hocr(hocr, 90.0);

        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "HighConf");
        assert_eq!(words[1].text, "VeryHigh");
    }

    #[test]
    fn test_reconstruct_simple_table() {
        let words = vec![
            word("Name", 100, 50, 50, 20),
            word("Age", 200, 50, 50, 20),
            word("Alice", 100, 100, 50, 20),
            word("30", 200, 100, 50, 20),
        ];

        let table = reconstruct_table(&words, 50, 0.5, false);

        assert_eq!(table.len(), 2);
        assert_eq!(table[0].len(), 2);
        assert_eq!(table[0][0], "Name");
        assert_eq!(table[0][1], "Age");
        assert_eq!(table[1][0], "Alice");
        assert_eq!(table[1][1], "30");
    }

    #[test]
    fn test_reconstruct_table_with_multi_word_cells() {
        let words = vec![
            word("First", 100, 50, 30, 20),
            word("Name", 135, 50, 30, 20),
            word("Last", 200, 50, 30, 20),
            word("Name", 235, 50, 30, 20),
        ];

        let table = reconstruct_table(&words, 50, 0.5, false);

        assert_eq!(table.len(), 1);
        assert_eq!(table[0].len(), 2);
        assert_eq!(table[0][0], "First Name");
        assert_eq!(table[0][1], "Last Name");
    }

    #[test]
    fn test_end_to_end_hocr_table_extraction() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 140 70; x_wconf 95">Product</span>
                <span class="ocrx_word" title="bbox 200 50 240 70; x_wconf 95">Price</span>
                <span class="ocrx_word" title="bbox 100 100 140 120; x_wconf 95">Apple</span>
                <span class="ocrx_word" title="bbox 200 100 240 120; x_wconf 95">$1.50</span>
                <span class="ocrx_word" title="bbox 100 150 140 170; x_wconf 95">Orange</span>
                <span class="ocrx_word" title="bbox 200 150 240 170; x_wconf 95">$2.00</span>
            </div>
        "#;

        let words = words_from_hocr(hocr, 0.0);
        let table = reconstruct_table(&words, 50, 0.5, false);
        let markdown = table_to_markdown(&table);

        assert_eq!(table.len(), 3);

        let expected_rows = [("Product", "Price"), ("Apple", "$1.50"), ("Orange", "$2.00")];
        for (row_idx, (first_cell, second_cell)) in expected_rows.into_iter().enumerate() {
            assert_eq!(table[row_idx][0], first_cell);
            assert_eq!(table[row_idx][1], second_cell);
        }

        assert!(markdown.contains("| Product | Price |"));
        assert!(markdown.contains("| Apple | $1.50 |"));
        assert!(markdown.contains("| Orange | $2.00 |"));
    }
}