1pub mod converter;
25pub mod extractor;
26pub mod parser;
27pub mod types;
28
29pub use converter::convert_to_markdown;
31pub use extractor::extract_hocr_document;
32pub use types::{BBox, Baseline, HocrElement, HocrElementType, HocrMetadata, HocrProperties};
33
/// A single recognized word together with its bounding box and confidence.
///
/// Inside this module, values are produced by [`extract_hocr_words`] from
/// `ocrx_word` elements. The geometry is taken from the element's `bbox`
/// title property and uses whatever coordinate units that property carries.
#[derive(Debug, Clone)]
pub struct HocrWord {
    /// Text content of the word.
    pub text: String,
    /// X coordinate of the left edge.
    pub left: u32,
    /// Y coordinate of the top edge.
    pub top: u32,
    /// Horizontal extent of the box.
    pub width: u32,
    /// Vertical extent of the box.
    pub height: u32,
    /// Recognition confidence associated with the word.
    pub confidence: f64,
}

impl HocrWord {
    /// Returns the X coordinate of the right edge (`left + width`).
    ///
    /// The sum saturates at `u32::MAX` instead of overflowing, so a word
    /// built by hand with extreme values cannot panic (debug builds) or wrap
    /// around (release builds).
    pub fn right(&self) -> u32 {
        self.left.saturating_add(self.width)
    }

    /// Returns the Y coordinate of the bottom edge (`top + height`).
    ///
    /// Saturates at `u32::MAX` for the same reason as [`HocrWord::right`].
    pub fn bottom(&self) -> u32 {
        self.top.saturating_add(self.height)
    }

    /// Returns the vertical midpoint of the box as a float.
    pub fn y_center(&self) -> f64 {
        // Computed in f64 so that odd heights keep their half unit and the
        // addition cannot overflow.
        f64::from(self.top) + f64::from(self.height) / 2.0
    }

    /// Returns the horizontal midpoint of the box as a float.
    pub fn x_center(&self) -> f64 {
        f64::from(self.left) + f64::from(self.width) / 2.0
    }
}
66
/// Extracts the bounding box from an hOCR `title` attribute value.
///
/// `title` is treated as a `;`-separated list of properties, each consisting
/// of a name followed by whitespace-separated values. The first `bbox`
/// property that carries exactly four values parseable as `u32`
/// (`x1 y1 x2 y2`) wins.
///
/// Returns `Some((left, top, width, height))`, where `width = x2 - x1` and
/// `height = y2 - y1`, both clamped to zero when the box is inverted.
/// Returns `None` when no well-formed `bbox` property is present.
///
/// When `debug` is true, every property whose name is not in the list of
/// known hOCR properties is reported on stderr.
fn parse_bbox(title: &str, debug: bool) -> Option<(u32, u32, u32, u32)> {
    const KNOWN_ATTRIBUTES: [&str; 12] = [
        "bbox",
        "x_wconf",
        "baseline",
        "x_size",
        "x_descenders",
        "x_ascenders",
        "textangle",
        "poly",
        "order",
        "x_font",
        "x_fsize",
        "x_confs",
    ];

    for part in title.split(';') {
        // Tokenizing on any whitespace means the property name is matched
        // exactly, and "bbox\t1 2 3 4" is accepted just like "bbox 1 2 3 4".
        let mut tokens = part.split_whitespace();
        let Some(attr_name) = tokens.next() else {
            continue; // empty segment, e.g. after a trailing ';'
        };

        // Exact comparison: a name such as "bboxes" or "polygon" must not be
        // mistaken for a known property just because it shares a prefix.
        if debug && !KNOWN_ATTRIBUTES.contains(&attr_name) {
            eprintln!("[hOCR] Info: Found unknown title attribute: '{}'", attr_name);
        }

        if attr_name != "bbox" {
            continue;
        }

        // A malformed bbox is skipped rather than aborting the search, so a
        // later well-formed one can still be picked up.
        let Ok(coords) = tokens.map(str::parse::<u32>).collect::<Result<Vec<_>, _>>() else {
            continue;
        };

        if let &[x1, y1, x2, y2] = coords.as_slice() {
            return Some((x1, y1, x2.saturating_sub(x1), y2.saturating_sub(y1)));
        }
    }

    None
}
114
/// Extracts the word confidence (`x_wconf`) from an hOCR `title` attribute.
///
/// `title` is treated as a `;`-separated list of properties. The first
/// `x_wconf` property that carries exactly one value parseable as `f64` is
/// returned. When no such property exists the function returns `0.0`.
fn parse_confidence(title: &str) -> f64 {
    title
        .split(';')
        .find_map(|part| {
            // Tokenize on any whitespace so the property name is matched
            // exactly and "x_wconf\t95" works like "x_wconf 95".
            let mut tokens = part.split_whitespace();
            if tokens.next()? != "x_wconf" {
                return None;
            }
            let value = tokens.next()?;
            // Trailing garbage ("x_wconf 9 5") makes the property malformed.
            if tokens.next().is_some() {
                return None;
            }
            value.parse::<f64>().ok()
        })
        .unwrap_or(0.0)
}
129
130fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
132 let mut text = String::new();
133
134 if let Some(node) = node_handle.get(parser) {
135 match node {
136 tl::Node::Raw(bytes) => {
137 text.push_str(&bytes.as_utf8_str());
138 }
139 tl::Node::Tag(tag) => {
140 let children = tag.children();
141 for child_handle in children.top().iter() {
142 text.push_str(&get_text_content(child_handle, parser));
143 }
144 }
145 tl::Node::Comment(_) => {}
146 }
147 }
148
149 text
150}
151
152pub fn extract_hocr_words(
157 node_handle: &tl::NodeHandle,
158 parser: &tl::Parser,
159 min_confidence: f64,
160 debug: bool,
161) -> Vec<HocrWord> {
162 let mut words = Vec::new();
163
164 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
165 let tag_name = tag.name().as_utf8_str();
166 let attrs = tag.attributes();
167
168 let class_attr = attrs.get("class").flatten().map(|v| v.as_utf8_str().to_string());
169
170 if let Some(ref classes) = class_attr {
171 let known_classes = [
172 "ocr_page",
173 "ocr_carea",
174 "ocr_par",
175 "ocr_line",
176 "ocrx_word",
177 "ocr_header",
178 "ocr_footer",
179 "ocr_table",
180 "ocr_caption",
181 "ocr_textfloat",
182 "ocr_separator",
183 "ocr_noise",
184 ];
185
186 let class_list: Vec<&str> = classes.split_whitespace().collect();
187 let has_ocr_class = class_list.iter().any(|c| c.starts_with("ocr"));
188
189 if has_ocr_class && debug {
190 for class in &class_list {
191 if class.starts_with("ocr") && !known_classes.contains(class) {
192 eprintln!("[hOCR] Info: Found unhandled hOCR class '{}' on <{}>", class, tag_name);
193 }
194 }
195 }
196 }
197
198 if tag_name == "span" {
199 let is_word = class_attr.as_ref().is_some_and(|c| c.contains("ocrx_word"));
200 let title = attrs.get("title").flatten().map(|v| v.as_utf8_str());
201
202 if is_word {
203 let title_str = title.as_deref().unwrap_or("");
204 if let Some((left, top, width, height)) = parse_bbox(title_str, debug) {
205 let confidence = parse_confidence(title_str);
206
207 if confidence >= min_confidence {
208 let text = get_text_content(node_handle, parser).trim().to_string();
209
210 if !text.is_empty() {
211 words.push(HocrWord {
212 text,
213 left,
214 top,
215 width,
216 height,
217 confidence,
218 });
219 } else if debug {
220 eprintln!(
221 "[hOCR] Warning: ocrx_word element has no text content (bbox: {})",
222 title_str
223 );
224 }
225 } else if debug {
226 eprintln!(
227 "[hOCR] Warning: Word confidence ({:.1}) below threshold ({:.1}): {}",
228 confidence,
229 min_confidence,
230 get_text_content(node_handle, parser).trim()
231 );
232 }
233 } else if debug {
234 let text = get_text_content(node_handle, parser);
235 let trimmed = text.trim();
236 eprintln!(
237 "[hOCR] Warning: Failed to parse bbox for ocrx_word element: {} (title: {})",
238 if trimmed.is_empty() { "<empty>" } else { trimmed },
239 title_str
240 );
241 }
242 }
243 }
244
245 let children = tag.children();
246 for child_handle in children.top().iter() {
247 words.extend(extract_hocr_words(child_handle, parser, min_confidence, debug));
248 }
249 }
250
251 words
252}
253
254pub fn detect_columns(words: &[HocrWord], column_threshold: u32) -> Vec<u32> {
261 if words.is_empty() {
262 return Vec::new();
263 }
264
265 let mut x_positions: Vec<u32> = words.iter().map(|w| w.left).collect();
266 x_positions.sort_unstable();
267
268 let mut position_groups: Vec<Vec<u32>> = Vec::new();
269 let mut current_group = vec![x_positions[0]];
270
271 for &x_pos in &x_positions[1..] {
272 let matches_group = current_group.iter().any(|&pos| x_pos.abs_diff(pos) <= column_threshold);
273
274 if matches_group {
275 current_group.push(x_pos);
276 } else {
277 position_groups.push(std::mem::replace(&mut current_group, vec![x_pos]));
278 }
279 }
280
281 if !current_group.is_empty() {
282 position_groups.push(current_group);
283 }
284
285 let mut columns: Vec<u32> = position_groups
286 .iter()
287 .map(|group| {
288 let mid = group.len() / 2;
289 group[mid]
290 })
291 .collect();
292
293 columns.sort_unstable();
294 columns
295}
296
297pub fn detect_rows(words: &[HocrWord], row_threshold_ratio: f64) -> Vec<u32> {
304 if words.is_empty() {
305 return Vec::new();
306 }
307
308 let mut heights: Vec<u32> = words.iter().map(|w| w.height).collect();
309 heights.sort_unstable();
310 let median_height = heights[heights.len() / 2];
311 let row_threshold = median_height as f64 * row_threshold_ratio;
312
313 let mut y_centers: Vec<f64> = words.iter().map(|w| w.y_center()).collect();
314 y_centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
315
316 let mut position_groups: Vec<Vec<f64>> = Vec::new();
317 let mut current_group = vec![y_centers[0]];
318
319 for &y_center in &y_centers[1..] {
320 let matches_group = current_group.iter().any(|&pos| (y_center - pos).abs() <= row_threshold);
321
322 if matches_group {
323 current_group.push(y_center);
324 } else {
325 position_groups.push(std::mem::replace(&mut current_group, vec![y_center]));
326 }
327 }
328
329 if !current_group.is_empty() {
330 position_groups.push(current_group);
331 }
332
333 let mut rows: Vec<u32> = position_groups
334 .iter()
335 .map(|group| {
336 let mid = group.len() / 2;
337 group[mid] as u32
338 })
339 .collect();
340
341 rows.sort_unstable();
342 rows
343}
344
345pub fn reconstruct_table(
352 words: &[HocrWord],
353 column_threshold: u32,
354 row_threshold_ratio: f64,
355 debug: bool,
356) -> Vec<Vec<String>> {
357 if words.is_empty() {
358 if debug {
359 eprintln!("[hOCR] Warning: No words to reconstruct table from");
360 }
361 return Vec::new();
362 }
363
364 let col_positions = detect_columns(words, column_threshold);
365 let row_positions = detect_rows(words, row_threshold_ratio);
366
367 if col_positions.is_empty() || row_positions.is_empty() {
368 if debug {
369 eprintln!(
370 "[hOCR] Warning: Could not detect table structure (columns: {}, rows: {})",
371 col_positions.len(),
372 row_positions.len()
373 );
374 }
375 return Vec::new();
376 }
377
378 if debug {
379 eprintln!(
380 "[hOCR] Detected table structure: {} rows × {} columns",
381 row_positions.len(),
382 col_positions.len()
383 );
384 }
385
386 let num_rows = row_positions.len();
387 let num_cols = col_positions.len();
388 let mut table: Vec<Vec<Vec<String>>> = vec![vec![vec![]; num_cols]; num_rows];
389 let mut unassigned_words = 0;
390
391 for word in words {
392 if let (Some(r), Some(c)) = (
393 find_row_index(&row_positions, word),
394 find_column_index(&col_positions, word),
395 ) {
396 if r < num_rows && c < num_cols {
397 table[r][c].push(word.text.clone());
398 } else {
399 unassigned_words += 1;
400 if debug {
401 eprintln!(
402 "[hOCR] Warning: Word '{}' assigned to out-of-bounds cell ({}, {})",
403 word.text, r, c
404 );
405 }
406 }
407 } else {
408 unassigned_words += 1;
409 if debug {
410 eprintln!(
411 "[hOCR] Warning: Could not assign word '{}' to any cell (position: {}, {})",
412 word.text, word.left, word.top
413 );
414 }
415 }
416 }
417
418 if debug && unassigned_words > 0 {
419 eprintln!(
420 "[hOCR] Warning: {} out of {} words could not be assigned to table cells",
421 unassigned_words,
422 words.len()
423 );
424 }
425
426 let result: Vec<Vec<String>> = table
427 .into_iter()
428 .map(|row| {
429 row.into_iter()
430 .map(|cell_words| {
431 if cell_words.is_empty() {
432 String::new()
433 } else {
434 cell_words.join(" ")
435 }
436 })
437 .collect()
438 })
439 .collect();
440
441 remove_empty_rows_and_columns(result)
442}
443
444fn find_row_index(row_positions: &[u32], word: &HocrWord) -> Option<usize> {
446 let y_center = word.y_center() as u32;
447
448 row_positions
449 .iter()
450 .enumerate()
451 .min_by_key(|&(_, row_y)| row_y.abs_diff(y_center))
452 .map(|(idx, _)| idx)
453}
454
455fn find_column_index(col_positions: &[u32], word: &HocrWord) -> Option<usize> {
457 let x_pos = word.left;
458
459 col_positions
460 .iter()
461 .enumerate()
462 .min_by_key(|&(_, col_x)| col_x.abs_diff(x_pos))
463 .map(|(idx, _)| idx)
464}
465
/// Drops every row and every column of `table` that contains only blank
/// cells (empty or whitespace-only strings).
///
/// Column emptiness is judged across all rows, including rows that are
/// themselves removed for being blank. Rows may differ in length: the column
/// bookkeeping is sized from the widest row, so a row longer than the first
/// one is handled instead of causing an out-of-bounds panic.
fn remove_empty_rows_and_columns(table: Vec<Vec<String>>) -> Vec<Vec<String>> {
    fn is_blank(cell: &str) -> bool {
        cell.trim().is_empty()
    }

    let widest = table.iter().map(Vec::len).max().unwrap_or(0);

    // keep_col[i] becomes true as soon as any row has content in column i.
    let mut keep_col = vec![false; widest];
    for row in &table {
        for (keep, cell) in keep_col.iter_mut().zip(row) {
            *keep |= !is_blank(cell);
        }
    }

    table
        .into_iter()
        .filter(|row| row.iter().any(|cell| !is_blank(cell)))
        .map(|row| {
            // Every row is at most `widest` long, so zipping visits all cells.
            row.into_iter()
                .zip(&keep_col)
                .filter_map(|(cell, &keep)| keep.then_some(cell))
                .collect()
        })
        .collect()
}
495
/// Renders `table` as a Markdown pipe table.
///
/// The first row is used as the header and is followed by a `---` separator
/// line. Inside cells, `|` is escaped as `\|` and line breaks (`\n`, `\r`,
/// `\r\n`) are replaced by a single space, because a literal line break would
/// end the Markdown row. Rows shorter than the widest row are padded with
/// empty cells so that every line has the same number of columns.
///
/// Returns an empty string when the table has no rows or no cells at all.
pub fn table_to_markdown(table: &[Vec<String>]) -> String {
    let num_cols = table.iter().map(Vec::len).max().unwrap_or(0);
    if num_cols == 0 {
        return String::new();
    }

    let mut markdown = String::new();

    for (row_idx, row) in table.iter().enumerate() {
        markdown.push('|');
        for col_idx in 0..num_cols {
            markdown.push(' ');
            // A missing cell (short row) is rendered as an empty cell.
            if let Some(cell) = row.get(col_idx) {
                let mut chars = cell.chars().peekable();
                while let Some(ch) = chars.next() {
                    match ch {
                        '|' => markdown.push_str("\\|"),
                        // For "\r\n" the following '\n' emits the one space.
                        '\r' if chars.peek() == Some(&'\n') => {}
                        '\n' | '\r' => markdown.push(' '),
                        other => markdown.push(other),
                    }
                }
            }
            markdown.push_str(" |");
        }
        markdown.push('\n');

        // The separator line goes directly below the header row.
        if row_idx == 0 {
            markdown.push('|');
            markdown.push_str(&" --- |".repeat(num_cols));
            markdown.push('\n');
        }
    }

    markdown
}
529
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a word with the given text and geometry and a confidence of 95.0.
    fn word(text: &str, left: u32, top: u32, width: u32, height: u32) -> HocrWord {
        HocrWord {
            text: text.to_string(),
            left,
            top,
            width,
            height,
            confidence: 95.0,
        }
    }

    /// Builds a table row from string slices.
    fn row(cells: &[&str]) -> Vec<String> {
        cells.iter().map(|cell| cell.to_string()).collect()
    }

    /// Parses `hocr` and extracts the words of every top-level node, with
    /// debug output disabled.
    fn words_from_hocr(hocr: &str, min_confidence: f64) -> Vec<HocrWord> {
        let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
        let parser = dom.parser();

        let mut words = Vec::new();
        for child_handle in dom.children().iter() {
            words.extend(extract_hocr_words(child_handle, parser, min_confidence, false));
        }
        words
    }

    #[test]
    fn test_parse_bbox() {
        let cases: [(&str, Option<(u32, u32, u32, u32)>); 5] = [
            ("bbox 100 50 180 80", Some((100, 50, 80, 30))),
            ("bbox 0 0 100 200", Some((0, 0, 100, 200))),
            ("bbox 100 50 180 80; x_wconf 95", Some((100, 50, 80, 30))),
            ("invalid", None),
            ("bbox 100 50", None),
        ];

        for (title, expected) in cases {
            assert_eq!(parse_bbox(title, false), expected);
        }
    }

    #[test]
    fn test_parse_confidence() {
        assert_eq!(parse_confidence("x_wconf 95.5"), 95.5);
        assert_eq!(parse_confidence("bbox 100 50 180 80; x_wconf 92"), 92.0);
        assert_eq!(parse_confidence("invalid"), 0.0);
    }

    #[test]
    fn test_hocr_word_methods() {
        let hello = HocrWord {
            confidence: 95.5,
            ..word("Hello", 100, 50, 80, 30)
        };

        assert_eq!(hello.right(), 180);
        assert_eq!(hello.bottom(), 80);
        assert_eq!(hello.y_center(), 65.0);
        assert_eq!(hello.x_center(), 140.0);
    }

    #[test]
    fn test_detect_columns() {
        let words = vec![
            word("A", 100, 50, 20, 30),
            word("B", 200, 50, 20, 30),
            word("C", 105, 100, 20, 30),
        ];

        let columns = detect_columns(&words, 50);
        assert_eq!(columns.len(), 2);
        assert!(columns.contains(&100) || columns.contains(&105));
        assert!(columns.contains(&200));
    }

    #[test]
    fn test_table_to_markdown() {
        let table = vec![row(&["Header1", "Header2"]), row(&["Cell1", "Cell2"])];

        let markdown = table_to_markdown(&table);
        assert!(markdown.contains("| Header1 | Header2 |"));
        assert!(markdown.contains("| --- | --- |"));
        assert!(markdown.contains("| Cell1 | Cell2 |"));
    }

    #[test]
    fn test_table_to_markdown_escape_pipes() {
        let table = vec![row(&["A|B", "C"])];

        let markdown = table_to_markdown(&table);
        assert!(markdown.contains("A\\|B"));
    }

    #[test]
    fn test_extract_hocr_words() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">Hello</span>
                <span class="ocrx_word" title="bbox 160 50 210 80; x_wconf 92">World</span>
            </div>
        "#;

        let words = words_from_hocr(hocr, 0.0);

        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "Hello");
        assert_eq!(words[0].left, 100);
        assert_eq!(words[0].confidence, 95.0);

        assert_eq!(words[1].text, "World");
        assert_eq!(words[1].left, 160);
        assert_eq!(words[1].confidence, 92.0);
    }

    #[test]
    fn test_extract_hocr_words_confidence_filter() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">HighConf</span>
                <span class="ocrx_word" title="bbox 160 50 210 80; x_wconf 50">LowConf</span>
                <span class="ocrx_word" title="bbox 220 50 270 80; x_wconf 98">VeryHigh</span>
            </div>
        "#;

        let words = words_from_hocr(hocr, 90.0);

        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "HighConf");
        assert_eq!(words[1].text, "VeryHigh");
    }

    #[test]
    fn test_reconstruct_simple_table() {
        let words = vec![
            word("Name", 100, 50, 50, 20),
            word("Age", 200, 50, 50, 20),
            word("Alice", 100, 100, 50, 20),
            word("30", 200, 100, 50, 20),
        ];

        let table = reconstruct_table(&words, 50, 0.5, false);

        assert_eq!(table.len(), 2);
        assert_eq!(table[0].len(), 2);
        assert_eq!(table[0][0], "Name");
        assert_eq!(table[0][1], "Age");
        assert_eq!(table[1][0], "Alice");
        assert_eq!(table[1][1], "30");
    }

    #[test]
    fn test_reconstruct_table_with_multi_word_cells() {
        let words = vec![
            word("First", 100, 50, 30, 20),
            word("Name", 135, 50, 30, 20),
            word("Last", 200, 50, 30, 20),
            word("Name", 235, 50, 30, 20),
        ];

        let table = reconstruct_table(&words, 50, 0.5, false);

        assert_eq!(table.len(), 1);
        assert_eq!(table[0].len(), 2);
        assert_eq!(table[0][0], "First Name");
        assert_eq!(table[0][1], "Last Name");
    }

    #[test]
    fn test_end_to_end_hocr_table_extraction() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 140 70; x_wconf 95">Product</span>
                <span class="ocrx_word" title="bbox 200 50 240 70; x_wconf 95">Price</span>
                <span class="ocrx_word" title="bbox 100 100 140 120; x_wconf 95">Apple</span>
                <span class="ocrx_word" title="bbox 200 100 240 120; x_wconf 95">$1.50</span>
                <span class="ocrx_word" title="bbox 100 150 140 170; x_wconf 95">Orange</span>
                <span class="ocrx_word" title="bbox 200 150 240 170; x_wconf 95">$2.00</span>
            </div>
        "#;

        let words = words_from_hocr(hocr, 0.0);
        let table = reconstruct_table(&words, 50, 0.5, false);
        let markdown = table_to_markdown(&table);

        assert_eq!(table.len(), 3);
        assert_eq!(table[0][0], "Product");
        assert_eq!(table[0][1], "Price");
        assert_eq!(table[1][0], "Apple");
        assert_eq!(table[1][1], "$1.50");
        assert_eq!(table[2][0], "Orange");
        assert_eq!(table[2][1], "$2.00");

        assert!(markdown.contains("| Product | Price |"));
        assert!(markdown.contains("| Apple | $1.50 |"));
        assert!(markdown.contains("| Orange | $2.00 |"));
    }
}