html_to_markdown_rs/
hocr.rs

1//! hOCR 1.2 document processing
2//!
3//! Complete hOCR 1.2 specification support for extracting structured content from OCR documents.
4//!
5//! ## Features
6//!
7//! - **Full Element Support**: All 40+ hOCR 1.2 element types
8//! - **Complete Property Parsing**: All 20+ hOCR properties (bbox, baseline, fonts, etc.)
9//! - **Document Structure**: Logical hierarchy (paragraphs, sections, chapters)
10//! - **Table Extraction**: Spatial layout analysis for tabular data
11//! - **Metadata Extraction**: OCR system info, capabilities, languages
12//!
13//! ## Modules
14//!
15//! - [`types`]: Core hOCR element and property types
16//! - [`parser`]: Property parsing from title attributes
17//! - [`extractor`]: DOM to hOCR element tree extraction
18//! - [`converter`]: hOCR to Markdown conversion
19//!
20//! ## Legacy Table Extraction
21//!
22//! The original table extraction API is maintained for backward compatibility.
23
24pub mod converter;
25pub mod extractor;
26pub mod parser;
27pub mod types;
28
29// Re-export main types
30pub use converter::convert_to_markdown;
31pub use extractor::extract_hocr_document;
32pub use types::{BBox, Baseline, HocrElement, HocrElementType, HocrMetadata, HocrProperties};
33
/// A single word extracted from hOCR, with its bounding box and confidence.
///
/// The box is stored as an origin (`left`, `top`) plus a size
/// (`width`, `height`), all in the pixel units used by the hOCR `bbox`
/// property.
#[derive(Debug, Clone)]
pub struct HocrWord {
    /// Trimmed text content of the word element.
    pub text: String,
    /// X coordinate of the left edge of the bounding box.
    pub left: u32,
    /// Y coordinate of the top edge of the bounding box.
    pub top: u32,
    /// Horizontal extent of the bounding box.
    pub width: u32,
    /// Vertical extent of the bounding box.
    pub height: u32,
    /// Recognition confidence taken from the `x_wconf` property.
    pub confidence: f64,
}

impl HocrWord {
    /// Get the right edge position.
    ///
    /// Saturates at `u32::MAX` instead of overflowing when `left + width`
    /// does not fit in a `u32` (the fields are public, so arbitrary values
    /// can be supplied by callers).
    pub fn right(&self) -> u32 {
        self.left.saturating_add(self.width)
    }

    /// Get the bottom edge position.
    ///
    /// Saturates at `u32::MAX` instead of overflowing when `top + height`
    /// does not fit in a `u32`.
    pub fn bottom(&self) -> u32 {
        self.top.saturating_add(self.height)
    }

    /// Get the vertical center position.
    ///
    /// Computed in `f64`, so half-pixel centers are preserved and no
    /// integer overflow is possible.
    pub fn y_center(&self) -> f64 {
        f64::from(self.top) + f64::from(self.height) / 2.0
    }

    /// Get the horizontal center position.
    ///
    /// Computed in `f64`, so half-pixel centers are preserved and no
    /// integer overflow is possible.
    pub fn x_center(&self) -> f64 {
        f64::from(self.left) + f64::from(self.width) / 2.0
    }
}
66
/// Parse the `bbox` property from an hOCR `title` attribute.
///
/// The title is a `;`-separated list of properties, each written as a name
/// followed by whitespace-separated values. The first `bbox` property that
/// carries exactly four unsigned integers (`x1 y1 x2 y2`) wins and is
/// returned as `(left, top, width, height)`. Malformed `bbox` entries are
/// skipped so a later well-formed one can still be used.
///
/// Width and height saturate at zero when the second corner lies before the
/// first one.
///
/// When `debug` is set, every property whose name is not in the known list
/// is reported on stderr. Names are compared exactly, so e.g. `x_fontsize`
/// is reported even though it starts with the known name `x_font`.
///
/// Example: "bbox 100 50 180 80; x_wconf 95" -> (100, 50, 80, 30)
fn parse_bbox(title: &str, debug: bool) -> Option<(u32, u32, u32, u32)> {
    const KNOWN_ATTRIBUTES: [&str; 12] = [
        "bbox",
        "x_wconf",
        "baseline",
        "x_size",
        "x_descenders",
        "x_ascenders",
        "textangle",
        "poly",
        "order",
        "x_font",
        "x_fsize",
        "x_confs",
    ];

    for part in title.split(';') {
        // Tokenising on any whitespace accepts tabs and repeated spaces
        // between the property name and its values.
        let mut tokens = part.split_whitespace();
        let Some(attr_name) = tokens.next() else {
            continue; // empty segment, e.g. a trailing ';'
        };

        if debug && !KNOWN_ATTRIBUTES.contains(&attr_name) {
            eprintln!("[hOCR] Info: Found unknown title attribute: '{}'", attr_name);
        }

        if attr_name != "bbox" {
            continue;
        }

        // Any non-numeric coordinate invalidates this entry only.
        let Ok(coords) = tokens.map(str::parse::<u32>).collect::<Result<Vec<u32>, _>>() else {
            continue;
        };

        if let &[x1, y1, x2, y2] = coords.as_slice() {
            return Some((x1, y1, x2.saturating_sub(x1), y2.saturating_sub(y1)));
        }
    }

    None
}
114
/// Parse the word confidence (`x_wconf`) from an hOCR `title` attribute.
///
/// The title is a `;`-separated list of properties. The first `x_wconf`
/// property that carries exactly one value parseable as `f64` is returned.
/// The name and value may be separated by any whitespace (spaces or tabs).
/// Entries with a missing, extra or non-numeric value are skipped.
///
/// Returns `0.0` when no usable `x_wconf` property is present.
///
/// Example: "bbox 100 50 180 80; x_wconf 95" -> 95.0
fn parse_confidence(title: &str) -> f64 {
    title
        .split(';')
        .find_map(|part| {
            let mut tokens = part.split_whitespace();
            // Require exactly `x_wconf <value>`; a third token means the
            // entry is malformed and must not be half-accepted.
            match (tokens.next(), tokens.next(), tokens.next()) {
                (Some("x_wconf"), Some(value), None) => value.parse::<f64>().ok(),
                _ => None,
            }
        })
        .unwrap_or(0.0)
}
129
130/// Extract text content from a node
131fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
132    let mut text = String::new();
133
134    if let Some(node) = node_handle.get(parser) {
135        match node {
136            tl::Node::Raw(bytes) => {
137                text.push_str(&bytes.as_utf8_str());
138            }
139            tl::Node::Tag(tag) => {
140                let children = tag.children();
141                for child_handle in children.top().iter() {
142                    text.push_str(&get_text_content(child_handle, parser));
143                }
144            }
145            tl::Node::Comment(_) => {}
146        }
147    }
148
149    text
150}
151
152/// Extract hOCR words from a DOM tree
153///
154/// Walks the DOM and extracts all elements with `ocrx_word` class,
155/// parsing their bbox and confidence information.
156pub fn extract_hocr_words(
157    node_handle: &tl::NodeHandle,
158    parser: &tl::Parser,
159    min_confidence: f64,
160    debug: bool,
161) -> Vec<HocrWord> {
162    let mut words = Vec::new();
163
164    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
165        let tag_name = tag.name().as_utf8_str();
166        let attrs = tag.attributes();
167
168        let class_attr = attrs.get("class").flatten().map(|v| v.as_utf8_str().to_string());
169
170        if let Some(ref classes) = class_attr {
171            let known_classes = [
172                "ocr_page",
173                "ocr_carea",
174                "ocr_par",
175                "ocr_line",
176                "ocrx_word",
177                "ocr_header",
178                "ocr_footer",
179                "ocr_table",
180                "ocr_caption",
181                "ocr_textfloat",
182                "ocr_separator",
183                "ocr_noise",
184            ];
185
186            let class_list: Vec<&str> = classes.split_whitespace().collect();
187            let has_ocr_class = class_list.iter().any(|c| c.starts_with("ocr"));
188
189            if has_ocr_class && debug {
190                for class in &class_list {
191                    if class.starts_with("ocr") && !known_classes.contains(class) {
192                        eprintln!("[hOCR] Info: Found unhandled hOCR class '{}' on <{}>", class, tag_name);
193                    }
194                }
195            }
196        }
197
198        if tag_name == "span" {
199            let is_word = class_attr.as_ref().is_some_and(|c| c.contains("ocrx_word"));
200            let title = attrs.get("title").flatten().map(|v| v.as_utf8_str());
201
202            if is_word {
203                let title_str = title.as_deref().unwrap_or("");
204                if let Some((left, top, width, height)) = parse_bbox(title_str, debug) {
205                    let confidence = parse_confidence(title_str);
206
207                    if confidence >= min_confidence {
208                        let text = get_text_content(node_handle, parser).trim().to_string();
209
210                        if !text.is_empty() {
211                            words.push(HocrWord {
212                                text,
213                                left,
214                                top,
215                                width,
216                                height,
217                                confidence,
218                            });
219                        } else if debug {
220                            eprintln!(
221                                "[hOCR] Warning: ocrx_word element has no text content (bbox: {})",
222                                title_str
223                            );
224                        }
225                    } else if debug {
226                        eprintln!(
227                            "[hOCR] Warning: Word confidence ({:.1}) below threshold ({:.1}): {}",
228                            confidence,
229                            min_confidence,
230                            get_text_content(node_handle, parser).trim()
231                        );
232                    }
233                } else if debug {
234                    let text = get_text_content(node_handle, parser);
235                    let trimmed = text.trim();
236                    eprintln!(
237                        "[hOCR] Warning: Failed to parse bbox for ocrx_word element: {} (title: {})",
238                        if trimmed.is_empty() { "<empty>" } else { trimmed },
239                        title_str
240                    );
241                }
242            }
243        }
244
245        let children = tag.children();
246        for child_handle in children.top().iter() {
247            words.extend(extract_hocr_words(child_handle, parser, min_confidence, debug));
248        }
249    }
250
251    words
252}
253
254/// Detect column positions from word positions
255///
256/// Groups words by their x-position and returns the median x-position
257/// for each detected column.
258///
259/// Optimized with O(n log n) complexity using sorted insertion.
260pub fn detect_columns(words: &[HocrWord], column_threshold: u32) -> Vec<u32> {
261    if words.is_empty() {
262        return Vec::new();
263    }
264
265    let mut x_positions: Vec<u32> = words.iter().map(|w| w.left).collect();
266    x_positions.sort_unstable();
267
268    let mut position_groups: Vec<Vec<u32>> = Vec::new();
269    let mut current_group = vec![x_positions[0]];
270
271    for &x_pos in &x_positions[1..] {
272        let matches_group = current_group.iter().any(|&pos| x_pos.abs_diff(pos) <= column_threshold);
273
274        if matches_group {
275            current_group.push(x_pos);
276        } else {
277            position_groups.push(std::mem::replace(&mut current_group, vec![x_pos]));
278        }
279    }
280
281    if !current_group.is_empty() {
282        position_groups.push(current_group);
283    }
284
285    let mut columns: Vec<u32> = position_groups
286        .iter()
287        .map(|group| {
288            let mid = group.len() / 2;
289            group[mid]
290        })
291        .collect();
292
293    columns.sort_unstable();
294    columns
295}
296
297/// Detect row positions from word positions
298///
299/// Groups words by their vertical center position and returns the median
300/// y-position for each detected row.
301///
302/// Optimized with O(n log n) complexity using sorted insertion.
303pub fn detect_rows(words: &[HocrWord], row_threshold_ratio: f64) -> Vec<u32> {
304    if words.is_empty() {
305        return Vec::new();
306    }
307
308    let mut heights: Vec<u32> = words.iter().map(|w| w.height).collect();
309    heights.sort_unstable();
310    let median_height = heights[heights.len() / 2];
311    let row_threshold = median_height as f64 * row_threshold_ratio;
312
313    let mut y_centers: Vec<f64> = words.iter().map(|w| w.y_center()).collect();
314    y_centers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
315
316    let mut position_groups: Vec<Vec<f64>> = Vec::new();
317    let mut current_group = vec![y_centers[0]];
318
319    for &y_center in &y_centers[1..] {
320        let matches_group = current_group.iter().any(|&pos| (y_center - pos).abs() <= row_threshold);
321
322        if matches_group {
323            current_group.push(y_center);
324        } else {
325            position_groups.push(std::mem::replace(&mut current_group, vec![y_center]));
326        }
327    }
328
329    if !current_group.is_empty() {
330        position_groups.push(current_group);
331    }
332
333    let mut rows: Vec<u32> = position_groups
334        .iter()
335        .map(|group| {
336            let mid = group.len() / 2;
337            group[mid] as u32
338        })
339        .collect();
340
341    rows.sort_unstable();
342    rows
343}
344
345/// Reconstruct table structure from words
346///
347/// Takes detected words and reconstructs a 2D table by:
348/// 1. Detecting column and row positions
349/// 2. Assigning words to cells based on position
350/// 3. Combining words within the same cell
351pub fn reconstruct_table(
352    words: &[HocrWord],
353    column_threshold: u32,
354    row_threshold_ratio: f64,
355    debug: bool,
356) -> Vec<Vec<String>> {
357    if words.is_empty() {
358        if debug {
359            eprintln!("[hOCR] Warning: No words to reconstruct table from");
360        }
361        return Vec::new();
362    }
363
364    let col_positions = detect_columns(words, column_threshold);
365    let row_positions = detect_rows(words, row_threshold_ratio);
366
367    if col_positions.is_empty() || row_positions.is_empty() {
368        if debug {
369            eprintln!(
370                "[hOCR] Warning: Could not detect table structure (columns: {}, rows: {})",
371                col_positions.len(),
372                row_positions.len()
373            );
374        }
375        return Vec::new();
376    }
377
378    if debug {
379        eprintln!(
380            "[hOCR] Detected table structure: {} rows × {} columns",
381            row_positions.len(),
382            col_positions.len()
383        );
384    }
385
386    let num_rows = row_positions.len();
387    let num_cols = col_positions.len();
388    let mut table: Vec<Vec<Vec<String>>> = vec![vec![vec![]; num_cols]; num_rows];
389    let mut unassigned_words = 0;
390
391    for word in words {
392        if let (Some(r), Some(c)) = (
393            find_row_index(&row_positions, word),
394            find_column_index(&col_positions, word),
395        ) {
396            if r < num_rows && c < num_cols {
397                table[r][c].push(word.text.clone());
398            } else {
399                unassigned_words += 1;
400                if debug {
401                    eprintln!(
402                        "[hOCR] Warning: Word '{}' assigned to out-of-bounds cell ({}, {})",
403                        word.text, r, c
404                    );
405                }
406            }
407        } else {
408            unassigned_words += 1;
409            if debug {
410                eprintln!(
411                    "[hOCR] Warning: Could not assign word '{}' to any cell (position: {}, {})",
412                    word.text, word.left, word.top
413                );
414            }
415        }
416    }
417
418    if debug && unassigned_words > 0 {
419        eprintln!(
420            "[hOCR] Warning: {} out of {} words could not be assigned to table cells",
421            unassigned_words,
422            words.len()
423        );
424    }
425
426    let result: Vec<Vec<String>> = table
427        .into_iter()
428        .map(|row| {
429            row.into_iter()
430                .map(|cell_words| {
431                    if cell_words.is_empty() {
432                        String::new()
433                    } else {
434                        cell_words.join(" ")
435                    }
436                })
437                .collect()
438        })
439        .collect();
440
441    remove_empty_rows_and_columns(result)
442}
443
444/// Find which row a word belongs to based on its y-center
445fn find_row_index(row_positions: &[u32], word: &HocrWord) -> Option<usize> {
446    let y_center = word.y_center() as u32;
447
448    row_positions
449        .iter()
450        .enumerate()
451        .min_by_key(|&(_, row_y)| row_y.abs_diff(y_center))
452        .map(|(idx, _)| idx)
453}
454
455/// Find which column a word belongs to based on its x-position
456fn find_column_index(col_positions: &[u32], word: &HocrWord) -> Option<usize> {
457    let x_pos = word.left;
458
459    col_positions
460        .iter()
461        .enumerate()
462        .min_by_key(|&(_, col_x)| col_x.abs_diff(x_pos))
463        .map(|(idx, _)| idx)
464}
465
/// Remove empty rows and columns from a table.
///
/// A cell counts as empty when it contains only whitespace. Rows whose
/// cells are all empty are dropped, and so is every column that is empty in
/// all rows. The relative order of the remaining rows and cells is kept.
///
/// Rows may have different lengths: the column count is taken from the
/// longest row, so a row longer than the first one no longer causes an
/// out-of-bounds panic.
fn remove_empty_rows_and_columns(table: Vec<Vec<String>>) -> Vec<Vec<String>> {
    let num_cols = table.iter().map(Vec::len).max().unwrap_or(0);

    // keep_col[i] becomes true once any row has content in column i.
    let mut keep_col = vec![false; num_cols];
    for row in &table {
        for (keep, cell) in keep_col.iter_mut().zip(row) {
            if !cell.trim().is_empty() {
                *keep = true;
            }
        }
    }

    table
        .into_iter()
        .filter(|row| row.iter().any(|cell| !cell.trim().is_empty()))
        .map(|row| {
            row.into_iter()
                .zip(&keep_col)
                .filter(|(_, keep)| **keep)
                .map(|(cell, _)| cell)
                .collect()
        })
        .collect()
}
495
/// Convert a table to Markdown (pipe-table) format.
///
/// The first row is emitted as the header, followed by a `---` delimiter
/// row and then the remaining rows. Every line ends with `\n`.
///
/// - The column count is that of the longest row; shorter rows are padded
///   with empty cells so header, delimiter and body always agree.
/// - `|` inside a cell is escaped as `\|`.
/// - Line breaks inside a cell are replaced by a single space, since a
///   pipe-table row must stay on one line.
///
/// Returns an empty string when the table has no rows or no columns.
pub fn table_to_markdown(table: &[Vec<String>]) -> String {
    let num_cols = table.iter().map(Vec::len).max().unwrap_or(0);
    if num_cols == 0 {
        return String::new();
    }

    let mut markdown = String::new();

    for (row_idx, row) in table.iter().enumerate() {
        markdown.push('|');
        for col_idx in 0..num_cols {
            // Missing trailing cells are rendered as empty cells.
            let cell = row.get(col_idx).map_or("", String::as_str);

            markdown.push(' ');
            for (line_idx, line) in cell.lines().enumerate() {
                if line_idx > 0 {
                    markdown.push(' ');
                }
                markdown.push_str(&line.replace('|', "\\|"));
            }
            markdown.push_str(" |");
        }
        markdown.push('\n');

        // The delimiter row directly follows the header row.
        if row_idx == 0 {
            markdown.push('|');
            for _ in 0..num_cols {
                markdown.push_str(" --- |");
            }
            markdown.push('\n');
        }
    }

    markdown
}
529
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a word fixture from its text, bounding box and confidence.
    fn word(text: &str, left: u32, top: u32, width: u32, height: u32, confidence: f64) -> HocrWord {
        HocrWord {
            text: text.to_string(),
            left,
            top,
            width,
            height,
            confidence,
        }
    }

    /// Parse an hOCR snippet and collect, in document order, every word
    /// whose confidence is at least `min_confidence`.
    fn words_from_hocr(hocr: &str, min_confidence: f64) -> Vec<HocrWord> {
        let dom = tl::parse(hocr, tl::ParserOptions::default()).unwrap();
        let parser = dom.parser();

        dom.children()
            .iter()
            .flat_map(|child_handle| extract_hocr_words(child_handle, parser, min_confidence, false))
            .collect()
    }

    #[test]
    fn test_parse_bbox() {
        assert_eq!(parse_bbox("bbox 100 50 180 80", false), Some((100, 50, 80, 30)));
        assert_eq!(parse_bbox("bbox 0 0 100 200", false), Some((0, 0, 100, 200)));
        assert_eq!(
            parse_bbox("bbox 100 50 180 80; x_wconf 95", false),
            Some((100, 50, 80, 30))
        );
        assert_eq!(parse_bbox("invalid", false), None);
        assert_eq!(parse_bbox("bbox 100 50", false), None);
    }

    #[test]
    fn test_parse_confidence() {
        assert_eq!(parse_confidence("x_wconf 95.5"), 95.5);
        assert_eq!(parse_confidence("bbox 100 50 180 80; x_wconf 92"), 92.0);
        assert_eq!(parse_confidence("invalid"), 0.0);
    }

    #[test]
    fn test_hocr_word_methods() {
        let hello = word("Hello", 100, 50, 80, 30, 95.5);

        assert_eq!(hello.right(), 180);
        assert_eq!(hello.bottom(), 80);
        assert_eq!(hello.y_center(), 65.0);
        assert_eq!(hello.x_center(), 140.0);
    }

    #[test]
    fn test_detect_columns() {
        let words = vec![
            word("A", 100, 50, 20, 30, 95.0),
            word("B", 200, 50, 20, 30, 95.0),
            word("C", 105, 100, 20, 30, 95.0),
        ];

        let columns = detect_columns(&words, 50);
        assert_eq!(columns.len(), 2);
        assert!(columns.contains(&100) || columns.contains(&105));
        assert!(columns.contains(&200));
    }

    #[test]
    fn test_table_to_markdown() {
        let table = vec![
            vec!["Header1".to_string(), "Header2".to_string()],
            vec!["Cell1".to_string(), "Cell2".to_string()],
        ];

        let markdown = table_to_markdown(&table);
        assert!(markdown.contains("| Header1 | Header2 |"));
        assert!(markdown.contains("| --- | --- |"));
        assert!(markdown.contains("| Cell1 | Cell2 |"));
    }

    #[test]
    fn test_table_to_markdown_escape_pipes() {
        let table = vec![vec!["A|B".to_string(), "C".to_string()]];

        let markdown = table_to_markdown(&table);
        assert!(markdown.contains("A\\|B"));
    }

    #[test]
    fn test_extract_hocr_words() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">Hello</span>
                <span class="ocrx_word" title="bbox 160 50 210 80; x_wconf 92">World</span>
            </div>
        "#;

        let words = words_from_hocr(hocr, 0.0);

        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "Hello");
        assert_eq!(words[0].left, 100);
        assert_eq!(words[0].confidence, 95.0);

        assert_eq!(words[1].text, "World");
        assert_eq!(words[1].left, 160);
        assert_eq!(words[1].confidence, 92.0);
    }

    #[test]
    fn test_extract_hocr_words_confidence_filter() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">HighConf</span>
                <span class="ocrx_word" title="bbox 160 50 210 80; x_wconf 50">LowConf</span>
                <span class="ocrx_word" title="bbox 220 50 270 80; x_wconf 98">VeryHigh</span>
            </div>
        "#;

        let words = words_from_hocr(hocr, 90.0);

        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "HighConf");
        assert_eq!(words[1].text, "VeryHigh");
    }

    #[test]
    fn test_reconstruct_simple_table() {
        let words = vec![
            word("Name", 100, 50, 50, 20, 95.0),
            word("Age", 200, 50, 50, 20, 95.0),
            word("Alice", 100, 100, 50, 20, 95.0),
            word("30", 200, 100, 50, 20, 95.0),
        ];

        let table = reconstruct_table(&words, 50, 0.5, false);

        assert_eq!(table.len(), 2);
        assert_eq!(table[0].len(), 2);
        assert_eq!(table[0][0], "Name");
        assert_eq!(table[0][1], "Age");
        assert_eq!(table[1][0], "Alice");
        assert_eq!(table[1][1], "30");
    }

    #[test]
    fn test_reconstruct_table_with_multi_word_cells() {
        let words = vec![
            word("First", 100, 50, 30, 20, 95.0),
            word("Name", 135, 50, 30, 20, 95.0),
            word("Last", 200, 50, 30, 20, 95.0),
            word("Name", 235, 50, 30, 20, 95.0),
        ];

        let table = reconstruct_table(&words, 50, 0.5, false);

        assert_eq!(table.len(), 1);
        assert_eq!(table[0].len(), 2);
        assert_eq!(table[0][0], "First Name");
        assert_eq!(table[0][1], "Last Name");
    }

    #[test]
    fn test_end_to_end_hocr_table_extraction() {
        let hocr = r#"
            <div class="ocr_page">
                <span class="ocrx_word" title="bbox 100 50 140 70; x_wconf 95">Product</span>
                <span class="ocrx_word" title="bbox 200 50 240 70; x_wconf 95">Price</span>
                <span class="ocrx_word" title="bbox 100 100 140 120; x_wconf 95">Apple</span>
                <span class="ocrx_word" title="bbox 200 100 240 120; x_wconf 95">$1.50</span>
                <span class="ocrx_word" title="bbox 100 150 140 170; x_wconf 95">Orange</span>
                <span class="ocrx_word" title="bbox 200 150 240 170; x_wconf 95">$2.00</span>
            </div>
        "#;

        let words = words_from_hocr(hocr, 0.0);

        let table = reconstruct_table(&words, 50, 0.5, false);
        let markdown = table_to_markdown(&table);

        assert_eq!(table.len(), 3);
        assert_eq!(table[0][0], "Product");
        assert_eq!(table[0][1], "Price");
        assert_eq!(table[1][0], "Apple");
        assert_eq!(table[1][1], "$1.50");
        assert_eq!(table[2][0], "Orange");
        assert_eq!(table[2][1], "$2.00");

        assert!(markdown.contains("| Product | Price |"));
        assert!(markdown.contains("| Apple | $1.50 |"));
        assert!(markdown.contains("| Orange | $2.00 |"));
    }
}
html_to_markdown_rs/hocr.rs

html_to_markdown_rs/
hocr.rs