hocr_parser/
lib.rs

1//! # Overview
2//! 
3//! A parser for the [hOCR](http://kba.github.io/hocr-spec/1.2/) format, "an open standard for representing document layout analysis and OCR results as a subset of HTML."
4//! 
5//! ## Design 
6//! 
7//! This parser uses [`roxmltree`] to parse the XHTML. It provides easy access to the embedded hOCR data through the [`HOCR`] and [`Element`] structs, as well as their "borrowed" counterparts ([`HOCRBorrowed`], [`ElementBorrowed`]), which avoid allocating for property names.
8//! 
9//! The parser does not validate if the file adheres to the hOCR specification. It checks required metadata and validity of hOCR element and property names but does not check property values.
10
11mod element;
12mod error;
13mod iter;
14mod hocr;
15mod parsing;
16/// Contains the element and property names defined in the hOCR specification.
17pub mod spec_definitions;
18
19pub use error::{HOCRParserError, Result};
20pub use hocr::{HOCR, HOCRBorrowed};
21pub use element::{Element, ElementBorrowed};
22
23pub use roxmltree;
24
#[cfg(test)]
mod tests {
    use super::*;

    /// A quoted value that contains spaces and dots must be returned as a
    /// single value, without its surrounding quotes.
    #[test]
    fn parse_quoted_properties() {
        let input = "image \"Screenshot 2024-05-12 at 14.21.17.png\"; bbox 0 0 796 1314; ppageno 0; scan_res 144 144";
        let parsed = parsing::parse_properties(&input);
        println!("{:?}", parsed);

        let image = parsed
            .iter()
            .find(|entry| entry.0 == "image")
            .unwrap();
        let expected = vec!["Screenshot 2024-05-12 at 14.21.17.png"];
        assert_eq!(image.1, expected);
    }

    /// Quoted and unquoted values may be mixed within one property; every
    /// value must be returned in its original order with quotes stripped.
    #[test]
    fn parse_multiple_quoted_properties() {
        let input = r#"x_source abc def "/gfs/cc/clean/012345678911" "17" abc def "Screenshot 2024-05-12 at 14.21.17.png""#;
        let parsed = parsing::parse_properties(&input);
        println!("{:?}", parsed);

        let x_source = parsed
            .iter()
            .find(|entry| entry.0 == "x_source")
            .unwrap();
        let expected = vec![
            "abc",
            "def",
            "/gfs/cc/clean/012345678911",
            "17",
            "abc",
            "def",
            "Screenshot 2024-05-12 at 14.21.17.png",
        ];
        assert_eq!(x_source.1, expected);
    }

    /// An empty string yields no properties.
    #[test]
    fn parse_empty_property() {
        let input = "";
        let parsed = parsing::parse_properties(&input);
        assert_eq!(parsed.len(), 0);
    }

    /// Input made up solely of spaces, tabs and newlines yields no properties.
    #[test]
    fn parse_just_whitespace_property() {
        let input = "     \n  \t  \n  \t  \n";
        let parsed = parsing::parse_properties(&input);
        assert_eq!(parsed.len(), 0);
    }
}
71}