hocr_parser/
hocr.rs

1use roxmltree::{Node, ParsingOptions};
2
3use crate::error::Result;
4use crate::{element::Element, element::ElementBorrowed, HOCRParserError};
5
6/// Represents a hOCR file, borrowing its contents from the XML string.
7#[derive(Debug, Clone)]
8#[cfg_attr(feature = "serde", derive(serde::Serialize))]
9pub struct HOCRBorrowed<'input> {
10    #[cfg_attr(feature = "serde", serde(skip))]
11    pub document: &'input roxmltree::Document<'input>,
12    pub system: &'input str,
13    pub capabilities: Vec<&'input str>,
14    pub number_of_pages: Option<u32>,
15    pub langs: Option<Vec<&'input str>>,
16    pub scripts: Option<Vec<&'input str>>,
17    pub elements: Vec<ElementBorrowed<'input>>,
18}
19
20impl<'input> HOCRBorrowed<'input> {
21    /// Create a new [`HOCRBorrowed`] instance from a [`roxmltree::Document`].
22    pub fn new_from_document(document: &'input roxmltree::Document<'input>) -> Result<Self> {
23        let head = document
24            .root_element()
25            .children()
26            .find(|e| e.tag_name().name() == "head")
27            .ok_or(HOCRParserError::NoHeadElement)?;
28
29        let metadata: Vec<_> = head
30            .children()
31            .filter_map(|e| {
32                if e.tag_name().name() == "meta" && e.has_attribute("name") {
33                    let name = e.attribute("name")?;
34                    let content = e.attribute("content")?;
35                    Some((name, content))
36                } else {
37                    None
38                }
39            })
40            .collect();
41
42        let system = metadata
43            .iter()
44            .find(|(name, _)| *name == "ocr-system")
45            .map(|(_, content)| content)
46            .ok_or(HOCRParserError::NoOCRSystem)?;
47
48        let capabilities = metadata
49            .iter()
50            .find(|(name, _)| *name == "ocr-capabilities")
51            .map(|(_, content)| content.split_whitespace().collect())
52            .ok_or(HOCRParserError::NoOCRCapabilities)?;
53
54        let number_of_pages = metadata
55            .iter()
56            .find(|(name, _)| *name == "ocr-number-of-pages")
57            .map(|(_, content)| content.parse().ok())
58            .flatten();
59
60        let langs = metadata
61            .iter()
62            .find(|(name, _)| *name == "ocr-langs")
63            .map(|(_, content)| content.split_whitespace().collect());
64
65        let scripts = metadata
66            .iter()
67            .find(|(name, _)| *name == "scripts")
68            .map(|(_, content)| content.split_whitespace().collect());
69
70        let body = document
71            .root_element()
72            .children()
73            .find(|e| e.tag_name().name() == "body")
74            .ok_or(HOCRParserError::NoBodyElement)?;
75
76        let elements: Vec<_> = body
77            .children()
78            .filter(Node::is_element)
79            .map(ElementBorrowed::from_node)
80            .collect();
81
82        if let Some(e) = elements.iter().find(|r| r.is_err()) {
83            return Err(e.as_ref().unwrap_err().clone());
84        }
85
86        Ok(Self {
87            document: document,
88            system,
89            capabilities,
90            number_of_pages,
91            langs,
92            scripts,
93            elements: elements.into_iter().map(Result::unwrap).collect(),
94        })
95    }
96}
97
98/// Represents a hOCR file.
99#[derive(Debug, Clone)]
100#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
101pub struct HOCR {
102    pub system: String,
103    pub capabilities: Vec<String>,
104    pub number_of_pages: Option<u32>,
105    pub langs: Option<Vec<String>>,
106    pub scripts: Option<Vec<String>>,
107    pub elements: Vec<Element>,
108}
109
110impl HOCR {
111    /// Create a new [`HOCR`] instance from a string containing hOCR XML.
112    pub fn from_str(xml_str: &str) -> Result<Self> {
113        let mut options = ParsingOptions::default();
114        options.allow_dtd = true;
115
116        let doc = roxmltree::Document::parse_with_options(&xml_str, options)?;
117
118        let hocr = HOCRBorrowed::new_from_document(&doc)?;
119        Ok(Self::from_hocr_borrowed(hocr))
120    }
121
122    /// Create a new [`HOCR`] instance from a [`HOCRBorrowed`].
123    pub fn from_hocr_borrowed(hocr: HOCRBorrowed) -> Self {
124        Self {
125            system: hocr.system.to_string(),
126            capabilities: hocr.capabilities.iter().map(|s| s.to_string()).collect(),
127            number_of_pages: hocr.number_of_pages,
128            langs: hocr
129                .langs
130                .map(|l| l.iter().map(|s| s.to_string()).collect()),
131            scripts: hocr
132                .scripts
133                .map(|s| s.iter().map(|s| s.to_string()).collect()),
134            elements: hocr
135                .elements
136                .iter()
137                .map(Element::from_element_borrowed)
138                .collect(),
139        }
140    }
141}