hocr_parser/
element.rs

1use roxmltree::Node;
2
3use crate::{parsing::{check_property_name, parse_properties}, spec_definitions::HOCR_ELEMENTS, HOCRParserError, Result};
4
5/// Represents an hOCR element, borrowing its contents from the XML string.
6#[derive(Debug, Clone)]
7#[cfg_attr(feature = "serde", derive(serde::Serialize))]
8pub struct ElementBorrowed<'a> {
9    #[cfg_attr(feature = "serde", serde(skip))]
10    pub node: roxmltree::Node<'a, 'a>,
11    pub element_type: &'a str,
12    pub properties: Vec<(&'a str, Vec<&'a str>)>,
13    pub lang: Option<&'a str>,
14    pub text: Option<&'a str>,
15    pub children: Vec<ElementBorrowed<'a>>,
16}
17
18impl<'a> ElementBorrowed<'a> {
19    /// Create a new [`ElementBorrowed`] instance from an [`roxmltree::Node`].
20    pub fn from_node(n: Node<'a, 'a>) -> Result<Self> {
21        if !n.is_element() {
22            let pos = n.document().text_pos_at(n.range().start);
23            return Err(HOCRParserError::NodeIsNotElement(pos));
24        }
25
26        let element_type = n.attribute("class").unwrap_or("");
27
28        // check if defined in spec or whether it is implementation specific
29        if !(HOCR_ELEMENTS.contains(&element_type) || element_type.starts_with("ocrx_")) {
30            let pos = n.document().text_pos_at(n.range().start);
31            return Err(HOCRParserError::UnknownElement(pos));
32        }
33
34        let prop = n.attribute("title").unwrap_or("");
35        let properties = parse_properties(prop);
36
37        for (name, _) in &properties {
38            if check_property_name(&name) {
39                let pos = n.document().text_pos_at(n.range().start);
40                return Err(HOCRParserError::UnknownProperty(pos));
41            }
42        }
43
44        let lang = n.attribute("lang");
45        let text = n.text().unwrap_or("");
46
47        // prevent empty lines of whitespace
48        let processsed_text = {
49            let all_whitespace = text.chars().all(char::is_whitespace);
50
51            if all_whitespace {
52                None
53            } else {
54                Some(text)
55            }
56        };
57
58        let children: Vec<_> = n
59            .children()
60            .filter(Node::is_element)
61            .map(ElementBorrowed::from_node)
62            .collect();
63
64        if let Some(e) = children.iter().find(|r| r.is_err()) {
65            return Err(e.as_ref().unwrap_err().clone());
66        }
67
68        Ok(Self {
69            node: n,
70            element_type,
71            properties,
72            lang,
73            text: processsed_text,
74            children: children.into_iter().map(Result::unwrap).collect(),
75        })
76    }
77}
78
/// An hOCR element that owns all of its data.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Element {
    /// Owned copy of the element type string.
    pub element_type: String,
    /// Owned copies of the properties, as pairs of a property name and its
    /// associated values.
    pub properties: Vec<(String, Vec<String>)>,
    /// Owned copy of the language, if one is set.
    pub lang: Option<String>,
    /// Owned copy of the element text, if any.
    pub text: Option<String>,
    /// Owned child elements.
    pub children: Vec<Element>,
}
89
90impl Element {
91    /// Create a new [`Element`] instance from an [`ElementBorrowed`].
92    pub fn from_element_borrowed(e: &ElementBorrowed) -> Self {
93        Self {
94            element_type: e.element_type.to_string(),
95            properties: e
96                .properties
97                .iter()
98                .map(|(k, v)| (k.to_string(), v.iter().map(|s| s.to_string()).collect()))
99                .collect(),
100            lang: e.lang.map(|l| l.to_string()),
101            text: e.text.map(|t| t.to_string()),
102            children: e
103                .children
104                .iter()
105                .map(Element::from_element_borrowed)
106                .collect(),
107        }
108    }
109
110    /// Create a new [`Element`] instance from an [`roxmltree::Node`].
111    pub fn from_node(n: Node) -> Result<Self> {
112        let e = ElementBorrowed::from_node(n)?;
113        Ok(Self::from_element_borrowed(&e))
114    }
115}