1use roxmltree::Node;
2
3use crate::{parsing::{check_property_name, parse_properties}, spec_definitions::HOCR_ELEMENTS, HOCRParserError, Result};
4
5#[derive(Debug, Clone)]
7#[cfg_attr(feature = "serde", derive(serde::Serialize))]
8pub struct ElementBorrowed<'a> {
9 #[cfg_attr(feature = "serde", serde(skip))]
10 pub node: roxmltree::Node<'a, 'a>,
11 pub element_type: &'a str,
12 pub properties: Vec<(&'a str, Vec<&'a str>)>,
13 pub lang: Option<&'a str>,
14 pub text: Option<&'a str>,
15 pub children: Vec<ElementBorrowed<'a>>,
16}
17
18impl<'a> ElementBorrowed<'a> {
19 pub fn from_node(n: Node<'a, 'a>) -> Result<Self> {
21 if !n.is_element() {
22 let pos = n.document().text_pos_at(n.range().start);
23 return Err(HOCRParserError::NodeIsNotElement(pos));
24 }
25
26 let element_type = n.attribute("class").unwrap_or("");
27
28 if !(HOCR_ELEMENTS.contains(&element_type) || element_type.starts_with("ocrx_")) {
30 let pos = n.document().text_pos_at(n.range().start);
31 return Err(HOCRParserError::UnknownElement(pos));
32 }
33
34 let prop = n.attribute("title").unwrap_or("");
35 let properties = parse_properties(prop);
36
37 for (name, _) in &properties {
38 if check_property_name(&name) {
39 let pos = n.document().text_pos_at(n.range().start);
40 return Err(HOCRParserError::UnknownProperty(pos));
41 }
42 }
43
44 let lang = n.attribute("lang");
45 let text = n.text().unwrap_or("");
46
47 let processsed_text = {
49 let all_whitespace = text.chars().all(char::is_whitespace);
50
51 if all_whitespace {
52 None
53 } else {
54 Some(text)
55 }
56 };
57
58 let children: Vec<_> = n
59 .children()
60 .filter(Node::is_element)
61 .map(ElementBorrowed::from_node)
62 .collect();
63
64 if let Some(e) = children.iter().find(|r| r.is_err()) {
65 return Err(e.as_ref().unwrap_err().clone());
66 }
67
68 Ok(Self {
69 node: n,
70 element_type,
71 properties,
72 lang,
73 text: processsed_text,
74 children: children.into_iter().map(Result::unwrap).collect(),
75 })
76 }
77}
78
/// An owned hOCR element that does not borrow from any XML document.
///
/// It mirrors [`ElementBorrowed`] without the source node, using owned `String`s throughout.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Element {
    /// Owned copy of the borrowed element's `element_type`.
    pub element_type: String,
    /// Owned copy of the borrowed element's `properties`: each entry pairs a name with a
    /// list of strings.
    pub properties: Vec<(String, Vec<String>)>,
    /// Owned copy of the borrowed element's `lang`, if any.
    pub lang: Option<String>,
    /// Owned copy of the borrowed element's `text`, if any.
    pub text: Option<String>,
    /// Owned copies of the borrowed element's children, in the same order.
    pub children: Vec<Element>,
}
89
90impl Element {
91 pub fn from_element_borrowed(e: &ElementBorrowed) -> Self {
93 Self {
94 element_type: e.element_type.to_string(),
95 properties: e
96 .properties
97 .iter()
98 .map(|(k, v)| (k.to_string(), v.iter().map(|s| s.to_string()).collect()))
99 .collect(),
100 lang: e.lang.map(|l| l.to_string()),
101 text: e.text.map(|t| t.to_string()),
102 children: e
103 .children
104 .iter()
105 .map(Element::from_element_borrowed)
106 .collect(),
107 }
108 }
109
110 pub fn from_node(n: Node) -> Result<Self> {
112 let e = ElementBorrowed::from_node(n)?;
113 Ok(Self::from_element_borrowed(&e))
114 }
115}