1use roxmltree::{Node, ParsingOptions};
2
3use crate::error::Result;
4use crate::{element::Element, element::ElementBorrowed, HOCRParserError};
5
6#[derive(Debug, Clone)]
8#[cfg_attr(feature = "serde", derive(serde::Serialize))]
9pub struct HOCRBorrowed<'input> {
10 #[cfg_attr(feature = "serde", serde(skip))]
11 pub document: &'input roxmltree::Document<'input>,
12 pub system: &'input str,
13 pub capabilities: Vec<&'input str>,
14 pub number_of_pages: Option<u32>,
15 pub langs: Option<Vec<&'input str>>,
16 pub scripts: Option<Vec<&'input str>>,
17 pub elements: Vec<ElementBorrowed<'input>>,
18}
19
20impl<'input> HOCRBorrowed<'input> {
21 pub fn new_from_document(document: &'input roxmltree::Document<'input>) -> Result<Self> {
23 let head = document
24 .root_element()
25 .children()
26 .find(|e| e.tag_name().name() == "head")
27 .ok_or(HOCRParserError::NoHeadElement)?;
28
29 let metadata: Vec<_> = head
30 .children()
31 .filter_map(|e| {
32 if e.tag_name().name() == "meta" && e.has_attribute("name") {
33 let name = e.attribute("name")?;
34 let content = e.attribute("content")?;
35 Some((name, content))
36 } else {
37 None
38 }
39 })
40 .collect();
41
42 let system = metadata
43 .iter()
44 .find(|(name, _)| *name == "ocr-system")
45 .map(|(_, content)| content)
46 .ok_or(HOCRParserError::NoOCRSystem)?;
47
48 let capabilities = metadata
49 .iter()
50 .find(|(name, _)| *name == "ocr-capabilities")
51 .map(|(_, content)| content.split_whitespace().collect())
52 .ok_or(HOCRParserError::NoOCRCapabilities)?;
53
54 let number_of_pages = metadata
55 .iter()
56 .find(|(name, _)| *name == "ocr-number-of-pages")
57 .map(|(_, content)| content.parse().ok())
58 .flatten();
59
60 let langs = metadata
61 .iter()
62 .find(|(name, _)| *name == "ocr-langs")
63 .map(|(_, content)| content.split_whitespace().collect());
64
65 let scripts = metadata
66 .iter()
67 .find(|(name, _)| *name == "scripts")
68 .map(|(_, content)| content.split_whitespace().collect());
69
70 let body = document
71 .root_element()
72 .children()
73 .find(|e| e.tag_name().name() == "body")
74 .ok_or(HOCRParserError::NoBodyElement)?;
75
76 let elements: Vec<_> = body
77 .children()
78 .filter(Node::is_element)
79 .map(ElementBorrowed::from_node)
80 .collect();
81
82 if let Some(e) = elements.iter().find(|r| r.is_err()) {
83 return Err(e.as_ref().unwrap_err().clone());
84 }
85
86 Ok(Self {
87 document: document,
88 system,
89 capabilities,
90 number_of_pages,
91 langs,
92 scripts,
93 elements: elements.into_iter().map(Result::unwrap).collect(),
94 })
95 }
96}
97
98#[derive(Debug, Clone)]
100#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
101pub struct HOCR {
102 pub system: String,
103 pub capabilities: Vec<String>,
104 pub number_of_pages: Option<u32>,
105 pub langs: Option<Vec<String>>,
106 pub scripts: Option<Vec<String>>,
107 pub elements: Vec<Element>,
108}
109
110impl HOCR {
111 pub fn from_str(xml_str: &str) -> Result<Self> {
113 let mut options = ParsingOptions::default();
114 options.allow_dtd = true;
115
116 let doc = roxmltree::Document::parse_with_options(&xml_str, options)?;
117
118 let hocr = HOCRBorrowed::new_from_document(&doc)?;
119 Ok(Self::from_hocr_borrowed(hocr))
120 }
121
122 pub fn from_hocr_borrowed(hocr: HOCRBorrowed) -> Self {
124 Self {
125 system: hocr.system.to_string(),
126 capabilities: hocr.capabilities.iter().map(|s| s.to_string()).collect(),
127 number_of_pages: hocr.number_of_pages,
128 langs: hocr
129 .langs
130 .map(|l| l.iter().map(|s| s.to_string()).collect()),
131 scripts: hocr
132 .scripts
133 .map(|s| s.iter().map(|s| s.to_string()).collect()),
134 elements: hocr
135 .elements
136 .iter()
137 .map(Element::from_element_borrowed)
138 .collect(),
139 }
140 }
141}