html_to_markdown_rs/hocr/
parser.rs1use super::types::{BBox, Baseline, HocrProperties};
6use crate::text::decode_html_entities;
7
8pub fn parse_properties(title: &str, debug: bool) -> HocrProperties {
10 let mut props = HocrProperties::default();
11
12 let title = decode_html_entities(title);
14
15 for part in title.split(';') {
16 let part = part.trim();
17 if part.is_empty() {
18 continue;
19 }
20
21 let mut tokens = part.split_whitespace();
22 if let Some(key) = tokens.next() {
23 match key {
24 "bbox" => {
25 if let Some(bbox) = parse_bbox_coords(&mut tokens) {
26 props.bbox = Some(bbox);
27 }
28 }
29 "baseline" => {
30 if let Some(baseline) = parse_baseline(&mut tokens) {
31 props.baseline = Some(baseline);
32 }
33 }
34 "textangle" => {
35 if let Some(angle_str) = tokens.next() {
36 if let Ok(angle) = angle_str.parse::<f64>() {
37 props.textangle = Some(angle);
38 }
39 }
40 }
41 "poly" => {
42 props.poly = parse_poly(&mut tokens);
43 }
44 "x_wconf" => {
45 if let Some(conf_str) = tokens.next() {
46 if let Ok(conf) = conf_str.parse::<f64>() {
47 props.x_wconf = Some(conf);
48 }
49 }
50 }
51 "x_confs" => {
52 props.x_confs = parse_float_list(&mut tokens);
53 }
54 "nlp" => {
55 props.nlp = parse_float_list(&mut tokens);
56 }
57 "x_font" => {
58 if let Some(font) = parse_quoted_string(part) {
59 props.x_font = Some(font);
60 }
61 }
62 "x_fsize" => {
63 if let Some(size_str) = tokens.next() {
64 if let Ok(size) = size_str.parse::<u32>() {
65 props.x_fsize = Some(size);
66 }
67 }
68 }
69 "order" => {
70 if let Some(order_str) = tokens.next() {
71 if let Ok(order) = order_str.parse::<u32>() {
72 props.order = Some(order);
73 }
74 }
75 }
76 "cflow" => {
77 if let Some(flow) = parse_quoted_string(part) {
78 props.cflow = Some(flow);
79 }
80 }
81 "hardbreak" => {
82 if let Some(val) = tokens.next() {
83 props.hardbreak = val == "1";
84 }
85 }
86 "cuts" => {
87 props.cuts = parse_cuts(&mut tokens);
88 }
89 "x_bboxes" => {
90 props.x_bboxes = parse_bboxes_list(&mut tokens);
91 }
92 "image" => {
93 if let Some(img) = parse_quoted_string(part) {
94 props.image = Some(img);
95 }
96 }
97 "imagemd5" => {
98 if let Some(md5) = parse_quoted_string(part) {
99 props.imagemd5 = Some(md5);
100 }
101 }
102 "ppageno" => {
103 if let Some(page_str) = tokens.next() {
104 if let Ok(page) = page_str.parse::<u32>() {
105 props.ppageno = Some(page);
106 }
107 }
108 }
109 "lpageno" => {
110 let rest: Vec<&str> = tokens.collect();
111 if !rest.is_empty() {
112 let lpageno_str = rest.join(" ");
113 if let Some(quoted) = parse_quoted_string(part) {
115 props.lpageno = Some(quoted);
116 } else {
117 props.lpageno = Some(lpageno_str);
118 }
119 }
120 }
121 "scan_res" => {
122 let coords: Vec<&str> = tokens.collect();
123 if coords.len() >= 2 {
124 if let (Ok(x), Ok(y)) = (coords[0].parse::<u32>(), coords[1].parse::<u32>()) {
125 props.scan_res = Some((x, y));
126 }
127 }
128 }
129 "x_source" => {
130 let sources = parse_all_quoted_strings(part);
132 if !sources.is_empty() {
133 props.x_source = sources;
134 }
135 }
136 "x_scanner" => {
137 if let Some(scanner) = parse_quoted_string(part) {
138 props.x_scanner = Some(scanner);
139 }
140 }
141 "x_size" | "x_descenders" | "x_ascenders" => {
142 let value: Vec<&str> = tokens.collect();
144 if !value.is_empty() {
145 props.other.insert(key.to_string(), value.join(" "));
146 }
147 }
148 _ => {
149 if debug {
150 eprintln!("[hOCR] Unknown property: {}", key);
151 }
152 let value: Vec<&str> = tokens.collect();
154 if !value.is_empty() {
155 props.other.insert(key.to_string(), value.join(" "));
156 }
157 }
158 }
159 }
160 }
161
162 props
163}
164
165fn parse_bbox_coords<'a, I>(tokens: &mut I) -> Option<BBox>
166where
167 I: Iterator<Item = &'a str>,
168{
169 let coords: Vec<&str> = tokens.take(4).collect();
170 if coords.len() == 4 {
171 if let (Ok(x1), Ok(y1), Ok(x2), Ok(y2)) = (
172 coords[0].parse::<u32>(),
173 coords[1].parse::<u32>(),
174 coords[2].parse::<u32>(),
175 coords[3].parse::<u32>(),
176 ) {
177 return Some(BBox { x1, y1, x2, y2 });
178 }
179 }
180 None
181}
182
183fn parse_baseline<'a, I>(tokens: &mut I) -> Option<Baseline>
184where
185 I: Iterator<Item = &'a str>,
186{
187 let vals: Vec<&str> = tokens.take(2).collect();
188 if vals.len() == 2 {
189 if let (Ok(slope), Ok(constant)) = (vals[0].parse::<f64>(), vals[1].parse::<i32>()) {
190 return Some(Baseline { slope, constant });
191 }
192 }
193 None
194}
195
/// Reads a polygon as a flat list of `x y` integer pairs from `tokens`.
///
/// All remaining tokens are consumed. Returns `None` unless there are at
/// least four tokens (two points), the token count is even, and every token
/// is a valid `i32`.
fn parse_poly<'a, I>(tokens: &mut I) -> Option<Vec<(i32, i32)>>
where
    I: Iterator<Item = &'a str>,
{
    let raw: Vec<&str> = tokens.collect();
    if raw.len() < 4 || raw.len() % 2 != 0 {
        return None;
    }

    // Collecting into `Option<Vec<_>>` yields `None` as soon as one pair
    // fails to parse.
    raw.chunks_exact(2)
        .map(|pair| {
            let x = pair[0].parse::<i32>().ok()?;
            let y = pair[1].parse::<i32>().ok()?;
            Some((x, y))
        })
        .collect()
}
214
/// Collects every remaining token that parses as `f64`, in order.
///
/// Tokens that are not valid floating-point numbers are skipped, so the
/// result may be shorter than the input.
fn parse_float_list<'a, I>(tokens: &mut I) -> Vec<f64>
where
    I: Iterator<Item = &'a str>,
{
    let mut values = Vec::new();
    for token in tokens {
        if let Ok(number) = token.parse::<f64>() {
            values.push(number);
        }
    }
    values
}
221
/// Reads a `cuts` value: one inner vector per accepted token.
///
/// A token containing `,` is split on commas and becomes a vector of the
/// pieces that parse as `u32`; pieces that do not parse are dropped, which
/// can leave that vector empty. A token without a comma becomes a one-element
/// vector if it parses as `u32` and is skipped otherwise.
fn parse_cuts<'a, I>(tokens: &mut I) -> Vec<Vec<u32>>
where
    I: Iterator<Item = &'a str>,
{
    tokens
        .filter_map(|token| {
            if token.contains(',') {
                let pieces: Vec<u32> = token
                    .split(',')
                    .filter_map(|piece| piece.parse::<u32>().ok())
                    .collect();
                Some(pieces)
            } else {
                token.parse::<u32>().ok().map(|single| vec![single])
            }
        })
        .collect()
}
238
239fn parse_bboxes_list<'a, I>(tokens: &mut I) -> Vec<BBox>
240where
241 I: Iterator<Item = &'a str>,
242{
243 let coords: Vec<u32> = tokens.filter_map(|s| s.parse::<u32>().ok()).collect();
244
245 coords
246 .chunks(4)
247 .filter_map(|chunk| {
248 if chunk.len() == 4 {
249 Some(BBox {
250 x1: chunk[0],
251 y1: chunk[1],
252 x2: chunk[2],
253 y2: chunk[3],
254 })
255 } else {
256 None
257 }
258 })
259 .collect()
260}
261
/// Returns the text between the first pair of double quotes in `s`.
///
/// Returns `None` when `s` contains no double quote or when the first opening
/// quote is never closed. An empty pair (`""`) yields an empty string.
fn parse_quoted_string(s: &str) -> Option<String> {
    let (_, after_open) = s.split_once('"')?;
    let (inner, _) = after_open.split_once('"')?;
    Some(inner.to_string())
}
271
/// Returns the contents of every complete double-quoted section of `s`, in
/// order of appearance.
///
/// Scanning stops at the first opening quote that has no closing quote; the
/// sections found before it are still returned.
fn parse_all_quoted_strings(s: &str) -> Vec<String> {
    let mut found = Vec::new();
    let mut rest = s;

    while let Some((_, after_open)) = rest.split_once('"') {
        match after_open.split_once('"') {
            Some((inner, tail)) => {
                found.push(inner.to_string());
                rest = tail;
            }
            None => break,
        }
    }

    found
}
287
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone `bbox` entry populates all four coordinates.
    #[test]
    fn test_parse_bbox() {
        let expected = BBox {
            x1: 100,
            y1: 50,
            x2: 200,
            y2: 150,
        };

        let parsed = parse_properties("bbox 100 50 200 150", false);

        assert_eq!(parsed.bbox, Some(expected));
    }

    /// `baseline` accepts a fractional slope and a negative constant.
    #[test]
    fn test_parse_baseline() {
        let expected = Baseline {
            slope: 0.015,
            constant: -18,
        };

        let parsed = parse_properties("baseline 0.015 -18", false);

        assert_eq!(parsed.baseline, Some(expected));
    }

    /// Several `;`-separated entries are each routed to their own field.
    #[test]
    fn test_parse_multiple_properties() {
        let expected_bbox = BBox {
            x1: 0,
            y1: 0,
            x2: 100,
            y2: 50,
        };

        let parsed = parse_properties("bbox 0 0 100 50; x_wconf 95.5; textangle 7.2", false);

        assert_eq!(parsed.bbox, Some(expected_bbox));
        assert_eq!(parsed.x_wconf, Some(95.5));
        assert_eq!(parsed.textangle, Some(7.2));
    }

    /// A quoted value keeps its inner whitespace, and the following entry
    /// still parses.
    #[test]
    fn test_parse_quoted_strings() {
        let parsed = parse_properties("x_font \"Comic Sans MS\"; x_fsize 12", false);

        assert_eq!(parsed.x_font, Some("Comic Sans MS".to_string()));
        assert_eq!(parsed.x_fsize, Some(12));
    }

    /// `poly` pairs up consecutive integers into points.
    #[test]
    fn test_parse_poly() {
        let expected = vec![(0, 0), (0, 10), (10, 10), (10, 0)];

        let parsed = parse_properties("poly 0 0 0 10 10 10 10 0", false);

        assert_eq!(parsed.poly, Some(expected));
    }

    /// `x_confs` accepts both decimal and integer-looking tokens.
    #[test]
    fn test_parse_x_confs() {
        let parsed = parse_properties("x_confs 37.3 51.23 100", false);

        assert_eq!(parsed.x_confs, vec![37.3, 51.23, 100.0]);
    }
}