html_to_markdown_rs/hocr/
parser.rs

1//! hOCR property parser
2//!
3//! Parses hOCR title attributes into structured properties.
4
5use super::types::{BBox, Baseline, HocrProperties};
6use crate::text::decode_html_entities;
7
8/// Parse all properties from hOCR title attribute
9pub fn parse_properties(title: &str, debug: bool) -> HocrProperties {
10    let mut props = HocrProperties::default();
11
12    let title = decode_html_entities(title);
13
14    for part in title.split(';') {
15        let part = part.trim();
16        if part.is_empty() {
17            continue;
18        }
19
20        let mut tokens = part.split_whitespace();
21        if let Some(key) = tokens.next() {
22            match key {
23                "bbox" => {
24                    if let Some(bbox) = parse_bbox_coords(&mut tokens) {
25                        props.bbox = Some(bbox);
26                    }
27                }
28                "baseline" => {
29                    if let Some(baseline) = parse_baseline(&mut tokens) {
30                        props.baseline = Some(baseline);
31                    }
32                }
33                "textangle" => {
34                    if let Some(angle_str) = tokens.next() {
35                        if let Ok(angle) = angle_str.parse::<f64>() {
36                            props.textangle = Some(angle);
37                        }
38                    }
39                }
40                "poly" => {
41                    props.poly = parse_poly(&mut tokens);
42                }
43                "x_wconf" => {
44                    if let Some(conf_str) = tokens.next() {
45                        if let Ok(conf) = conf_str.parse::<f64>() {
46                            props.x_wconf = Some(conf);
47                        }
48                    }
49                }
50                "x_confs" => {
51                    props.x_confs = parse_float_list(&mut tokens);
52                }
53                "nlp" => {
54                    props.nlp = parse_float_list(&mut tokens);
55                }
56                "x_font" => {
57                    if let Some(font) = parse_quoted_string(part) {
58                        props.x_font = Some(font);
59                    }
60                }
61                "x_fsize" => {
62                    if let Some(size_str) = tokens.next() {
63                        if let Ok(size) = size_str.parse::<u32>() {
64                            props.x_fsize = Some(size);
65                        }
66                    }
67                }
68                "order" => {
69                    if let Some(order_str) = tokens.next() {
70                        if let Ok(order) = order_str.parse::<u32>() {
71                            props.order = Some(order);
72                        }
73                    }
74                }
75                "cflow" => {
76                    if let Some(flow) = parse_quoted_string(part) {
77                        props.cflow = Some(flow);
78                    }
79                }
80                "hardbreak" => {
81                    if let Some(val) = tokens.next() {
82                        props.hardbreak = val == "1";
83                    }
84                }
85                "cuts" => {
86                    props.cuts = parse_cuts(&mut tokens);
87                }
88                "x_bboxes" => {
89                    props.x_bboxes = parse_bboxes_list(&mut tokens);
90                }
91                "image" => {
92                    if let Some(img) = parse_quoted_string(part) {
93                        props.image = Some(img);
94                    }
95                }
96                "imagemd5" => {
97                    if let Some(md5) = parse_quoted_string(part) {
98                        props.imagemd5 = Some(md5);
99                    }
100                }
101                "ppageno" => {
102                    if let Some(page_str) = tokens.next() {
103                        if let Ok(page) = page_str.parse::<u32>() {
104                            props.ppageno = Some(page);
105                        }
106                    }
107                }
108                "lpageno" => {
109                    let rest: Vec<&str> = tokens.collect();
110                    if !rest.is_empty() {
111                        let lpageno_str = rest.join(" ");
112                        if let Some(quoted) = parse_quoted_string(part) {
113                            props.lpageno = Some(quoted);
114                        } else {
115                            props.lpageno = Some(lpageno_str);
116                        }
117                    }
118                }
119                "scan_res" => {
120                    let coords: Vec<&str> = tokens.collect();
121                    if coords.len() >= 2 {
122                        if let (Ok(x), Ok(y)) = (coords[0].parse::<u32>(), coords[1].parse::<u32>()) {
123                            props.scan_res = Some((x, y));
124                        }
125                    }
126                }
127                "x_source" => {
128                    let sources = parse_all_quoted_strings(part);
129                    if !sources.is_empty() {
130                        props.x_source = sources;
131                    }
132                }
133                "x_scanner" => {
134                    if let Some(scanner) = parse_quoted_string(part) {
135                        props.x_scanner = Some(scanner);
136                    }
137                }
138                "x_size" | "x_descenders" | "x_ascenders" => {
139                    let value: Vec<&str> = tokens.collect();
140                    if !value.is_empty() {
141                        props.other.insert(key.to_string(), value.join(" "));
142                    }
143                }
144                _ => {
145                    if debug {
146                        eprintln!("[hOCR] Unknown property: {}", key);
147                    }
148                    let value: Vec<&str> = tokens.collect();
149                    if !value.is_empty() {
150                        props.other.insert(key.to_string(), value.join(" "));
151                    }
152                }
153            }
154        }
155    }
156
157    props
158}
159
160fn parse_bbox_coords<'a, I>(tokens: &mut I) -> Option<BBox>
161where
162    I: Iterator<Item = &'a str>,
163{
164    let coords: Vec<&str> = tokens.take(4).collect();
165    if coords.len() == 4 {
166        if let (Ok(x1), Ok(y1), Ok(x2), Ok(y2)) = (
167            coords[0].parse::<u32>(),
168            coords[1].parse::<u32>(),
169            coords[2].parse::<u32>(),
170            coords[3].parse::<u32>(),
171        ) {
172            return Some(BBox { x1, y1, x2, y2 });
173        }
174    }
175    None
176}
177
178fn parse_baseline<'a, I>(tokens: &mut I) -> Option<Baseline>
179where
180    I: Iterator<Item = &'a str>,
181{
182    let vals: Vec<&str> = tokens.take(2).collect();
183    if vals.len() == 2 {
184        if let (Ok(slope), Ok(constant)) = (vals[0].parse::<f64>(), vals[1].parse::<i32>()) {
185            return Some(Baseline { slope, constant });
186        }
187    }
188    None
189}
190
/// Reads a polygon as a flat list of `x y` integer pairs from all remaining tokens.
///
/// Returns `None` unless there are at least four tokens, an even number of them,
/// and every token parses as `i32`.
fn parse_poly<'a, I>(tokens: &mut I) -> Option<Vec<(i32, i32)>>
where
    I: Iterator<Item = &'a str>,
{
    let raw: Vec<&str> = tokens.collect();
    if raw.len() < 4 || raw.len() % 2 != 0 {
        return None;
    }

    // Collecting into `Option<Vec<_>>` makes a single bad coordinate reject the polygon.
    raw.chunks(2)
        .map(|pair| {
            let x = pair[0].parse::<i32>().ok()?;
            let y = pair[1].parse::<i32>().ok()?;
            Some((x, y))
        })
        .collect()
}
209
/// Collects every remaining token that parses as `f64`, in order.
///
/// Tokens that are not valid floats are skipped rather than treated as an error.
fn parse_float_list<'a, I>(tokens: &mut I) -> Vec<f64>
where
    I: Iterator<Item = &'a str>,
{
    let mut values = Vec::new();
    for token in tokens {
        if let Ok(value) = token.parse::<f64>() {
            values.push(value);
        }
    }
    values
}
216
/// Parses the values of a `cuts` property into one group of numbers per token.
///
/// * A token containing `,` is split on commas and produces a group holding every
///   component that parses as `u32`; components that do not parse are skipped, so the
///   group may be shorter than the token or even empty.
/// * A token without a comma produces a single-element group when it parses as `u32`
///   and is dropped otherwise.
fn parse_cuts<'a, I>(tokens: &mut I) -> Vec<Vec<u32>>
where
    I: Iterator<Item = &'a str>,
{
    tokens
        .filter_map(|token| {
            if token.contains(',') {
                let group = token
                    .split(',')
                    .filter_map(|piece| piece.parse::<u32>().ok())
                    .collect::<Vec<u32>>();
                Some(group)
            } else {
                token.parse::<u32>().ok().map(|single| vec![single])
            }
        })
        .collect()
}
232
233fn parse_bboxes_list<'a, I>(tokens: &mut I) -> Vec<BBox>
234where
235    I: Iterator<Item = &'a str>,
236{
237    let coords: Vec<u32> = tokens.filter_map(|s| s.parse::<u32>().ok()).collect();
238
239    coords
240        .chunks(4)
241        .filter_map(|chunk| {
242            if chunk.len() == 4 {
243                Some(BBox {
244                    x1: chunk[0],
245                    y1: chunk[1],
246                    x2: chunk[2],
247                    y2: chunk[3],
248                })
249            } else {
250                None
251            }
252        })
253        .collect()
254}
255
/// Returns the text between the first pair of double quotes in `s`.
///
/// Yields `None` when `s` contains no `"` or when the opening quote is never closed.
/// The content is copied verbatim; no escape sequences are interpreted.
fn parse_quoted_string(s: &str) -> Option<String> {
    let (_, after_open) = s.split_once('"')?;
    let (inner, _) = after_open.split_once('"')?;
    Some(inner.to_owned())
}
264
/// Returns, in order, the contents of every complete `"…"` pair found in `s`.
///
/// Quotes are paired left to right; a final opening quote without a closing partner
/// contributes nothing. No escape sequences are interpreted.
fn parse_all_quoted_strings(s: &str) -> Vec<String> {
    // Splitting on `"` puts quoted content at the odd indices: outside, inside, outside, …
    let pieces: Vec<&str> = s.split('"').collect();

    // `split` always yields at least one piece, so there are `len - 1` quote characters
    // and therefore `(len - 1) / 2` complete pairs.
    let complete_pairs = (pieces.len() - 1) / 2;

    pieces
        .iter()
        .skip(1)
        .step_by(2)
        .take(complete_pairs)
        .map(|inner| (*inner).to_owned())
        .collect()
}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the expected bounding box for an assertion.
    fn bbox(x1: u32, y1: u32, x2: u32, y2: u32) -> BBox {
        BBox { x1, y1, x2, y2 }
    }

    /// A lone `bbox` property fills `props.bbox`.
    #[test]
    fn test_parse_bbox() {
        let props = parse_properties("bbox 100 50 200 150", false);
        let expected = Some(bbox(100, 50, 200, 150));
        assert_eq!(props.bbox, expected);
    }

    /// `baseline` yields a float slope and a signed integer constant.
    #[test]
    fn test_parse_baseline() {
        let props = parse_properties("baseline 0.015 -18", false);
        let expected = Some(Baseline {
            slope: 0.015,
            constant: -18,
        });
        assert_eq!(props.baseline, expected);
    }

    /// Properties separated by `;` are each parsed into their own field.
    #[test]
    fn test_parse_multiple_properties() {
        let props = parse_properties("bbox 0 0 100 50; x_wconf 95.5; textangle 7.2", false);
        assert_eq!(props.bbox, Some(bbox(0, 0, 100, 50)));
        assert_eq!(props.x_wconf, Some(95.5));
        assert_eq!(props.textangle, Some(7.2));
    }

    /// A quoted value keeps its internal spaces, and the property after it still parses.
    #[test]
    fn test_parse_quoted_strings() {
        let props = parse_properties("x_font \"Comic Sans MS\"; x_fsize 12", false);
        assert_eq!(props.x_font.as_deref(), Some("Comic Sans MS"));
        assert_eq!(props.x_fsize, Some(12));
    }

    /// `poly` pairs consecutive numbers into points.
    #[test]
    fn test_parse_poly() {
        let props = parse_properties("poly 0 0 0 10 10 10 10 0", false);
        let square = vec![(0, 0), (0, 10), (10, 10), (10, 0)];
        assert_eq!(props.poly, Some(square));
    }

    /// `x_confs` collects every numeric token, integers included.
    #[test]
    fn test_parse_x_confs() {
        let props = parse_properties("x_confs 37.3 51.23 100", false);
        assert_eq!(props.x_confs, [37.3, 51.23, 100.0]);
    }
}
345}