html_to_markdown_rs/hocr/
parser.rs

1//! hOCR property parser
2//!
3//! Parses hOCR title attributes into structured properties.
4
5use super::types::{BBox, Baseline, HocrProperties};
6use crate::text::decode_html_entities;
7
8/// Parse all properties from hOCR title attribute
9pub fn parse_properties(title: &str, debug: bool) -> HocrProperties {
10    let mut props = HocrProperties::default();
11
12    // Decode HTML entities first
13    let title = decode_html_entities(title);
14
15    for part in title.split(';') {
16        let part = part.trim();
17        if part.is_empty() {
18            continue;
19        }
20
21        let mut tokens = part.split_whitespace();
22        if let Some(key) = tokens.next() {
23            match key {
24                "bbox" => {
25                    if let Some(bbox) = parse_bbox_coords(&mut tokens) {
26                        props.bbox = Some(bbox);
27                    }
28                }
29                "baseline" => {
30                    if let Some(baseline) = parse_baseline(&mut tokens) {
31                        props.baseline = Some(baseline);
32                    }
33                }
34                "textangle" => {
35                    if let Some(angle_str) = tokens.next() {
36                        if let Ok(angle) = angle_str.parse::<f64>() {
37                            props.textangle = Some(angle);
38                        }
39                    }
40                }
41                "poly" => {
42                    props.poly = parse_poly(&mut tokens);
43                }
44                "x_wconf" => {
45                    if let Some(conf_str) = tokens.next() {
46                        if let Ok(conf) = conf_str.parse::<f64>() {
47                            props.x_wconf = Some(conf);
48                        }
49                    }
50                }
51                "x_confs" => {
52                    props.x_confs = parse_float_list(&mut tokens);
53                }
54                "nlp" => {
55                    props.nlp = parse_float_list(&mut tokens);
56                }
57                "x_font" => {
58                    if let Some(font) = parse_quoted_string(part) {
59                        props.x_font = Some(font);
60                    }
61                }
62                "x_fsize" => {
63                    if let Some(size_str) = tokens.next() {
64                        if let Ok(size) = size_str.parse::<u32>() {
65                            props.x_fsize = Some(size);
66                        }
67                    }
68                }
69                "order" => {
70                    if let Some(order_str) = tokens.next() {
71                        if let Ok(order) = order_str.parse::<u32>() {
72                            props.order = Some(order);
73                        }
74                    }
75                }
76                "cflow" => {
77                    if let Some(flow) = parse_quoted_string(part) {
78                        props.cflow = Some(flow);
79                    }
80                }
81                "hardbreak" => {
82                    if let Some(val) = tokens.next() {
83                        props.hardbreak = val == "1";
84                    }
85                }
86                "cuts" => {
87                    props.cuts = parse_cuts(&mut tokens);
88                }
89                "x_bboxes" => {
90                    props.x_bboxes = parse_bboxes_list(&mut tokens);
91                }
92                "image" => {
93                    if let Some(img) = parse_quoted_string(part) {
94                        props.image = Some(img);
95                    }
96                }
97                "imagemd5" => {
98                    if let Some(md5) = parse_quoted_string(part) {
99                        props.imagemd5 = Some(md5);
100                    }
101                }
102                "ppageno" => {
103                    if let Some(page_str) = tokens.next() {
104                        if let Ok(page) = page_str.parse::<u32>() {
105                            props.ppageno = Some(page);
106                        }
107                    }
108                }
109                "lpageno" => {
110                    let rest: Vec<&str> = tokens.collect();
111                    if !rest.is_empty() {
112                        let lpageno_str = rest.join(" ");
113                        // Could be quoted or just a value
114                        if let Some(quoted) = parse_quoted_string(part) {
115                            props.lpageno = Some(quoted);
116                        } else {
117                            props.lpageno = Some(lpageno_str);
118                        }
119                    }
120                }
121                "scan_res" => {
122                    let coords: Vec<&str> = tokens.collect();
123                    if coords.len() >= 2 {
124                        if let (Ok(x), Ok(y)) = (coords[0].parse::<u32>(), coords[1].parse::<u32>()) {
125                            props.scan_res = Some((x, y));
126                        }
127                    }
128                }
129                "x_source" => {
130                    // Can be multiple quoted strings
131                    let sources = parse_all_quoted_strings(part);
132                    if !sources.is_empty() {
133                        props.x_source = sources;
134                    }
135                }
136                "x_scanner" => {
137                    if let Some(scanner) = parse_quoted_string(part) {
138                        props.x_scanner = Some(scanner);
139                    }
140                }
141                "x_size" | "x_descenders" | "x_ascenders" => {
142                    // Known but not yet fully supported - store in other
143                    let value: Vec<&str> = tokens.collect();
144                    if !value.is_empty() {
145                        props.other.insert(key.to_string(), value.join(" "));
146                    }
147                }
148                _ => {
149                    if debug {
150                        eprintln!("[hOCR] Unknown property: {}", key);
151                    }
152                    // Store unknown properties
153                    let value: Vec<&str> = tokens.collect();
154                    if !value.is_empty() {
155                        props.other.insert(key.to_string(), value.join(" "));
156                    }
157                }
158            }
159        }
160    }
161
162    props
163}
164
165fn parse_bbox_coords<'a, I>(tokens: &mut I) -> Option<BBox>
166where
167    I: Iterator<Item = &'a str>,
168{
169    let coords: Vec<&str> = tokens.take(4).collect();
170    if coords.len() == 4 {
171        if let (Ok(x1), Ok(y1), Ok(x2), Ok(y2)) = (
172            coords[0].parse::<u32>(),
173            coords[1].parse::<u32>(),
174            coords[2].parse::<u32>(),
175            coords[3].parse::<u32>(),
176        ) {
177            return Some(BBox { x1, y1, x2, y2 });
178        }
179    }
180    None
181}
182
183fn parse_baseline<'a, I>(tokens: &mut I) -> Option<Baseline>
184where
185    I: Iterator<Item = &'a str>,
186{
187    let vals: Vec<&str> = tokens.take(2).collect();
188    if vals.len() == 2 {
189        if let (Ok(slope), Ok(constant)) = (vals[0].parse::<f64>(), vals[1].parse::<i32>()) {
190            return Some(Baseline { slope, constant });
191        }
192    }
193    None
194}
195
/// Read the vertex list of a `poly` property.
///
/// Consumes every remaining token and pairs them up as `(x, y)` points.
/// Returns `None` unless there is an even number of tokens, at least four of
/// them, and every token parses as an `i32`.
fn parse_poly<'a, I>(tokens: &mut I) -> Option<Vec<(i32, i32)>>
where
    I: Iterator<Item = &'a str>,
{
    let raw: Vec<&str> = tokens.collect();
    if raw.len() < 4 || raw.len() % 2 != 0 {
        return None;
    }

    // Collecting into `Option<Vec<_>>` yields `None` as soon as one pair fails.
    raw.chunks(2)
        .map(|pair| {
            let x = pair[0].parse::<i32>().ok()?;
            let y = pair[1].parse::<i32>().ok()?;
            Some((x, y))
        })
        .collect()
}
214
/// Collect every remaining token that parses as an `f64`.
///
/// Tokens that are not valid floats are skipped, so the result may be shorter
/// than the input.
fn parse_float_list<'a, I>(tokens: &mut I) -> Vec<f64>
where
    I: Iterator<Item = &'a str>,
{
    let mut values = Vec::new();
    for token in tokens {
        if let Ok(value) = token.parse::<f64>() {
            values.push(value);
        }
    }
    values
}
221
/// Read the entries of a `cuts` property.
///
/// Each token becomes one inner vector:
/// * a token containing `,` is split on commas and every piece that parses as
///   a `u32` is kept (pieces that do not, such as a negative offset, are
///   dropped, which can leave the inner vector empty);
/// * any other token yields a one-element vector if it parses as a `u32` and
///   is skipped otherwise.
fn parse_cuts<'a, I>(tokens: &mut I) -> Vec<Vec<u32>>
where
    I: Iterator<Item = &'a str>,
{
    tokens
        .filter_map(|token| {
            if token.contains(',') {
                // Complex cut with offsets.
                let offsets = token
                    .split(',')
                    .filter_map(|piece| piece.parse::<u32>().ok())
                    .collect::<Vec<u32>>();
                Some(offsets)
            } else {
                token.parse::<u32>().ok().map(|single| vec![single])
            }
        })
        .collect()
}
238
239fn parse_bboxes_list<'a, I>(tokens: &mut I) -> Vec<BBox>
240where
241    I: Iterator<Item = &'a str>,
242{
243    let coords: Vec<u32> = tokens.filter_map(|s| s.parse::<u32>().ok()).collect();
244
245    coords
246        .chunks(4)
247        .filter_map(|chunk| {
248            if chunk.len() == 4 {
249                Some(BBox {
250                    x1: chunk[0],
251                    y1: chunk[1],
252                    x2: chunk[2],
253                    y2: chunk[3],
254                })
255            } else {
256                None
257            }
258        })
259        .collect()
260}
261
/// Extract the contents of the first double-quoted string in `s`.
///
/// Returns the text between the first `"` and the next `"` (which may be
/// empty), or `None` when `s` has no opening quote or no closing quote.
fn parse_quoted_string(s: &str) -> Option<String> {
    let (_, after_open) = s.split_once('"')?;
    let (inner, _) = after_open.split_once('"')?;
    Some(inner.to_string())
}
271
/// Extract the contents of every complete double-quoted string in `s`, in order.
///
/// Quotes are paired left to right; an opening quote without a matching
/// closing quote ends the scan and contributes nothing.
fn parse_all_quoted_strings(s: &str) -> Vec<String> {
    // Splitting on `"` puts quoted text at the odd positions. `split` always
    // yields at least one piece, so there are `pieces.len() - 1` quotes and
    // half that many (rounded down) complete pairs.
    let pieces: Vec<&str> = s.split('"').collect();
    let complete_pairs = (pieces.len() - 1) / 2;

    pieces
        .iter()
        .skip(1)
        .step_by(2)
        .take(complete_pairs)
        .map(|inner| inner.to_string())
        .collect()
}
287
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building an expected bounding box.
    fn bbox(x1: u32, y1: u32, x2: u32, y2: u32) -> BBox {
        BBox { x1, y1, x2, y2 }
    }

    /// A lone `bbox` entry fills the `bbox` field.
    #[test]
    fn test_parse_bbox() {
        let parsed = parse_properties("bbox 100 50 200 150", false);
        let expected = Some(bbox(100, 50, 200, 150));
        assert_eq!(parsed.bbox, expected);
    }

    /// A float slope and a negative integer constant are both accepted.
    #[test]
    fn test_parse_baseline() {
        let parsed = parse_properties("baseline 0.015 -18", false);
        let expected = Some(Baseline {
            slope: 0.015,
            constant: -18,
        });
        assert_eq!(parsed.baseline, expected);
    }

    /// Several `;`-separated entries are parsed independently.
    #[test]
    fn test_parse_multiple_properties() {
        let title = "bbox 0 0 100 50; x_wconf 95.5; textangle 7.2";
        let parsed = parse_properties(title, false);
        assert_eq!(parsed.bbox, Some(bbox(0, 0, 100, 50)));
        assert_eq!(parsed.x_wconf, Some(95.5));
        assert_eq!(parsed.textangle, Some(7.2));
    }

    /// A quoted value keeps its inner spaces; the following entry still parses.
    #[test]
    fn test_parse_quoted_strings() {
        let title = "x_font \"Comic Sans MS\"; x_fsize 12";
        let parsed = parse_properties(title, false);
        assert_eq!(parsed.x_font, Some(String::from("Comic Sans MS")));
        assert_eq!(parsed.x_fsize, Some(12));
    }

    /// Polygon coordinates are paired up in order.
    #[test]
    fn test_parse_poly() {
        let parsed = parse_properties("poly 0 0 0 10 10 10 10 0", false);
        let expected = Some(vec![(0, 0), (0, 10), (10, 10), (10, 0)]);
        assert_eq!(parsed.poly, expected);
    }

    /// Integer and fractional confidences both end up as `f64`.
    #[test]
    fn test_parse_x_confs() {
        let parsed = parse_properties("x_confs 37.3 51.23 100", false);
        let expected = vec![37.3, 51.23, 100.0];
        assert_eq!(parsed.x_confs, expected);
    }
}
352}