html_to_markdown_rs/hocr/
parser.rs1use super::types::{BBox, Baseline, HocrProperties};
6use crate::text::decode_html_entities;
7
8pub fn parse_properties(title: &str, debug: bool) -> HocrProperties {
10 let mut props = HocrProperties::default();
11
12 let title = decode_html_entities(title);
13
14 for part in title.split(';') {
15 let part = part.trim();
16 if part.is_empty() {
17 continue;
18 }
19
20 let mut tokens = part.split_whitespace();
21 if let Some(key) = tokens.next() {
22 match key {
23 "bbox" => {
24 if let Some(bbox) = parse_bbox_coords(&mut tokens) {
25 props.bbox = Some(bbox);
26 }
27 }
28 "baseline" => {
29 if let Some(baseline) = parse_baseline(&mut tokens) {
30 props.baseline = Some(baseline);
31 }
32 }
33 "textangle" => {
34 if let Some(angle_str) = tokens.next() {
35 if let Ok(angle) = angle_str.parse::<f64>() {
36 props.textangle = Some(angle);
37 }
38 }
39 }
40 "poly" => {
41 props.poly = parse_poly(&mut tokens);
42 }
43 "x_wconf" => {
44 if let Some(conf_str) = tokens.next() {
45 if let Ok(conf) = conf_str.parse::<f64>() {
46 props.x_wconf = Some(conf);
47 }
48 }
49 }
50 "x_confs" => {
51 props.x_confs = parse_float_list(&mut tokens);
52 }
53 "nlp" => {
54 props.nlp = parse_float_list(&mut tokens);
55 }
56 "x_font" => {
57 if let Some(font) = parse_quoted_string(part) {
58 props.x_font = Some(font);
59 }
60 }
61 "x_fsize" => {
62 if let Some(size_str) = tokens.next() {
63 if let Ok(size) = size_str.parse::<u32>() {
64 props.x_fsize = Some(size);
65 }
66 }
67 }
68 "order" => {
69 if let Some(order_str) = tokens.next() {
70 if let Ok(order) = order_str.parse::<u32>() {
71 props.order = Some(order);
72 }
73 }
74 }
75 "cflow" => {
76 if let Some(flow) = parse_quoted_string(part) {
77 props.cflow = Some(flow);
78 }
79 }
80 "hardbreak" => {
81 if let Some(val) = tokens.next() {
82 props.hardbreak = val == "1";
83 }
84 }
85 "cuts" => {
86 props.cuts = parse_cuts(&mut tokens);
87 }
88 "x_bboxes" => {
89 props.x_bboxes = parse_bboxes_list(&mut tokens);
90 }
91 "image" => {
92 if let Some(img) = parse_quoted_string(part) {
93 props.image = Some(img);
94 }
95 }
96 "imagemd5" => {
97 if let Some(md5) = parse_quoted_string(part) {
98 props.imagemd5 = Some(md5);
99 }
100 }
101 "ppageno" => {
102 if let Some(page_str) = tokens.next() {
103 if let Ok(page) = page_str.parse::<u32>() {
104 props.ppageno = Some(page);
105 }
106 }
107 }
108 "lpageno" => {
109 let rest: Vec<&str> = tokens.collect();
110 if !rest.is_empty() {
111 let lpageno_str = rest.join(" ");
112 if let Some(quoted) = parse_quoted_string(part) {
113 props.lpageno = Some(quoted);
114 } else {
115 props.lpageno = Some(lpageno_str);
116 }
117 }
118 }
119 "scan_res" => {
120 let coords: Vec<&str> = tokens.collect();
121 if coords.len() >= 2 {
122 if let (Ok(x), Ok(y)) = (coords[0].parse::<u32>(), coords[1].parse::<u32>()) {
123 props.scan_res = Some((x, y));
124 }
125 }
126 }
127 "x_source" => {
128 let sources = parse_all_quoted_strings(part);
129 if !sources.is_empty() {
130 props.x_source = sources;
131 }
132 }
133 "x_scanner" => {
134 if let Some(scanner) = parse_quoted_string(part) {
135 props.x_scanner = Some(scanner);
136 }
137 }
138 "x_size" | "x_descenders" | "x_ascenders" => {
139 let value: Vec<&str> = tokens.collect();
140 if !value.is_empty() {
141 props.other.insert(key.to_string(), value.join(" "));
142 }
143 }
144 _ => {
145 if debug {
146 eprintln!("[hOCR] Unknown property: {}", key);
147 }
148 let value: Vec<&str> = tokens.collect();
149 if !value.is_empty() {
150 props.other.insert(key.to_string(), value.join(" "));
151 }
152 }
153 }
154 }
155 }
156
157 props
158}
159
160fn parse_bbox_coords<'a, I>(tokens: &mut I) -> Option<BBox>
161where
162 I: Iterator<Item = &'a str>,
163{
164 let coords: Vec<&str> = tokens.take(4).collect();
165 if coords.len() == 4 {
166 if let (Ok(x1), Ok(y1), Ok(x2), Ok(y2)) = (
167 coords[0].parse::<u32>(),
168 coords[1].parse::<u32>(),
169 coords[2].parse::<u32>(),
170 coords[3].parse::<u32>(),
171 ) {
172 return Some(BBox { x1, y1, x2, y2 });
173 }
174 }
175 None
176}
177
178fn parse_baseline<'a, I>(tokens: &mut I) -> Option<Baseline>
179where
180 I: Iterator<Item = &'a str>,
181{
182 let vals: Vec<&str> = tokens.take(2).collect();
183 if vals.len() == 2 {
184 if let (Ok(slope), Ok(constant)) = (vals[0].parse::<f64>(), vals[1].parse::<i32>()) {
185 return Some(Baseline { slope, constant });
186 }
187 }
188 None
189}
190
/// Interprets all remaining tokens as a flat list of `x y` polygon vertices.
///
/// Returns `None` unless there are at least four tokens, the token count is
/// even, and every token parses as `i32`. On success the vertices are returned
/// in input order.
fn parse_poly<'a, I>(tokens: &mut I) -> Option<Vec<(i32, i32)>>
where
    I: Iterator<Item = &'a str>,
{
    let raw: Vec<&str> = tokens.collect();
    if raw.len() < 4 || raw.len() % 2 != 0 {
        return None;
    }

    // Collecting into `Option<Vec<_>>` stops at the first malformed pair.
    raw.chunks_exact(2)
        .map(|pair| {
            let x = pair[0].parse::<i32>().ok()?;
            let y = pair[1].parse::<i32>().ok()?;
            Some((x, y))
        })
        .collect()
}
209
/// Parses every remaining token as `f64`, keeping only those that succeed.
///
/// Malformed tokens are skipped, so the result may be shorter than the input
/// and is empty when nothing parses.
fn parse_float_list<'a, I>(tokens: &mut I) -> Vec<f64>
where
    I: Iterator<Item = &'a str>,
{
    let mut values = Vec::new();
    for token in tokens {
        if let Ok(value) = token.parse::<f64>() {
            values.push(value);
        }
    }
    values
}
216
/// Parses the remaining tokens as a list of cuts.
///
/// A token containing `,` becomes one cut made of its comma-separated `u32`
/// components; components that do not parse (including negative numbers) are
/// dropped, which can leave an empty cut. A token without `,` becomes a
/// single-element cut when it parses as `u32` and is skipped otherwise.
fn parse_cuts<'a, I>(tokens: &mut I) -> Vec<Vec<u32>>
where
    I: Iterator<Item = &'a str>,
{
    tokens
        .filter_map(|token| {
            if token.contains(',') {
                let components = token
                    .split(',')
                    .filter_map(|piece| piece.parse::<u32>().ok())
                    .collect();
                Some(components)
            } else {
                token.parse::<u32>().ok().map(|single| vec![single])
            }
        })
        .collect()
}
232
233fn parse_bboxes_list<'a, I>(tokens: &mut I) -> Vec<BBox>
234where
235 I: Iterator<Item = &'a str>,
236{
237 let coords: Vec<u32> = tokens.filter_map(|s| s.parse::<u32>().ok()).collect();
238
239 coords
240 .chunks(4)
241 .filter_map(|chunk| {
242 if chunk.len() == 4 {
243 Some(BBox {
244 x1: chunk[0],
245 y1: chunk[1],
246 x2: chunk[2],
247 y2: chunk[3],
248 })
249 } else {
250 None
251 }
252 })
253 .collect()
254}
255
/// Returns the text between the first two `"` characters of `s`.
///
/// Yields `None` when `s` contains fewer than two double quotes. No escape
/// sequences are interpreted; the first quote after the opening one always
/// closes the string.
fn parse_quoted_string(s: &str) -> Option<String> {
    let (_, after_open) = s.split_once('"')?;
    let (inner, _) = after_open.split_once('"')?;
    Some(inner.to_owned())
}
264
/// Collects every double-quoted substring of `s`, in order of appearance.
///
/// Scanning resumes right after each closing quote. An opening quote with no
/// matching closing quote ends the scan, and the dangling text is discarded.
fn parse_all_quoted_strings(s: &str) -> Vec<String> {
    let mut found = Vec::new();
    let mut rest = s;

    while let Some((inner, tail)) = rest
        .split_once('"')
        .and_then(|(_, after_open)| after_open.split_once('"'))
    {
        found.push(inner.to_owned());
        rest = tail;
    }

    found
}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone `bbox` property yields the four corner coordinates.
    #[test]
    fn test_parse_bbox() {
        let expected = BBox {
            x1: 100,
            y1: 50,
            x2: 200,
            y2: 150,
        };

        let parsed = parse_properties("bbox 100 50 200 150", false);

        assert_eq!(parsed.bbox, Some(expected));
    }

    /// `baseline` takes a float slope and a (possibly negative) integer constant.
    #[test]
    fn test_parse_baseline() {
        let expected = Baseline {
            slope: 0.015,
            constant: -18,
        };

        let parsed = parse_properties("baseline 0.015 -18", false);

        assert_eq!(parsed.baseline, Some(expected));
    }

    /// Several `;`-separated properties are all picked up from one string.
    #[test]
    fn test_parse_multiple_properties() {
        let expected_bbox = BBox {
            x1: 0,
            y1: 0,
            x2: 100,
            y2: 50,
        };

        let parsed = parse_properties("bbox 0 0 100 50; x_wconf 95.5; textangle 7.2", false);

        assert_eq!(parsed.bbox, Some(expected_bbox));
        assert_eq!(parsed.x_wconf, Some(95.5));
        assert_eq!(parsed.textangle, Some(7.2));
    }

    /// Quoted values keep their inner spaces; following properties still parse.
    #[test]
    fn test_parse_quoted_strings() {
        let parsed = parse_properties(r#"x_font "Comic Sans MS"; x_fsize 12"#, false);

        assert_eq!(parsed.x_font.as_deref(), Some("Comic Sans MS"));
        assert_eq!(parsed.x_fsize, Some(12));
    }

    /// `poly` pairs up consecutive numbers into vertices.
    #[test]
    fn test_parse_poly() {
        let expected = vec![(0, 0), (0, 10), (10, 10), (10, 0)];

        let parsed = parse_properties("poly 0 0 0 10 10 10 10 0", false);

        assert_eq!(parsed.poly, Some(expected));
    }

    /// `x_confs` accepts a mix of fractional and integral confidences.
    #[test]
    fn test_parse_x_confs() {
        let parsed = parse_properties("x_confs 37.3 51.23 100", false);

        assert_eq!(parsed.x_confs, vec![37.3, 51.23, 100.0]);
    }
}