1use std::cmp::Ordering;
2use std::collections::HashMap;
3
4use serde_json::{Map, Value};
5
6use crate::engine::ExtractionEngine;
7use crate::error::Result;
8use crate::ir::{
9 BBox, Block, Confidence, Document, FigureBlock, Metadata, Page, SourceAnchor, TableBlock,
10 TableCell, TextBlock, SCHEMA_VERSION,
11};
12use crate::source::Source;
13use crate::textual::html_to_text;
14
/// Value written to the `extraction_method` field of every `SourceAnchor`
/// created by this module.
const EXTRACTION_METHOD: &str = "json_native";
16
/// Extraction engine that turns JSON (or JSON Lines) content into a `Document`.
///
/// The engine carries no state, so it is a unit struct.
#[derive(Clone, Copy, Debug, Default)]
pub struct JsonEngine;
19
20impl ExtractionEngine for JsonEngine {
21 fn name(&self) -> &'static str {
22 "json-native"
23 }
24
25 fn extract(&self, source: &Source) -> Result<Document> {
26 let pages = parse_json_pages(&source.content)?;
27 Ok(build_document(source, self.name(), pages))
28 }
29}
30
/// One piece of text collected from a generic JSON value, together with the
/// block kind it will be emitted as.
#[derive(Debug)]
struct TextRecord {
    /// Block kind for the text, e.g. `"title"`, `"paragraph"` or `"value"`.
    kind: String,
    /// Text content of the record.
    text: String,
}
36
37fn parse_json_pages(content: &str) -> Result<Vec<Page>> {
38 match serde_json::from_str::<Value>(content) {
39 Ok(value) => Ok(pages_from_json_value(&value)),
40 Err(json_error) => {
41 let mut pages = Vec::new();
42 for (index, line) in content.lines().enumerate() {
43 let trimmed = line.trim();
44 if trimmed.is_empty() {
45 continue;
46 }
47 let value = serde_json::from_str::<Value>(trimmed)?;
48 let mut value_pages = pages_from_json_value(&value);
49 renumber_pages(&mut value_pages, index + 1);
50 pages.extend(value_pages);
51 }
52 if pages.is_empty() {
53 Err(json_error.into())
54 } else {
55 Ok(pages)
56 }
57 }
58 }
59}
60
61fn pages_from_json_value(value: &Value) -> Vec<Page> {
62 if let Some(pages) = omnidocbench_pages(value) {
63 return pages;
64 }
65 if let Some(page) = funsd_page(value) {
66 return vec![page];
67 }
68 if let Some(pages) = coco_pages(value) {
69 return pages;
70 }
71 if let Some(page) = pubtabnet_page(value, 1) {
72 return vec![page];
73 }
74 if let Some(page) = word_boxes_page(value, 1) {
75 return vec![page];
76 }
77 if let Some(page) = grid_cells_page(value, 1) {
78 return vec![page];
79 }
80
81 match value {
82 Value::Array(items) => items
83 .iter()
84 .enumerate()
85 .map(|(index, item)| generic_page_from_value(item, index + 1))
86 .collect(),
87 Value::Object(object) => {
88 if let Some(Value::Array(pages)) = object.get("pages") {
89 return pages
90 .iter()
91 .enumerate()
92 .map(|(index, item)| generic_page_from_value(item, index + 1))
93 .collect();
94 }
95 vec![generic_page_from_value(value, 1)]
96 }
97 _ => vec![generic_page_from_value(value, 1)],
98 }
99}
100
101fn grid_cells_page(value: &Value, page_number: usize) -> Option<Page> {
102 let object = value.as_object()?;
103 let cell_rows = object.get("cells")?.as_array()?;
104 let mut rows = Vec::new();
105 let mut table_cells = Vec::new();
106
107 for (row_index, row) in cell_rows.iter().enumerate() {
108 let Some(row_cells) = row.as_array() else {
109 continue;
110 };
111 let mut text_row = Vec::new();
112 for (column_index, cell) in row_cells.iter().enumerate() {
113 let text = pubtabnet_cell_text(Some(cell));
114 text_row.push(text.clone());
115 table_cells.push(TableCell {
116 row: row_index,
117 column: column_index,
118 text,
119 bbox: cell.get("bbox").and_then(bbox_from_rect),
120 is_header: row_index == 0,
121 col_span: 1,
122 row_span: 1,
123 });
124 }
125 if !text_row.is_empty() {
126 rows.push(text_row);
127 }
128 }
129
130 if rows.is_empty() {
131 return None;
132 }
133 let bbox = object
134 .get("table_bbox")
135 .and_then(bbox_from_rect)
136 .or_else(|| inferred_table_cell_bbox(&table_cells));
137 let (headers, rows) = split_table_rows(rows);
138
139 Some(Page {
140 number: page_number,
141 width: bbox.map(|bbox| bbox.x + bbox.width),
142 height: bbox.map(|bbox| bbox.y + bbox.height),
143 rotation: None,
144 route: None,
145 bbox: bbox.map(|bbox| BBox {
146 x: 0.0,
147 y: 0.0,
148 width: bbox.x + bbox.width,
149 height: bbox.y + bbox.height,
150 }),
151 blocks: vec![Block::Table(TableBlock {
152 headers,
153 rows,
154 caption: None,
155 bbox,
156 cells: table_cells,
157 source_anchors: vec![source_anchor(page_number, bbox)],
158 confidence: Some(confidence()), ..Default::default()
159 })],
160 images: Vec::new(),
161 assets: Vec::new(),
162 warnings: Vec::new(),
163 })
164}
165
166fn inferred_table_cell_bbox(cells: &[TableCell]) -> Option<BBox> {
167 let mut min_x = f32::INFINITY;
168 let mut min_y = f32::INFINITY;
169 let mut max_x = f32::NEG_INFINITY;
170 let mut max_y = f32::NEG_INFINITY;
171 let mut has_bbox = false;
172 for cell in cells {
173 let Some(bbox) = cell.bbox else {
174 continue;
175 };
176 has_bbox = true;
177 min_x = min_x.min(bbox.x);
178 min_y = min_y.min(bbox.y);
179 max_x = max_x.max(bbox.x + bbox.width);
180 max_y = max_y.max(bbox.y + bbox.height);
181 }
182 has_bbox.then_some(BBox {
183 x: min_x,
184 y: min_y,
185 width: max_x - min_x,
186 height: max_y - min_y,
187 })
188}
189
190fn renumber_pages(pages: &mut [Page], first_page_number: usize) {
191 for (offset, page) in pages.iter_mut().enumerate() {
192 let page_number = first_page_number + offset;
193 page.number = page_number;
194 for block in &mut page.blocks {
195 match block {
196 Block::Text(text) => {
197 for anchor in &mut text.source_anchors {
198 anchor.page_number = page_number;
199 }
200 }
201 Block::Table(table) => {
202 for anchor in &mut table.source_anchors {
203 anchor.page_number = page_number;
204 }
205 }
206 Block::Figure(figure) => {
207 for anchor in &mut figure.source_anchors {
208 anchor.page_number = page_number;
209 }
210 }
211 }
212 }
213 }
214}
215
216fn pubtabnet_page(value: &Value, page_number: usize) -> Option<Page> {
217 let object = value.as_object()?;
218 let html = object.get("html")?.as_object()?;
219 let structure = html
220 .get("structure")
221 .and_then(Value::as_object)?
222 .get("tokens")
223 .and_then(Value::as_array)?;
224 let cells = html.get("cells").or_else(|| html.get("cell"))?.as_array()?;
225 let rows = pubtabnet_rows(structure, cells);
226 if rows.is_empty() {
227 return None;
228 }
229 let table_cells = pubtabnet_table_cells(&rows);
230 let bbox = inferred_table_cell_bbox(&table_cells);
231 let (headers, rows) = split_table_rows(
232 rows.iter()
233 .map(|row| row.cells.iter().map(|cell| cell.text.clone()).collect())
234 .collect(),
235 );
236
237 Some(Page {
238 number: page_number,
239 width: bbox.map(|bbox| bbox.x + bbox.width),
240 height: bbox.map(|bbox| bbox.y + bbox.height),
241 rotation: None,
242 route: None,
243 bbox: bbox.map(|bbox| BBox {
244 x: 0.0,
245 y: 0.0,
246 width: bbox.x + bbox.width,
247 height: bbox.y + bbox.height,
248 }),
249 blocks: vec![Block::Table(TableBlock {
250 headers,
251 rows,
252 caption: None,
253 bbox,
254 cells: table_cells,
255 source_anchors: vec![source_anchor(page_number, bbox)],
256 confidence: Some(confidence()), ..Default::default()
257 })],
258 images: Vec::new(),
259 assets: Vec::new(),
260 warnings: Vec::new(),
261 })
262}
263
264#[derive(Debug)]
265struct PubTabNetRow {
266 cells: Vec<PubTabNetCell>,
267}
268
269#[derive(Debug)]
270struct PubTabNetCell {
271 text: String,
272 bbox: Option<BBox>,
273}
274
275fn pubtabnet_rows(structure: &[Value], cells: &[Value]) -> Vec<PubTabNetRow> {
276 let mut rows = Vec::new();
277 let mut current_row: Option<PubTabNetRow> = None;
278 let mut cell_index = 0usize;
279
280 for token in structure.iter().filter_map(Value::as_str) {
281 let normalized = token.trim().to_ascii_lowercase();
282 if normalized.starts_with("<tr") && !normalized.starts_with("</") {
283 current_row = Some(PubTabNetRow { cells: Vec::new() });
284 } else if normalized.starts_with("</tr") {
285 if let Some(row) = current_row.take() {
286 if !row.cells.is_empty() {
287 rows.push(row);
288 }
289 }
290 } else if is_pubtabnet_cell_open(&normalized) {
291 let Some(row) = current_row.as_mut() else {
292 continue;
293 };
294 row.cells.push(pubtabnet_cell(cells.get(cell_index)));
295 cell_index += 1;
296 }
297 }
298
299 rows
300}
301
302fn pubtabnet_table_cells(rows: &[PubTabNetRow]) -> Vec<TableCell> {
303 rows.iter()
304 .enumerate()
305 .flat_map(|(row_index, row)| {
306 row.cells
307 .iter()
308 .enumerate()
309 .map(move |(column_index, cell)| TableCell {
310 row: row_index,
311 column: column_index,
312 text: cell.text.clone(),
313 bbox: cell.bbox,
314 is_header: row_index == 0,
315 col_span: 1,
316 row_span: 1,
317 })
318 })
319 .collect()
320}
321
/// Returns `true` when `token` opens a table cell, i.e. starts a `<td` or
/// `<th` tag.
///
/// The token is expected to be trimmed and lower-cased already. Both complete
/// tags (`<td>`) and bare tag starts (`<td`, with attributes in later tokens)
/// are accepted. Tags whose name merely begins with `td`/`th`, most notably
/// `<thead>`, are rejected, as are closing tags.
fn is_pubtabnet_cell_open(token: &str) -> bool {
    let after_name = token
        .strip_prefix("<td")
        .or_else(|| token.strip_prefix("<th"));
    match after_name {
        // The tag name must end right after `td`/`th`; a further letter or
        // digit means this is a different element such as `<thead>`.
        Some(rest) => !rest.starts_with(|c: char| c.is_ascii_alphanumeric()),
        None => false,
    }
}
325
326fn pubtabnet_cell_text(cell: Option<&Value>) -> String {
327 let Some(cell) = cell.and_then(Value::as_object) else {
328 return String::new();
329 };
330 let text = cell
331 .get("tokens")
332 .and_then(Value::as_array)
333 .map(|tokens| {
334 tokens
335 .iter()
336 .filter_map(Value::as_str)
337 .collect::<Vec<_>>()
338 .join("")
339 })
340 .or_else(|| cell.get("text").and_then(Value::as_str).map(str::to_owned))
341 .unwrap_or_default();
342 clean_text(&html_to_text(&text))
343}
344
345fn pubtabnet_cell(cell: Option<&Value>) -> PubTabNetCell {
346 PubTabNetCell {
347 text: pubtabnet_cell_text(cell),
348 bbox: cell
349 .and_then(Value::as_object)
350 .and_then(|cell| cell.get("bbox"))
351 .and_then(bbox_from_rect),
352 }
353}
354
355fn word_boxes_page(value: &Value, page_number: usize) -> Option<Page> {
356 let object = value.as_object()?;
357 let words = object.get("words")?.as_array()?;
358 let mut blocks = words
359 .iter()
360 .filter_map(|word| word.as_object())
361 .filter_map(|word| word_box_block(word, page_number))
362 .collect::<Vec<_>>();
363 if blocks.is_empty() {
364 return None;
365 }
366 blocks.sort_by(|left, right| {
367 let left_bbox = block_bbox(left);
368 let right_bbox = block_bbox(right);
369 match (left_bbox, right_bbox) {
370 (Some(left), Some(right)) => left
371 .y
372 .partial_cmp(&right.y)
373 .unwrap_or(Ordering::Equal)
374 .then_with(|| left.x.partial_cmp(&right.x).unwrap_or(Ordering::Equal)),
375 _ => Ordering::Equal,
376 }
377 });
378
379 let width = first_numeric_field(object, &["image_width", "page_width", "width"]);
380 let height = first_numeric_field(object, &["image_height", "page_height", "height"]);
381 let bbox = page_bbox(width, height).or_else(|| inferred_page_bbox(&blocks));
382
383 Some(Page {
384 number: page_number,
385 width: width.or_else(|| bbox.map(|bbox| bbox.width)),
386 height: height.or_else(|| bbox.map(|bbox| bbox.height)),
387 rotation: None,
388 bbox,
389 blocks,
390 images: Vec::new(),
391 assets: Vec::new(),
392 warnings: Vec::new(), ..Default::default()
393 })
394}
395
396fn word_box_block(word: &Map<String, Value>, page_number: usize) -> Option<Block> {
397 let text = first_string_field(word, &["text", "word", "value"]).map(clean_text)?;
398 if text.is_empty() {
399 return None;
400 }
401 let bbox = first_bbox_field(
402 word,
403 &[
404 "bbox",
405 "box",
406 "image_bbox",
407 "pdf_bbox",
408 "rect",
409 "bounds",
410 "bounding_box",
411 ],
412 );
413
414 Some(Block::Text(TextBlock {
415 text,
416 kind: "word".to_owned(),
417 bbox,
418 lines: Vec::new(),
419 source_anchors: vec![source_anchor(page_number, bbox)],
420 confidence: Some(confidence()), ..Default::default()
421 }))
422}
423
424fn coco_pages(value: &Value) -> Option<Vec<Page>> {
425 let object = value.as_object()?;
426 let images = object.get("images")?.as_array()?;
427 let annotations = object.get("annotations")?.as_array()?;
428 let categories = coco_categories(object.get("categories").and_then(Value::as_array));
429 if images.is_empty() {
430 return None;
431 }
432
433 let mut annotations_by_image: HashMap<String, Vec<&Map<String, Value>>> = HashMap::new();
434 for annotation in annotations.iter().filter_map(Value::as_object) {
435 let Some(image_id) = annotation.get("image_id").map(value_key) else {
436 continue;
437 };
438 annotations_by_image
439 .entry(image_id)
440 .or_default()
441 .push(annotation);
442 }
443
444 let mut pages = Vec::new();
445 for (index, image) in images.iter().filter_map(Value::as_object).enumerate() {
446 let Some(image_id) = image.get("id").map(value_key) else {
447 continue;
448 };
449 let width = numeric_field(image, "width");
450 let height = numeric_field(image, "height");
451 let page_number = index + 1;
452 let mut page_annotations = annotations_by_image.remove(&image_id).unwrap_or_default();
453 page_annotations.sort_by(|left, right| {
454 let left_bbox = left.get("bbox").and_then(bbox_from_coco_rect);
455 let right_bbox = right.get("bbox").and_then(bbox_from_coco_rect);
456 match (left_bbox, right_bbox) {
457 (Some(left), Some(right)) => left
458 .y
459 .partial_cmp(&right.y)
460 .unwrap_or(Ordering::Equal)
461 .then_with(|| left.x.partial_cmp(&right.x).unwrap_or(Ordering::Equal)),
462 _ => Ordering::Equal,
463 }
464 });
465
466 let blocks = page_annotations
467 .into_iter()
468 .filter_map(|annotation| coco_block(annotation, &categories, page_number))
469 .collect::<Vec<_>>();
470 pages.push(Page {
471 number: page_number,
472 width,
473 height,
474 rotation: None,
475 bbox: page_bbox(width, height),
476 blocks,
477 images: Vec::new(),
478 assets: Vec::new(),
479 warnings: Vec::new(), ..Default::default()
480 });
481 }
482
483 (!pages.is_empty()).then_some(pages)
484}
485
486fn coco_categories(categories: Option<&Vec<Value>>) -> HashMap<String, String> {
487 let mut names = HashMap::new();
488 for category in categories
489 .into_iter()
490 .flatten()
491 .filter_map(Value::as_object)
492 {
493 let Some(id) = category.get("id").map(value_key) else {
494 continue;
495 };
496 let name = category
497 .get("name")
498 .and_then(Value::as_str)
499 .unwrap_or("layout")
500 .to_owned();
501 names.insert(id, name);
502 }
503 names
504}
505
506fn coco_block(
507 annotation: &Map<String, Value>,
508 categories: &HashMap<String, String>,
509 page_number: usize,
510) -> Option<Block> {
511 let bbox = annotation.get("bbox").and_then(bbox_from_coco_rect)?;
512 let category_id = annotation.get("category_id").map(value_key);
513 let kind = category_id
514 .as_ref()
515 .and_then(|id| categories.get(id))
516 .cloned()
517 .unwrap_or_else(|| "layout".to_owned());
518
519 Some(Block::Text(TextBlock {
520 text: kind.clone(),
521 kind,
522 bbox: Some(bbox),
523 lines: Vec::new(),
524 source_anchors: vec![source_anchor(page_number, Some(bbox))],
525 confidence: Some(confidence()), ..Default::default()
526 }))
527}
528
529fn funsd_page(value: &Value) -> Option<Page> {
530 let form = value.as_object()?.get("form")?.as_array()?;
531 let mut fields = form.iter().filter_map(Value::as_object).collect::<Vec<_>>();
532 if fields.is_empty() {
533 return None;
534 }
535 fields.sort_by(|left, right| {
536 let left_bbox = left.get("box").and_then(bbox_from_rect);
537 let right_bbox = right.get("box").and_then(bbox_from_rect);
538 match (left_bbox, right_bbox) {
539 (Some(left), Some(right)) => left
540 .y
541 .partial_cmp(&right.y)
542 .unwrap_or(Ordering::Equal)
543 .then_with(|| left.x.partial_cmp(&right.x).unwrap_or(Ordering::Equal)),
544 _ => Ordering::Equal,
545 }
546 });
547
548 let blocks = fields
549 .into_iter()
550 .filter_map(funsd_block)
551 .collect::<Vec<_>>();
552 if blocks.is_empty() {
553 return None;
554 }
555 let bbox = inferred_page_bbox(&blocks);
556
557 Some(Page {
558 number: 1,
559 width: bbox.map(|bbox| bbox.width),
560 height: bbox.map(|bbox| bbox.height),
561 rotation: None,
562 bbox,
563 blocks,
564 images: Vec::new(),
565 assets: Vec::new(),
566 warnings: Vec::new(), ..Default::default()
567 })
568}
569
570fn funsd_block(field: &Map<String, Value>) -> Option<Block> {
571 let text = field.get("text").and_then(Value::as_str).map(clean_text)?;
572 if text.is_empty() {
573 return None;
574 }
575 let bbox = field.get("box").and_then(bbox_from_rect);
576 let kind = field
577 .get("label")
578 .and_then(Value::as_str)
579 .unwrap_or("field")
580 .to_owned();
581
582 Some(Block::Text(TextBlock {
583 text,
584 kind,
585 bbox,
586 lines: Vec::new(),
587 source_anchors: vec![source_anchor(1, bbox)],
588 confidence: Some(confidence()), ..Default::default()
589 }))
590}
591
592fn omnidocbench_pages(value: &Value) -> Option<Vec<Page>> {
593 let items = match value {
594 Value::Array(items) => items.as_slice(),
595 Value::Object(object) => object.get("pages")?.as_array()?.as_slice(),
596 _ => return None,
597 };
598 if items
599 .iter()
600 .all(|item| item.get("layout_dets").and_then(Value::as_array).is_none())
601 {
602 return None;
603 }
604
605 let mut pages = Vec::new();
606 for (index, item) in items.iter().enumerate() {
607 let Some(object) = item.as_object() else {
608 continue;
609 };
610 let Some(layout_dets) = object.get("layout_dets").and_then(Value::as_array) else {
611 continue;
612 };
613 let page_info = object.get("page_info").and_then(Value::as_object);
614 let width = page_info.and_then(|info| numeric_field(info, "width"));
615 let height = page_info.and_then(|info| numeric_field(info, "height"));
616 let page_number = index + 1;
617 let mut detections = layout_dets
618 .iter()
619 .filter_map(Value::as_object)
620 .collect::<Vec<_>>();
621 detections.sort_by(|left, right| {
622 order_value(left)
623 .partial_cmp(&order_value(right))
624 .unwrap_or(Ordering::Equal)
625 });
626
627 let blocks = detections
628 .into_iter()
629 .filter(|detection| !bool_field(detection, "ignore"))
630 .filter_map(|detection| block_from_layout_detection(detection, page_number))
631 .collect::<Vec<_>>();
632
633 pages.push(Page {
634 number: page_number,
635 width,
636 height,
637 rotation: None,
638 bbox: page_bbox(width, height),
639 blocks,
640 images: Vec::new(),
641 assets: Vec::new(),
642 warnings: Vec::new(), ..Default::default()
643 });
644 }
645
646 (!pages.is_empty()).then_some(pages)
647}
648
649fn block_from_layout_detection(
650 detection: &Map<String, Value>,
651 page_number: usize,
652) -> Option<Block> {
653 let category = detection
654 .get("category_type")
655 .and_then(Value::as_str)
656 .unwrap_or("annotation");
657 let bbox = detection.get("poly").and_then(bbox_from_poly);
658
659 if category == "table" {
660 if let Some(html) = first_string_field(detection, &["html", "html_2", "html_3"]) {
661 let rows = html_table_rows(html);
662 if !rows.is_empty() {
663 let (headers, rows) = split_table_rows(rows);
664 return Some(Block::Table(TableBlock {
665 headers,
666 rows,
667 caption: None,
668 bbox,
669 cells: Vec::new(),
670 source_anchors: vec![source_anchor(page_number, bbox)],
671 confidence: Some(confidence()), ..Default::default()
672 }));
673 }
674 }
675 }
676
677 if let Some(text) = layout_detection_text(detection) {
678 return Some(Block::Text(TextBlock {
679 text,
680 kind: category.to_owned(),
681 bbox,
682 lines: Vec::new(),
683 source_anchors: vec![source_anchor(page_number, bbox)],
684 confidence: Some(confidence()), ..Default::default()
685 }));
686 }
687
688 if category == "figure" || category == "chart_mask" {
689 return Some(Block::Figure(FigureBlock {
690 alt_text: None,
691 caption: None,
692 bbox,
693 image_ref: None,
694 source_anchors: vec![source_anchor(page_number, bbox)],
695 confidence: Some(confidence()), ..Default::default()
696 }));
697 }
698
699 None
700}
701
702fn layout_detection_text(detection: &Map<String, Value>) -> Option<String> {
703 first_string_field(detection, &["text", "latex"])
704 .map(clean_text)
705 .filter(|text| !text.is_empty())
706 .or_else(|| {
707 first_string_field(detection, &["html", "html_2", "html_3"])
708 .map(html_to_text)
709 .map(|text| clean_text(&text))
710 .filter(|text| !text.is_empty())
711 })
712}
713
714fn generic_page_from_value(value: &Value, page_number: usize) -> Page {
715 let mut records = Vec::new();
716 collect_generic_text_records(value, &mut records);
717 if records.is_empty() {
718 if let Some(text) = scalar_text(value) {
719 records.push(TextRecord {
720 kind: "value".to_owned(),
721 text,
722 });
723 }
724 }
725
726 let blocks = records
727 .into_iter()
728 .filter(|record| !record.text.is_empty())
729 .map(|record| {
730 Block::Text(TextBlock {
731 text: record.text,
732 kind: record.kind,
733 bbox: None,
734 lines: Vec::new(),
735 source_anchors: vec![source_anchor(page_number, None)],
736 confidence: Some(confidence()), ..Default::default()
737 })
738 })
739 .collect();
740
741 Page {
742 number: page_number,
743 width: None,
744 height: None,
745 rotation: None,
746 bbox: None,
747 blocks,
748 images: Vec::new(),
749 assets: Vec::new(),
750 warnings: Vec::new(), ..Default::default()
751 }
752}
753
754fn collect_generic_text_records(value: &Value, records: &mut Vec<TextRecord>) {
755 match value {
756 Value::Object(object) => {
757 let before = records.len();
758 for key in [
759 "title",
760 "abstract",
761 "body_text",
762 "full_text",
763 "paragraphs",
764 "sections",
765 "content",
766 "body",
767 "text",
768 "latex",
769 "html",
770 "caption",
771 ] {
772 if let Some(child) = object.get(key) {
773 collect_value_for_text_key(key, child, records);
774 }
775 }
776 if records.len() != before {
777 return;
778 }
779
780 for (key, child) in object {
781 if should_recurse_generic_key(key) {
782 collect_generic_text_records(child, records);
783 }
784 }
785 }
786 Value::Array(items) => {
787 for item in items {
788 collect_generic_text_records(item, records);
789 }
790 }
791 Value::String(text) => push_record(records, "text", text),
792 _ => {}
793 }
794}
795
796fn collect_value_for_text_key(key: &str, value: &Value, records: &mut Vec<TextRecord>) {
797 match value {
798 Value::String(text) => {
799 let text = if key == "html" {
800 html_to_text(text)
801 } else {
802 text.clone()
803 };
804 push_record(records, normalized_kind(key), &text);
805 }
806 Value::Array(items) => {
807 for item in items {
808 match item {
809 Value::String(text) => push_record(records, normalized_kind(key), text),
810 Value::Object(_) => collect_generic_text_records(item, records),
811 _ => {}
812 }
813 }
814 }
815 Value::Object(_) => collect_generic_text_records(value, records),
816 _ => {}
817 }
818}
819
820fn push_record(records: &mut Vec<TextRecord>, kind: &str, text: &str) {
821 let text = clean_text(text);
822 if !text.is_empty() {
823 records.push(TextRecord {
824 kind: kind.to_owned(),
825 text,
826 });
827 }
828}
829
830fn build_document(source: &Source, engine_name: &str, mut pages: Vec<Page>) -> Document {
831 if pages.is_empty() {
832 pages.push(Page {
833 number: 1,
834 width: None,
835 height: None,
836 rotation: None,
837 bbox: None,
838 blocks: Vec::new(),
839 images: Vec::new(),
840 assets: Vec::new(),
841 warnings: Vec::new(), ..Default::default()
842 });
843 }
844
845 let (character_count, word_count, block_count) = document_counts(&pages);
846 let title = first_title(&pages);
847 Document {
848 schema_version: SCHEMA_VERSION.to_owned(),
849 metadata: Metadata {
850 format: source.format.clone(),
851 engine: engine_name.to_owned(),
852 source: source.path.clone(),
853 title,
854 character_count,
855 word_count,
856 block_count,
857 file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
858 pdf_version: None,
859 encrypted: false,
860 },
861 pages,
862 assets: Vec::new(),
863 warnings: Vec::new(),
864 }
865}
866
867fn document_counts(pages: &[Page]) -> (usize, usize, usize) {
868 let mut character_count = 0;
869 let mut word_count = 0;
870 let mut block_count = 0;
871 for page in pages {
872 for block in &page.blocks {
873 let text = block_text(block);
874 character_count += text.chars().count();
875 word_count += text.split_whitespace().count();
876 block_count += 1;
877 }
878 }
879 (character_count, word_count, block_count)
880}
881
882fn first_title(pages: &[Page]) -> Option<String> {
883 pages.iter().find_map(|page| {
884 page.blocks.iter().find_map(|block| match block {
885 Block::Text(text) if text.kind == "title" => Some(text.text.clone()),
886 _ => None,
887 })
888 })
889}
890
891fn block_text(block: &Block) -> String {
892 match block {
893 Block::Text(text) => text.text.clone(),
894 Block::Table(table) => {
895 let mut rows = Vec::new();
896 if !table.headers.is_empty() {
897 rows.push(table.headers.join(" "));
898 }
899 rows.extend(table.rows.iter().map(|row| row.join(" ")));
900 rows.join("\n")
901 }
902 Block::Figure(figure) => figure.caption.clone().unwrap_or_default(),
903 }
904}
905
906fn block_bbox(block: &Block) -> Option<BBox> {
907 match block {
908 Block::Text(text) => text.bbox,
909 Block::Table(table) => table.bbox,
910 Block::Figure(figure) => figure.bbox,
911 }
912}
913
914fn html_table_rows(html: &str) -> Vec<Vec<String>> {
915 let lower = html.to_ascii_lowercase();
916 let mut rows = Vec::new();
917 let mut pos = 0;
918
919 while let Some(row_start_offset) = lower[pos..].find("<tr") {
920 let row_start = pos + row_start_offset;
921 let Some(open_end_offset) = lower[row_start..].find('>') else {
922 break;
923 };
924 let content_start = row_start + open_end_offset + 1;
925 let Some(close_offset) = lower[content_start..].find("</tr>") else {
926 break;
927 };
928 let content_end = content_start + close_offset;
929 let row = html_row_cells(&html[content_start..content_end]);
930 if !row.is_empty() {
931 rows.push(row);
932 }
933 pos = content_end + "</tr>".len();
934 }
935
936 rows
937}
938
939fn html_row_cells(row_html: &str) -> Vec<String> {
940 let lower = row_html.to_ascii_lowercase();
941 let mut cells = Vec::new();
942 let mut pos = 0;
943
944 while let Some((tag, cell_start_offset)) = next_cell_tag(&lower[pos..]) {
945 let cell_start = pos + cell_start_offset;
946 let Some(open_end_offset) = lower[cell_start..].find('>') else {
947 break;
948 };
949 let content_start = cell_start + open_end_offset + 1;
950 let close_tag = format!("</{tag}>");
951 let Some(close_offset) = lower[content_start..].find(&close_tag) else {
952 break;
953 };
954 let content_end = content_start + close_offset;
955 let text = clean_text(&html_to_text(&row_html[content_start..content_end]));
956 cells.push(text);
957 pos = content_end + close_tag.len();
958 }
959
960 cells
961}
962
/// Finds the earliest `<td` or `<th` in `input`.
///
/// Returns the tag name (`"td"` or `"th"`) and the byte offset at which the
/// tag starts, or `None` when neither occurs.
fn next_cell_tag(input: &str) -> Option<(&'static str, usize)> {
    const OPENERS: [(&str, &str); 2] = [("td", "<td"), ("th", "<th")];

    OPENERS
        .iter()
        .filter_map(|&(tag, opener)| input.find(opener).map(|at| (tag, at)))
        .min_by_key(|&(_, at)| at)
}
973
/// Splits table rows into `(headers, body_rows)`, taking the first row as the
/// header. An empty input yields two empty vectors.
fn split_table_rows(rows: Vec<Vec<String>>) -> (Vec<String>, Vec<Vec<String>>) {
    let mut remaining = rows.into_iter();
    let headers = remaining.next().unwrap_or_default();
    (headers, remaining.collect())
}
981
982fn source_anchor(page_number: usize, bbox: Option<BBox>) -> SourceAnchor {
983 SourceAnchor {
984 page_number,
985 pdf_object_ids: Vec::new(),
986 bbox,
987 extraction_method: EXTRACTION_METHOD.to_owned(),
988 }
989}
990
991fn confidence() -> Confidence {
992 Confidence {
993 score: 0.9,
994 calibrated: false,
995 }
996}
997
998fn bbox_from_poly(value: &Value) -> Option<BBox> {
999 let points = value.as_array()?;
1000 if points.len() < 4 {
1001 return None;
1002 }
1003
1004 let mut xs = Vec::new();
1005 let mut ys = Vec::new();
1006 for pair in points.chunks(2) {
1007 if pair.len() != 2 {
1008 continue;
1009 }
1010 xs.push(pair[0].as_f64()? as f32);
1011 ys.push(pair[1].as_f64()? as f32);
1012 }
1013 if xs.is_empty() || ys.is_empty() {
1014 return None;
1015 }
1016 let min_x = xs.iter().copied().fold(f32::INFINITY, f32::min);
1017 let max_x = xs.iter().copied().fold(f32::NEG_INFINITY, f32::max);
1018 let min_y = ys.iter().copied().fold(f32::INFINITY, f32::min);
1019 let max_y = ys.iter().copied().fold(f32::NEG_INFINITY, f32::max);
1020 Some(BBox {
1021 x: min_x,
1022 y: min_y,
1023 width: max_x - min_x,
1024 height: max_y - min_y,
1025 })
1026}
1027
1028fn bbox_from_rect(value: &Value) -> Option<BBox> {
1029 let coordinates = value.as_array()?;
1030 if coordinates.len() < 4 {
1031 return None;
1032 }
1033 let left = coordinates[0].as_f64()? as f32;
1034 let top = coordinates[1].as_f64()? as f32;
1035 let right = coordinates[2].as_f64()? as f32;
1036 let bottom = coordinates[3].as_f64()? as f32;
1037 Some(BBox {
1038 x: left.min(right),
1039 y: top.min(bottom),
1040 width: (right - left).abs(),
1041 height: (bottom - top).abs(),
1042 })
1043}
1044
1045fn bbox_from_coco_rect(value: &Value) -> Option<BBox> {
1046 let coordinates = value.as_array()?;
1047 if coordinates.len() != 4 {
1048 return None;
1049 }
1050 Some(BBox {
1051 x: coordinates[0].as_f64()? as f32,
1052 y: coordinates[1].as_f64()? as f32,
1053 width: coordinates[2].as_f64()? as f32,
1054 height: coordinates[3].as_f64()? as f32,
1055 })
1056}
1057
1058fn inferred_page_bbox(blocks: &[Block]) -> Option<BBox> {
1059 let mut max_x = 0.0f32;
1060 let mut max_y = 0.0f32;
1061 let mut has_bbox = false;
1062 for block in blocks {
1063 let bbox = match block {
1064 Block::Text(text) => text.bbox,
1065 Block::Table(table) => table.bbox,
1066 Block::Figure(figure) => figure.bbox,
1067 };
1068 let Some(bbox) = bbox else {
1069 continue;
1070 };
1071 has_bbox = true;
1072 max_x = max_x.max(bbox.x + bbox.width);
1073 max_y = max_y.max(bbox.y + bbox.height);
1074 }
1075
1076 has_bbox.then_some(BBox {
1077 x: 0.0,
1078 y: 0.0,
1079 width: max_x,
1080 height: max_y,
1081 })
1082}
1083
1084fn page_bbox(width: Option<f32>, height: Option<f32>) -> Option<BBox> {
1085 Some(BBox {
1086 x: 0.0,
1087 y: 0.0,
1088 width: width?,
1089 height: height?,
1090 })
1091}
1092
1093fn order_value(object: &Map<String, Value>) -> f64 {
1094 object
1095 .get("order")
1096 .and_then(Value::as_f64)
1097 .unwrap_or(f64::INFINITY)
1098}
1099
1100fn numeric_field(object: &Map<String, Value>, key: &str) -> Option<f32> {
1101 object.get(key)?.as_f64().map(|value| value as f32)
1102}
1103
1104fn bool_field(object: &Map<String, Value>, key: &str) -> bool {
1105 object.get(key).and_then(Value::as_bool).unwrap_or(false)
1106}
1107
1108fn first_string_field<'a>(object: &'a Map<String, Value>, keys: &[&str]) -> Option<&'a str> {
1109 keys.iter()
1110 .find_map(|key| object.get(*key).and_then(Value::as_str))
1111}
1112
1113fn first_numeric_field(object: &Map<String, Value>, keys: &[&str]) -> Option<f32> {
1114 keys.iter().find_map(|key| numeric_field(object, key))
1115}
1116
1117fn first_bbox_field(object: &Map<String, Value>, keys: &[&str]) -> Option<BBox> {
1118 keys.iter().find_map(|key| {
1119 object
1120 .get(*key)
1121 .and_then(|value| bbox_from_rect(value).or_else(|| bbox_from_object(value)))
1122 })
1123}
1124
1125fn bbox_from_object(value: &Value) -> Option<BBox> {
1126 let object = value.as_object()?;
1127 if let (Some(left), Some(top), Some(right), Some(bottom)) = (
1128 first_numeric_field(object, &["x1", "left", "l"]),
1129 first_numeric_field(object, &["y1", "top", "t"]),
1130 first_numeric_field(object, &["x2", "right", "r"]),
1131 first_numeric_field(object, &["y2", "bottom", "b"]),
1132 ) {
1133 return Some(BBox {
1134 x: left.min(right),
1135 y: top.min(bottom),
1136 width: (right - left).abs(),
1137 height: (bottom - top).abs(),
1138 });
1139 }
1140
1141 let x = first_numeric_field(object, &["x", "left"])?;
1142 let y = first_numeric_field(object, &["y", "top"])?;
1143 let width = first_numeric_field(object, &["width", "w"])?;
1144 let height = first_numeric_field(object, &["height", "h"])?;
1145 Some(BBox {
1146 x,
1147 y,
1148 width,
1149 height,
1150 })
1151}
1152
1153fn value_key(value: &Value) -> String {
1154 match value {
1155 Value::String(text) => text.clone(),
1156 Value::Number(number) => number.to_string(),
1157 Value::Bool(boolean) => boolean.to_string(),
1158 _ => value.to_string(),
1159 }
1160}
1161
1162fn scalar_text(value: &Value) -> Option<String> {
1163 match value {
1164 Value::String(text) => Some(clean_text(text)),
1165 Value::Number(number) => Some(number.to_string()),
1166 Value::Bool(boolean) => Some(boolean.to_string()),
1167 _ => None,
1168 }
1169 .filter(|text| !text.is_empty())
1170}
1171
/// Normalises whitespace: leading and trailing whitespace is removed and every
/// internal run of whitespace collapses to a single space.
fn clean_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first.
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}
1175
/// Maps a JSON key to the block kind used for its text.
///
/// Body-like keys all become `"paragraph"`; every other key is used as the
/// kind unchanged.
fn normalized_kind(key: &str) -> &str {
    const PARAGRAPH_KEYS: &[&str] = &[
        "body_text",
        "full_text",
        "content",
        "body",
        "paragraphs",
        "sections",
    ];

    if PARAGRAPH_KEYS.contains(&key) {
        "paragraph"
    } else {
        key
    }
}
1183
/// Decides whether the generic text collector descends into the field `key`.
///
/// Fields on the skip list below (identifiers, paths, URLs, metadata and
/// similar) are not traversed; every other key is.
fn should_recurse_generic_key(key: &str) -> bool {
    const SKIPPED_KEYS: &[&str] = &[
        "id",
        "anno_id",
        "image",
        "image_path",
        "pdf",
        "pdf_path",
        "path",
        "url",
        "source",
        "metadata",
        "page_info",
        "category_type",
        "attribute",
    ];

    !SKIPPED_KEYS.contains(&key)
}