Skip to main content

dongler_core/
json.rs

1use std::cmp::Ordering;
2use std::collections::HashMap;
3
4use serde_json::{Map, Value};
5
6use crate::engine::ExtractionEngine;
7use crate::error::Result;
8use crate::ir::{
9    BBox, Block, Confidence, Document, FigureBlock, Metadata, Page, SourceAnchor, TableBlock,
10    TableCell, TextBlock, SCHEMA_VERSION,
11};
12use crate::source::Source;
13use crate::textual::html_to_text;
14
/// Extraction-method label written into every source anchor built by this module.
const EXTRACTION_METHOD: &str = "json_native";
16
17#[derive(Debug, Default, Clone, Copy)]
18pub struct JsonEngine;
19
20impl ExtractionEngine for JsonEngine {
21    fn name(&self) -> &'static str {
22        "json-native"
23    }
24
25    fn extract(&self, source: &Source) -> Result<Document> {
26        let pages = parse_json_pages(&source.content)?;
27        Ok(build_document(source, self.name(), pages))
28    }
29}
30
/// One piece of text harvested from generic JSON, together with the label of the field it
/// was found under.
#[derive(Debug)]
struct TextRecord {
    /// Label that becomes the `kind` of the text block built from this record.
    kind: String,
    /// Text content of the record.
    text: String,
}
36
37fn parse_json_pages(content: &str) -> Result<Vec<Page>> {
38    match serde_json::from_str::<Value>(content) {
39        Ok(value) => Ok(pages_from_json_value(&value)),
40        Err(json_error) => {
41            let mut pages = Vec::new();
42            for (index, line) in content.lines().enumerate() {
43                let trimmed = line.trim();
44                if trimmed.is_empty() {
45                    continue;
46                }
47                let value = serde_json::from_str::<Value>(trimmed)?;
48                let mut value_pages = pages_from_json_value(&value);
49                renumber_pages(&mut value_pages, index + 1);
50                pages.extend(value_pages);
51            }
52            if pages.is_empty() {
53                Err(json_error.into())
54            } else {
55                Ok(pages)
56            }
57        }
58    }
59}
60
61fn pages_from_json_value(value: &Value) -> Vec<Page> {
62    if let Some(pages) = omnidocbench_pages(value) {
63        return pages;
64    }
65    if let Some(page) = funsd_page(value) {
66        return vec![page];
67    }
68    if let Some(pages) = coco_pages(value) {
69        return pages;
70    }
71    if let Some(page) = pubtabnet_page(value, 1) {
72        return vec![page];
73    }
74    if let Some(page) = word_boxes_page(value, 1) {
75        return vec![page];
76    }
77    if let Some(page) = grid_cells_page(value, 1) {
78        return vec![page];
79    }
80
81    match value {
82        Value::Array(items) => items
83            .iter()
84            .enumerate()
85            .map(|(index, item)| generic_page_from_value(item, index + 1))
86            .collect(),
87        Value::Object(object) => {
88            if let Some(Value::Array(pages)) = object.get("pages") {
89                return pages
90                    .iter()
91                    .enumerate()
92                    .map(|(index, item)| generic_page_from_value(item, index + 1))
93                    .collect();
94            }
95            vec![generic_page_from_value(value, 1)]
96        }
97        _ => vec![generic_page_from_value(value, 1)],
98    }
99}
100
101fn grid_cells_page(value: &Value, page_number: usize) -> Option<Page> {
102    let object = value.as_object()?;
103    let cell_rows = object.get("cells")?.as_array()?;
104    let mut rows = Vec::new();
105    let mut table_cells = Vec::new();
106
107    for (row_index, row) in cell_rows.iter().enumerate() {
108        let Some(row_cells) = row.as_array() else {
109            continue;
110        };
111        let mut text_row = Vec::new();
112        for (column_index, cell) in row_cells.iter().enumerate() {
113            let text = pubtabnet_cell_text(Some(cell));
114            text_row.push(text.clone());
115            table_cells.push(TableCell {
116                row: row_index,
117                column: column_index,
118                text,
119                bbox: cell.get("bbox").and_then(bbox_from_rect),
120                is_header: row_index == 0,
121                col_span: 1,
122                row_span: 1,
123            });
124        }
125        if !text_row.is_empty() {
126            rows.push(text_row);
127        }
128    }
129
130    if rows.is_empty() {
131        return None;
132    }
133    let bbox = object
134        .get("table_bbox")
135        .and_then(bbox_from_rect)
136        .or_else(|| inferred_table_cell_bbox(&table_cells));
137    let (headers, rows) = split_table_rows(rows);
138
139    Some(Page {
140        number: page_number,
141        width: bbox.map(|bbox| bbox.x + bbox.width),
142        height: bbox.map(|bbox| bbox.y + bbox.height),
143        rotation: None,
144        route: None,
145        bbox: bbox.map(|bbox| BBox {
146            x: 0.0,
147            y: 0.0,
148            width: bbox.x + bbox.width,
149            height: bbox.y + bbox.height,
150        }),
151        blocks: vec![Block::Table(TableBlock {
152            headers,
153            rows,
154            caption: None,
155            bbox,
156            cells: table_cells,
157            source_anchors: vec![source_anchor(page_number, bbox)],
158            confidence: Some(confidence()), ..Default::default()
159        })],
160        images: Vec::new(),
161        assets: Vec::new(),
162        warnings: Vec::new(),
163    })
164}
165
166fn inferred_table_cell_bbox(cells: &[TableCell]) -> Option<BBox> {
167    let mut min_x = f32::INFINITY;
168    let mut min_y = f32::INFINITY;
169    let mut max_x = f32::NEG_INFINITY;
170    let mut max_y = f32::NEG_INFINITY;
171    let mut has_bbox = false;
172    for cell in cells {
173        let Some(bbox) = cell.bbox else {
174            continue;
175        };
176        has_bbox = true;
177        min_x = min_x.min(bbox.x);
178        min_y = min_y.min(bbox.y);
179        max_x = max_x.max(bbox.x + bbox.width);
180        max_y = max_y.max(bbox.y + bbox.height);
181    }
182    has_bbox.then_some(BBox {
183        x: min_x,
184        y: min_y,
185        width: max_x - min_x,
186        height: max_y - min_y,
187    })
188}
189
190fn renumber_pages(pages: &mut [Page], first_page_number: usize) {
191    for (offset, page) in pages.iter_mut().enumerate() {
192        let page_number = first_page_number + offset;
193        page.number = page_number;
194        for block in &mut page.blocks {
195            match block {
196                Block::Text(text) => {
197                    for anchor in &mut text.source_anchors {
198                        anchor.page_number = page_number;
199                    }
200                }
201                Block::Table(table) => {
202                    for anchor in &mut table.source_anchors {
203                        anchor.page_number = page_number;
204                    }
205                }
206                Block::Figure(figure) => {
207                    for anchor in &mut figure.source_anchors {
208                        anchor.page_number = page_number;
209                    }
210                }
211            }
212        }
213    }
214}
215
216fn pubtabnet_page(value: &Value, page_number: usize) -> Option<Page> {
217    let object = value.as_object()?;
218    let html = object.get("html")?.as_object()?;
219    let structure = html
220        .get("structure")
221        .and_then(Value::as_object)?
222        .get("tokens")
223        .and_then(Value::as_array)?;
224    let cells = html.get("cells").or_else(|| html.get("cell"))?.as_array()?;
225    let rows = pubtabnet_rows(structure, cells);
226    if rows.is_empty() {
227        return None;
228    }
229    let table_cells = pubtabnet_table_cells(&rows);
230    let bbox = inferred_table_cell_bbox(&table_cells);
231    let (headers, rows) = split_table_rows(
232        rows.iter()
233            .map(|row| row.cells.iter().map(|cell| cell.text.clone()).collect())
234            .collect(),
235    );
236
237    Some(Page {
238        number: page_number,
239        width: bbox.map(|bbox| bbox.x + bbox.width),
240        height: bbox.map(|bbox| bbox.y + bbox.height),
241        rotation: None,
242        route: None,
243        bbox: bbox.map(|bbox| BBox {
244            x: 0.0,
245            y: 0.0,
246            width: bbox.x + bbox.width,
247            height: bbox.y + bbox.height,
248        }),
249        blocks: vec![Block::Table(TableBlock {
250            headers,
251            rows,
252            caption: None,
253            bbox,
254            cells: table_cells,
255            source_anchors: vec![source_anchor(page_number, bbox)],
256            confidence: Some(confidence()), ..Default::default()
257        })],
258        images: Vec::new(),
259        assets: Vec::new(),
260        warnings: Vec::new(),
261    })
262}
263
264#[derive(Debug)]
265struct PubTabNetRow {
266    cells: Vec<PubTabNetCell>,
267}
268
269#[derive(Debug)]
270struct PubTabNetCell {
271    text: String,
272    bbox: Option<BBox>,
273}
274
275fn pubtabnet_rows(structure: &[Value], cells: &[Value]) -> Vec<PubTabNetRow> {
276    let mut rows = Vec::new();
277    let mut current_row: Option<PubTabNetRow> = None;
278    let mut cell_index = 0usize;
279
280    for token in structure.iter().filter_map(Value::as_str) {
281        let normalized = token.trim().to_ascii_lowercase();
282        if normalized.starts_with("<tr") && !normalized.starts_with("</") {
283            current_row = Some(PubTabNetRow { cells: Vec::new() });
284        } else if normalized.starts_with("</tr") {
285            if let Some(row) = current_row.take() {
286                if !row.cells.is_empty() {
287                    rows.push(row);
288                }
289            }
290        } else if is_pubtabnet_cell_open(&normalized) {
291            let Some(row) = current_row.as_mut() else {
292                continue;
293            };
294            row.cells.push(pubtabnet_cell(cells.get(cell_index)));
295            cell_index += 1;
296        }
297    }
298
299    rows
300}
301
302fn pubtabnet_table_cells(rows: &[PubTabNetRow]) -> Vec<TableCell> {
303    rows.iter()
304        .enumerate()
305        .flat_map(|(row_index, row)| {
306            row.cells
307                .iter()
308                .enumerate()
309                .map(move |(column_index, cell)| TableCell {
310                    row: row_index,
311                    column: column_index,
312                    text: cell.text.clone(),
313                    bbox: cell.bbox,
314                    is_header: row_index == 0,
315                    col_span: 1,
316                    row_span: 1,
317                })
318        })
319        .collect()
320}
321
/// Returns `true` when `token` opens a table cell, i.e. it is a `<td` or `<th` tag.
///
/// `token` is expected to be trimmed and lower-cased already. The tag name must end right
/// after `td`/`th` (end of token, `>`, `/` or whitespace), so the bare `<td` token that
/// precedes attribute tokens is accepted while `<thead>` and other longer tag names are not.
/// Closing tags (`</td>`, `</th>`) never match.
fn is_pubtabnet_cell_open(token: &str) -> bool {
    ["<td", "<th"]
        .iter()
        .any(|&open| match token.strip_prefix(open) {
            Some(rest) => rest.chars().next().map_or(true, |next| {
                next == '>' || next == '/' || next.is_ascii_whitespace()
            }),
            None => false,
        })
}
325
326fn pubtabnet_cell_text(cell: Option<&Value>) -> String {
327    let Some(cell) = cell.and_then(Value::as_object) else {
328        return String::new();
329    };
330    let text = cell
331        .get("tokens")
332        .and_then(Value::as_array)
333        .map(|tokens| {
334            tokens
335                .iter()
336                .filter_map(Value::as_str)
337                .collect::<Vec<_>>()
338                .join("")
339        })
340        .or_else(|| cell.get("text").and_then(Value::as_str).map(str::to_owned))
341        .unwrap_or_default();
342    clean_text(&html_to_text(&text))
343}
344
345fn pubtabnet_cell(cell: Option<&Value>) -> PubTabNetCell {
346    PubTabNetCell {
347        text: pubtabnet_cell_text(cell),
348        bbox: cell
349            .and_then(Value::as_object)
350            .and_then(|cell| cell.get("bbox"))
351            .and_then(bbox_from_rect),
352    }
353}
354
355fn word_boxes_page(value: &Value, page_number: usize) -> Option<Page> {
356    let object = value.as_object()?;
357    let words = object.get("words")?.as_array()?;
358    let mut blocks = words
359        .iter()
360        .filter_map(|word| word.as_object())
361        .filter_map(|word| word_box_block(word, page_number))
362        .collect::<Vec<_>>();
363    if blocks.is_empty() {
364        return None;
365    }
366    blocks.sort_by(|left, right| {
367        let left_bbox = block_bbox(left);
368        let right_bbox = block_bbox(right);
369        match (left_bbox, right_bbox) {
370            (Some(left), Some(right)) => left
371                .y
372                .partial_cmp(&right.y)
373                .unwrap_or(Ordering::Equal)
374                .then_with(|| left.x.partial_cmp(&right.x).unwrap_or(Ordering::Equal)),
375            _ => Ordering::Equal,
376        }
377    });
378
379    let width = first_numeric_field(object, &["image_width", "page_width", "width"]);
380    let height = first_numeric_field(object, &["image_height", "page_height", "height"]);
381    let bbox = page_bbox(width, height).or_else(|| inferred_page_bbox(&blocks));
382
383    Some(Page {
384        number: page_number,
385        width: width.or_else(|| bbox.map(|bbox| bbox.width)),
386        height: height.or_else(|| bbox.map(|bbox| bbox.height)),
387        rotation: None,
388        bbox,
389        blocks,
390        images: Vec::new(),
391        assets: Vec::new(),
392        warnings: Vec::new(), ..Default::default()
393    })
394}
395
396fn word_box_block(word: &Map<String, Value>, page_number: usize) -> Option<Block> {
397    let text = first_string_field(word, &["text", "word", "value"]).map(clean_text)?;
398    if text.is_empty() {
399        return None;
400    }
401    let bbox = first_bbox_field(
402        word,
403        &[
404            "bbox",
405            "box",
406            "image_bbox",
407            "pdf_bbox",
408            "rect",
409            "bounds",
410            "bounding_box",
411        ],
412    );
413
414    Some(Block::Text(TextBlock {
415        text,
416        kind: "word".to_owned(),
417        bbox,
418        lines: Vec::new(),
419        source_anchors: vec![source_anchor(page_number, bbox)],
420        confidence: Some(confidence()), ..Default::default()
421    }))
422}
423
424fn coco_pages(value: &Value) -> Option<Vec<Page>> {
425    let object = value.as_object()?;
426    let images = object.get("images")?.as_array()?;
427    let annotations = object.get("annotations")?.as_array()?;
428    let categories = coco_categories(object.get("categories").and_then(Value::as_array));
429    if images.is_empty() {
430        return None;
431    }
432
433    let mut annotations_by_image: HashMap<String, Vec<&Map<String, Value>>> = HashMap::new();
434    for annotation in annotations.iter().filter_map(Value::as_object) {
435        let Some(image_id) = annotation.get("image_id").map(value_key) else {
436            continue;
437        };
438        annotations_by_image
439            .entry(image_id)
440            .or_default()
441            .push(annotation);
442    }
443
444    let mut pages = Vec::new();
445    for (index, image) in images.iter().filter_map(Value::as_object).enumerate() {
446        let Some(image_id) = image.get("id").map(value_key) else {
447            continue;
448        };
449        let width = numeric_field(image, "width");
450        let height = numeric_field(image, "height");
451        let page_number = index + 1;
452        let mut page_annotations = annotations_by_image.remove(&image_id).unwrap_or_default();
453        page_annotations.sort_by(|left, right| {
454            let left_bbox = left.get("bbox").and_then(bbox_from_coco_rect);
455            let right_bbox = right.get("bbox").and_then(bbox_from_coco_rect);
456            match (left_bbox, right_bbox) {
457                (Some(left), Some(right)) => left
458                    .y
459                    .partial_cmp(&right.y)
460                    .unwrap_or(Ordering::Equal)
461                    .then_with(|| left.x.partial_cmp(&right.x).unwrap_or(Ordering::Equal)),
462                _ => Ordering::Equal,
463            }
464        });
465
466        let blocks = page_annotations
467            .into_iter()
468            .filter_map(|annotation| coco_block(annotation, &categories, page_number))
469            .collect::<Vec<_>>();
470        pages.push(Page {
471            number: page_number,
472            width,
473            height,
474            rotation: None,
475            bbox: page_bbox(width, height),
476            blocks,
477            images: Vec::new(),
478            assets: Vec::new(),
479            warnings: Vec::new(), ..Default::default()
480        });
481    }
482
483    (!pages.is_empty()).then_some(pages)
484}
485
486fn coco_categories(categories: Option<&Vec<Value>>) -> HashMap<String, String> {
487    let mut names = HashMap::new();
488    for category in categories
489        .into_iter()
490        .flatten()
491        .filter_map(Value::as_object)
492    {
493        let Some(id) = category.get("id").map(value_key) else {
494            continue;
495        };
496        let name = category
497            .get("name")
498            .and_then(Value::as_str)
499            .unwrap_or("layout")
500            .to_owned();
501        names.insert(id, name);
502    }
503    names
504}
505
506fn coco_block(
507    annotation: &Map<String, Value>,
508    categories: &HashMap<String, String>,
509    page_number: usize,
510) -> Option<Block> {
511    let bbox = annotation.get("bbox").and_then(bbox_from_coco_rect)?;
512    let category_id = annotation.get("category_id").map(value_key);
513    let kind = category_id
514        .as_ref()
515        .and_then(|id| categories.get(id))
516        .cloned()
517        .unwrap_or_else(|| "layout".to_owned());
518
519    Some(Block::Text(TextBlock {
520        text: kind.clone(),
521        kind,
522        bbox: Some(bbox),
523        lines: Vec::new(),
524        source_anchors: vec![source_anchor(page_number, Some(bbox))],
525        confidence: Some(confidence()), ..Default::default()
526    }))
527}
528
529fn funsd_page(value: &Value) -> Option<Page> {
530    let form = value.as_object()?.get("form")?.as_array()?;
531    let mut fields = form.iter().filter_map(Value::as_object).collect::<Vec<_>>();
532    if fields.is_empty() {
533        return None;
534    }
535    fields.sort_by(|left, right| {
536        let left_bbox = left.get("box").and_then(bbox_from_rect);
537        let right_bbox = right.get("box").and_then(bbox_from_rect);
538        match (left_bbox, right_bbox) {
539            (Some(left), Some(right)) => left
540                .y
541                .partial_cmp(&right.y)
542                .unwrap_or(Ordering::Equal)
543                .then_with(|| left.x.partial_cmp(&right.x).unwrap_or(Ordering::Equal)),
544            _ => Ordering::Equal,
545        }
546    });
547
548    let blocks = fields
549        .into_iter()
550        .filter_map(funsd_block)
551        .collect::<Vec<_>>();
552    if blocks.is_empty() {
553        return None;
554    }
555    let bbox = inferred_page_bbox(&blocks);
556
557    Some(Page {
558        number: 1,
559        width: bbox.map(|bbox| bbox.width),
560        height: bbox.map(|bbox| bbox.height),
561        rotation: None,
562        bbox,
563        blocks,
564        images: Vec::new(),
565        assets: Vec::new(),
566        warnings: Vec::new(), ..Default::default()
567    })
568}
569
570fn funsd_block(field: &Map<String, Value>) -> Option<Block> {
571    let text = field.get("text").and_then(Value::as_str).map(clean_text)?;
572    if text.is_empty() {
573        return None;
574    }
575    let bbox = field.get("box").and_then(bbox_from_rect);
576    let kind = field
577        .get("label")
578        .and_then(Value::as_str)
579        .unwrap_or("field")
580        .to_owned();
581
582    Some(Block::Text(TextBlock {
583        text,
584        kind,
585        bbox,
586        lines: Vec::new(),
587        source_anchors: vec![source_anchor(1, bbox)],
588        confidence: Some(confidence()), ..Default::default()
589    }))
590}
591
592fn omnidocbench_pages(value: &Value) -> Option<Vec<Page>> {
593    let items = match value {
594        Value::Array(items) => items.as_slice(),
595        Value::Object(object) => object.get("pages")?.as_array()?.as_slice(),
596        _ => return None,
597    };
598    if items
599        .iter()
600        .all(|item| item.get("layout_dets").and_then(Value::as_array).is_none())
601    {
602        return None;
603    }
604
605    let mut pages = Vec::new();
606    for (index, item) in items.iter().enumerate() {
607        let Some(object) = item.as_object() else {
608            continue;
609        };
610        let Some(layout_dets) = object.get("layout_dets").and_then(Value::as_array) else {
611            continue;
612        };
613        let page_info = object.get("page_info").and_then(Value::as_object);
614        let width = page_info.and_then(|info| numeric_field(info, "width"));
615        let height = page_info.and_then(|info| numeric_field(info, "height"));
616        let page_number = index + 1;
617        let mut detections = layout_dets
618            .iter()
619            .filter_map(Value::as_object)
620            .collect::<Vec<_>>();
621        detections.sort_by(|left, right| {
622            order_value(left)
623                .partial_cmp(&order_value(right))
624                .unwrap_or(Ordering::Equal)
625        });
626
627        let blocks = detections
628            .into_iter()
629            .filter(|detection| !bool_field(detection, "ignore"))
630            .filter_map(|detection| block_from_layout_detection(detection, page_number))
631            .collect::<Vec<_>>();
632
633        pages.push(Page {
634            number: page_number,
635            width,
636            height,
637            rotation: None,
638            bbox: page_bbox(width, height),
639            blocks,
640            images: Vec::new(),
641            assets: Vec::new(),
642            warnings: Vec::new(), ..Default::default()
643        });
644    }
645
646    (!pages.is_empty()).then_some(pages)
647}
648
649fn block_from_layout_detection(
650    detection: &Map<String, Value>,
651    page_number: usize,
652) -> Option<Block> {
653    let category = detection
654        .get("category_type")
655        .and_then(Value::as_str)
656        .unwrap_or("annotation");
657    let bbox = detection.get("poly").and_then(bbox_from_poly);
658
659    if category == "table" {
660        if let Some(html) = first_string_field(detection, &["html", "html_2", "html_3"]) {
661            let rows = html_table_rows(html);
662            if !rows.is_empty() {
663                let (headers, rows) = split_table_rows(rows);
664                return Some(Block::Table(TableBlock {
665                    headers,
666                    rows,
667                    caption: None,
668                    bbox,
669                    cells: Vec::new(),
670                    source_anchors: vec![source_anchor(page_number, bbox)],
671                    confidence: Some(confidence()), ..Default::default()
672                }));
673            }
674        }
675    }
676
677    if let Some(text) = layout_detection_text(detection) {
678        return Some(Block::Text(TextBlock {
679            text,
680            kind: category.to_owned(),
681            bbox,
682            lines: Vec::new(),
683            source_anchors: vec![source_anchor(page_number, bbox)],
684            confidence: Some(confidence()), ..Default::default()
685        }));
686    }
687
688    if category == "figure" || category == "chart_mask" {
689        return Some(Block::Figure(FigureBlock {
690            alt_text: None,
691            caption: None,
692            bbox,
693            image_ref: None,
694            source_anchors: vec![source_anchor(page_number, bbox)],
695            confidence: Some(confidence()), ..Default::default()
696        }));
697    }
698
699    None
700}
701
702fn layout_detection_text(detection: &Map<String, Value>) -> Option<String> {
703    first_string_field(detection, &["text", "latex"])
704        .map(clean_text)
705        .filter(|text| !text.is_empty())
706        .or_else(|| {
707            first_string_field(detection, &["html", "html_2", "html_3"])
708                .map(html_to_text)
709                .map(|text| clean_text(&text))
710                .filter(|text| !text.is_empty())
711        })
712}
713
714fn generic_page_from_value(value: &Value, page_number: usize) -> Page {
715    let mut records = Vec::new();
716    collect_generic_text_records(value, &mut records);
717    if records.is_empty() {
718        if let Some(text) = scalar_text(value) {
719            records.push(TextRecord {
720                kind: "value".to_owned(),
721                text,
722            });
723        }
724    }
725
726    let blocks = records
727        .into_iter()
728        .filter(|record| !record.text.is_empty())
729        .map(|record| {
730            Block::Text(TextBlock {
731                text: record.text,
732                kind: record.kind,
733                bbox: None,
734                lines: Vec::new(),
735                source_anchors: vec![source_anchor(page_number, None)],
736                confidence: Some(confidence()), ..Default::default()
737            })
738        })
739        .collect();
740
741    Page {
742        number: page_number,
743        width: None,
744        height: None,
745        rotation: None,
746        bbox: None,
747        blocks,
748        images: Vec::new(),
749        assets: Vec::new(),
750        warnings: Vec::new(), ..Default::default()
751    }
752}
753
754fn collect_generic_text_records(value: &Value, records: &mut Vec<TextRecord>) {
755    match value {
756        Value::Object(object) => {
757            let before = records.len();
758            for key in [
759                "title",
760                "abstract",
761                "body_text",
762                "full_text",
763                "paragraphs",
764                "sections",
765                "content",
766                "body",
767                "text",
768                "latex",
769                "html",
770                "caption",
771            ] {
772                if let Some(child) = object.get(key) {
773                    collect_value_for_text_key(key, child, records);
774                }
775            }
776            if records.len() != before {
777                return;
778            }
779
780            for (key, child) in object {
781                if should_recurse_generic_key(key) {
782                    collect_generic_text_records(child, records);
783                }
784            }
785        }
786        Value::Array(items) => {
787            for item in items {
788                collect_generic_text_records(item, records);
789            }
790        }
791        Value::String(text) => push_record(records, "text", text),
792        _ => {}
793    }
794}
795
796fn collect_value_for_text_key(key: &str, value: &Value, records: &mut Vec<TextRecord>) {
797    match value {
798        Value::String(text) => {
799            let text = if key == "html" {
800                html_to_text(text)
801            } else {
802                text.clone()
803            };
804            push_record(records, normalized_kind(key), &text);
805        }
806        Value::Array(items) => {
807            for item in items {
808                match item {
809                    Value::String(text) => push_record(records, normalized_kind(key), text),
810                    Value::Object(_) => collect_generic_text_records(item, records),
811                    _ => {}
812                }
813            }
814        }
815        Value::Object(_) => collect_generic_text_records(value, records),
816        _ => {}
817    }
818}
819
820fn push_record(records: &mut Vec<TextRecord>, kind: &str, text: &str) {
821    let text = clean_text(text);
822    if !text.is_empty() {
823        records.push(TextRecord {
824            kind: kind.to_owned(),
825            text,
826        });
827    }
828}
829
830fn build_document(source: &Source, engine_name: &str, mut pages: Vec<Page>) -> Document {
831    if pages.is_empty() {
832        pages.push(Page {
833            number: 1,
834            width: None,
835            height: None,
836            rotation: None,
837            bbox: None,
838            blocks: Vec::new(),
839            images: Vec::new(),
840            assets: Vec::new(),
841            warnings: Vec::new(), ..Default::default()
842        });
843    }
844
845    let (character_count, word_count, block_count) = document_counts(&pages);
846    let title = first_title(&pages);
847    Document {
848        schema_version: SCHEMA_VERSION.to_owned(),
849        metadata: Metadata {
850            format: source.format.clone(),
851            engine: engine_name.to_owned(),
852            source: source.path.clone(),
853            title,
854            character_count,
855            word_count,
856            block_count,
857            file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
858            pdf_version: None,
859            encrypted: false,
860        },
861        pages,
862        assets: Vec::new(),
863        warnings: Vec::new(),
864    }
865}
866
867fn document_counts(pages: &[Page]) -> (usize, usize, usize) {
868    let mut character_count = 0;
869    let mut word_count = 0;
870    let mut block_count = 0;
871    for page in pages {
872        for block in &page.blocks {
873            let text = block_text(block);
874            character_count += text.chars().count();
875            word_count += text.split_whitespace().count();
876            block_count += 1;
877        }
878    }
879    (character_count, word_count, block_count)
880}
881
882fn first_title(pages: &[Page]) -> Option<String> {
883    pages.iter().find_map(|page| {
884        page.blocks.iter().find_map(|block| match block {
885            Block::Text(text) if text.kind == "title" => Some(text.text.clone()),
886            _ => None,
887        })
888    })
889}
890
891fn block_text(block: &Block) -> String {
892    match block {
893        Block::Text(text) => text.text.clone(),
894        Block::Table(table) => {
895            let mut rows = Vec::new();
896            if !table.headers.is_empty() {
897                rows.push(table.headers.join(" "));
898            }
899            rows.extend(table.rows.iter().map(|row| row.join(" ")));
900            rows.join("\n")
901        }
902        Block::Figure(figure) => figure.caption.clone().unwrap_or_default(),
903    }
904}
905
906fn block_bbox(block: &Block) -> Option<BBox> {
907    match block {
908        Block::Text(text) => text.bbox,
909        Block::Table(table) => table.bbox,
910        Block::Figure(figure) => figure.bbox,
911    }
912}
913
914fn html_table_rows(html: &str) -> Vec<Vec<String>> {
915    let lower = html.to_ascii_lowercase();
916    let mut rows = Vec::new();
917    let mut pos = 0;
918
919    while let Some(row_start_offset) = lower[pos..].find("<tr") {
920        let row_start = pos + row_start_offset;
921        let Some(open_end_offset) = lower[row_start..].find('>') else {
922            break;
923        };
924        let content_start = row_start + open_end_offset + 1;
925        let Some(close_offset) = lower[content_start..].find("</tr>") else {
926            break;
927        };
928        let content_end = content_start + close_offset;
929        let row = html_row_cells(&html[content_start..content_end]);
930        if !row.is_empty() {
931            rows.push(row);
932        }
933        pos = content_end + "</tr>".len();
934    }
935
936    rows
937}
938
939fn html_row_cells(row_html: &str) -> Vec<String> {
940    let lower = row_html.to_ascii_lowercase();
941    let mut cells = Vec::new();
942    let mut pos = 0;
943
944    while let Some((tag, cell_start_offset)) = next_cell_tag(&lower[pos..]) {
945        let cell_start = pos + cell_start_offset;
946        let Some(open_end_offset) = lower[cell_start..].find('>') else {
947            break;
948        };
949        let content_start = cell_start + open_end_offset + 1;
950        let close_tag = format!("</{tag}>");
951        let Some(close_offset) = lower[content_start..].find(&close_tag) else {
952            break;
953        };
954        let content_end = content_start + close_offset;
955        let text = clean_text(&html_to_text(&row_html[content_start..content_end]));
956        cells.push(text);
957        pos = content_end + close_tag.len();
958    }
959
960    cells
961}
962
/// Locates the earliest `<td` or `<th` opening tag in `input`.
///
/// `input` is searched as-is, so callers that want case-insensitive matching
/// must pass already-lowercased text.
///
/// Returns the tag name (`"td"` or `"th"`) together with the byte offset of
/// its `<`, or `None` when neither tag occurs.
///
/// A candidate only counts when the tag name ends there: `<th>`, `<th class=…>`
/// and a truncated trailing `<th` match, whereas longer names that merely
/// share the prefix (`<thead>`, `<th-custom>`) are skipped so they are not
/// mistaken for cells.
fn next_cell_tag(input: &str) -> Option<(&'static str, usize)> {
    /// Finds the first `opener` that is not the prefix of a longer tag name.
    fn find_opening(input: &str, opener: &str) -> Option<usize> {
        input
            .match_indices(opener)
            .map(|(index, _)| index)
            .find(|&index| {
                let name_continues = input
                    .as_bytes()
                    .get(index + opener.len())
                    .copied()
                    .map_or(false, |byte| {
                        byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b':')
                    });
                !name_continues
            })
    }

    let td = find_opening(input, "<td").map(|index| ("td", index));
    let th = find_opening(input, "<th").map(|index| ("th", index));
    // Whichever tag starts first wins; the two can never share an offset.
    td.into_iter().chain(th).min_by_key(|&(_, index)| index)
}
973
/// Splits parsed table rows into `(headers, body)`.
///
/// The first row becomes the header row and the remaining rows the body, in
/// their original order. An empty input yields two empty vectors.
fn split_table_rows(mut rows: Vec<Vec<String>>) -> (Vec<String>, Vec<Vec<String>>) {
    match rows.len() {
        0 => (Vec::new(), Vec::new()),
        _ => {
            // Detach everything after the first row; what is left is the header.
            let body = rows.split_off(1);
            let headers = rows.pop().unwrap_or_default();
            (headers, body)
        }
    }
}
981
982fn source_anchor(page_number: usize, bbox: Option<BBox>) -> SourceAnchor {
983    SourceAnchor {
984        page_number,
985        pdf_object_ids: Vec::new(),
986        bbox,
987        extraction_method: EXTRACTION_METHOD.to_owned(),
988    }
989}
990
991fn confidence() -> Confidence {
992    Confidence {
993        score: 0.9,
994        calibrated: false,
995    }
996}
997
998fn bbox_from_poly(value: &Value) -> Option<BBox> {
999    let points = value.as_array()?;
1000    if points.len() < 4 {
1001        return None;
1002    }
1003
1004    let mut xs = Vec::new();
1005    let mut ys = Vec::new();
1006    for pair in points.chunks(2) {
1007        if pair.len() != 2 {
1008            continue;
1009        }
1010        xs.push(pair[0].as_f64()? as f32);
1011        ys.push(pair[1].as_f64()? as f32);
1012    }
1013    if xs.is_empty() || ys.is_empty() {
1014        return None;
1015    }
1016    let min_x = xs.iter().copied().fold(f32::INFINITY, f32::min);
1017    let max_x = xs.iter().copied().fold(f32::NEG_INFINITY, f32::max);
1018    let min_y = ys.iter().copied().fold(f32::INFINITY, f32::min);
1019    let max_y = ys.iter().copied().fold(f32::NEG_INFINITY, f32::max);
1020    Some(BBox {
1021        x: min_x,
1022        y: min_y,
1023        width: max_x - min_x,
1024        height: max_y - min_y,
1025    })
1026}
1027
1028fn bbox_from_rect(value: &Value) -> Option<BBox> {
1029    let coordinates = value.as_array()?;
1030    if coordinates.len() < 4 {
1031        return None;
1032    }
1033    let left = coordinates[0].as_f64()? as f32;
1034    let top = coordinates[1].as_f64()? as f32;
1035    let right = coordinates[2].as_f64()? as f32;
1036    let bottom = coordinates[3].as_f64()? as f32;
1037    Some(BBox {
1038        x: left.min(right),
1039        y: top.min(bottom),
1040        width: (right - left).abs(),
1041        height: (bottom - top).abs(),
1042    })
1043}
1044
1045fn bbox_from_coco_rect(value: &Value) -> Option<BBox> {
1046    let coordinates = value.as_array()?;
1047    if coordinates.len() != 4 {
1048        return None;
1049    }
1050    Some(BBox {
1051        x: coordinates[0].as_f64()? as f32,
1052        y: coordinates[1].as_f64()? as f32,
1053        width: coordinates[2].as_f64()? as f32,
1054        height: coordinates[3].as_f64()? as f32,
1055    })
1056}
1057
1058fn inferred_page_bbox(blocks: &[Block]) -> Option<BBox> {
1059    let mut max_x = 0.0f32;
1060    let mut max_y = 0.0f32;
1061    let mut has_bbox = false;
1062    for block in blocks {
1063        let bbox = match block {
1064            Block::Text(text) => text.bbox,
1065            Block::Table(table) => table.bbox,
1066            Block::Figure(figure) => figure.bbox,
1067        };
1068        let Some(bbox) = bbox else {
1069            continue;
1070        };
1071        has_bbox = true;
1072        max_x = max_x.max(bbox.x + bbox.width);
1073        max_y = max_y.max(bbox.y + bbox.height);
1074    }
1075
1076    has_bbox.then_some(BBox {
1077        x: 0.0,
1078        y: 0.0,
1079        width: max_x,
1080        height: max_y,
1081    })
1082}
1083
1084fn page_bbox(width: Option<f32>, height: Option<f32>) -> Option<BBox> {
1085    Some(BBox {
1086        x: 0.0,
1087        y: 0.0,
1088        width: width?,
1089        height: height?,
1090    })
1091}
1092
1093fn order_value(object: &Map<String, Value>) -> f64 {
1094    object
1095        .get("order")
1096        .and_then(Value::as_f64)
1097        .unwrap_or(f64::INFINITY)
1098}
1099
1100fn numeric_field(object: &Map<String, Value>, key: &str) -> Option<f32> {
1101    object.get(key)?.as_f64().map(|value| value as f32)
1102}
1103
1104fn bool_field(object: &Map<String, Value>, key: &str) -> bool {
1105    object.get(key).and_then(Value::as_bool).unwrap_or(false)
1106}
1107
1108fn first_string_field<'a>(object: &'a Map<String, Value>, keys: &[&str]) -> Option<&'a str> {
1109    keys.iter()
1110        .find_map(|key| object.get(*key).and_then(Value::as_str))
1111}
1112
1113fn first_numeric_field(object: &Map<String, Value>, keys: &[&str]) -> Option<f32> {
1114    keys.iter().find_map(|key| numeric_field(object, key))
1115}
1116
1117fn first_bbox_field(object: &Map<String, Value>, keys: &[&str]) -> Option<BBox> {
1118    keys.iter().find_map(|key| {
1119        object
1120            .get(*key)
1121            .and_then(|value| bbox_from_rect(value).or_else(|| bbox_from_object(value)))
1122    })
1123}
1124
1125fn bbox_from_object(value: &Value) -> Option<BBox> {
1126    let object = value.as_object()?;
1127    if let (Some(left), Some(top), Some(right), Some(bottom)) = (
1128        first_numeric_field(object, &["x1", "left", "l"]),
1129        first_numeric_field(object, &["y1", "top", "t"]),
1130        first_numeric_field(object, &["x2", "right", "r"]),
1131        first_numeric_field(object, &["y2", "bottom", "b"]),
1132    ) {
1133        return Some(BBox {
1134            x: left.min(right),
1135            y: top.min(bottom),
1136            width: (right - left).abs(),
1137            height: (bottom - top).abs(),
1138        });
1139    }
1140
1141    let x = first_numeric_field(object, &["x", "left"])?;
1142    let y = first_numeric_field(object, &["y", "top"])?;
1143    let width = first_numeric_field(object, &["width", "w"])?;
1144    let height = first_numeric_field(object, &["height", "h"])?;
1145    Some(BBox {
1146        x,
1147        y,
1148        width,
1149        height,
1150    })
1151}
1152
1153fn value_key(value: &Value) -> String {
1154    match value {
1155        Value::String(text) => text.clone(),
1156        Value::Number(number) => number.to_string(),
1157        Value::Bool(boolean) => boolean.to_string(),
1158        _ => value.to_string(),
1159    }
1160}
1161
1162fn scalar_text(value: &Value) -> Option<String> {
1163    match value {
1164        Value::String(text) => Some(clean_text(text)),
1165        Value::Number(number) => Some(number.to_string()),
1166        Value::Bool(boolean) => Some(boolean.to_string()),
1167        _ => None,
1168    }
1169    .filter(|text| !text.is_empty())
1170}
1171
/// Collapses every run of whitespace in `text` into a single space and strips
/// leading and trailing whitespace.
fn clean_text(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first one.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
1175
/// Maps a JSON key to the block kind it stands for.
///
/// The body-text style keys listed below all become `"paragraph"`; any other
/// key is returned unchanged. Matching is exact and case-sensitive.
fn normalized_kind(key: &str) -> &str {
    const PARAGRAPH_ALIASES: &[&str] = &[
        "body_text",
        "full_text",
        "content",
        "body",
        "paragraphs",
        "sections",
    ];

    if PARAGRAPH_ALIASES.contains(&key) {
        "paragraph"
    } else {
        key
    }
}
1183
/// Reports whether `key` is eligible for recursion, i.e. it is not one of the
/// excluded keys listed below.
///
/// Matching is exact and case-sensitive.
fn should_recurse_generic_key(key: &str) -> bool {
    const EXCLUDED_KEYS: &[&str] = &[
        "id",
        "anno_id",
        "image",
        "image_path",
        "pdf",
        "pdf_path",
        "path",
        "url",
        "source",
        "metadata",
        "page_info",
        "category_type",
        "attribute",
    ];

    !EXCLUDED_KEYS.contains(&key)
}