Skip to main content

edgeparse_core/output/
legacy_json.rs

1//! Legacy-compatible JSON serializer.
2//!
3//! Produces JSON output with the legacy schema, including:
4//!   - Space-separated key names ("file name", "page number", "bounding box", ...)
5//!   - Array-style color format ("[0.0, 0.0, 0.0]" for black, "[r, g, b]" for RGB)
6//!   - Globally sequential integer IDs
7//!   - Element types: heading, paragraph, list, table, image, caption, header, footer
8//!   - BoundingBox as [left_x, bottom_y, right_x, top_y] float array
9
10use std::cell::Cell;
11
12use serde_json::{json, Map, Value};
13
14use crate::models::bbox::BoundingBox;
15use crate::models::chunks::ImageChunk;
16use crate::models::content::ContentElement;
17use crate::models::document::PdfDocument;
18use crate::models::enums::SemanticType;
19use crate::models::list::{ListItem, PDFList};
20use crate::models::semantic::{
21    SemanticCaption, SemanticFigure, SemanticHeaderOrFooter, SemanticHeading,
22    SemanticNumberHeading, SemanticParagraph, SemanticTable, SemanticTextNode,
23};
24use crate::models::table::{TableBorder, TableBorderCell, TableBorderRow, TableToken};
25use crate::EdgePdfError;
26
27// ---------------------------------------------------------------------------
28// Thread-local ID counter
29// ---------------------------------------------------------------------------
30
// Per-thread counter backing the sequential "id" fields of the legacy output.
// It starts at 1; `next_id` returns the current value and advances it, and
// `reset_ids` puts it back to 1.
thread_local! {
    static NEXT_ID: Cell<u64> = const { Cell::new(1) };
}
34
35fn next_id() -> u64 {
36    NEXT_ID.with(|c| {
37        let id = c.get();
38        c.set(id + 1);
39        id
40    })
41}
42
43fn reset_ids() {
44    NEXT_ID.with(|c| c.set(1));
45}
46
47// ---------------------------------------------------------------------------
48// Formatting helpers
49// ---------------------------------------------------------------------------
50
/// Remove a PDF font-subset tag from a font name
/// (e.g. "ATNKJE+NimbusRomNo9L-Medi" → "NimbusRomNo9L-Medi").
///
/// A tag is recognised only when the text before the first '+' is exactly six
/// ASCII uppercase letters and something follows the '+'. Any other name is
/// returned unchanged.
fn strip_font_prefix(name: &str) -> &str {
    match name.split_once('+') {
        Some((tag, rest))
            if tag.len() == 6
                && !rest.is_empty()
                && tag.bytes().all(|b| b.is_ascii_uppercase()) =>
        {
            rest
        }
        _ => name,
    }
}
61
/// Render a float so that it always shows a decimal point.
///
/// Values with no fractional part are printed with exactly one decimal
/// (0.0 → "0.0", 14.0 → "14.0"); everything else uses the default `Display`
/// output (9.963 → "9.963").
fn legacy_float_str(v: f64) -> String {
    if v.fract() != 0.0 {
        // Default formatting already includes the fractional digits.
        return v.to_string();
    }
    format!("{:.1}", v)
}
73
74/// Format text color as array string.
75/// Preserves original color space components (1=Gray, 3=RGB, 4=CMYK).
76fn text_color_string(color: &Option<Vec<f64>>) -> String {
77    match color {
78        Some(components) if !components.is_empty() => {
79            let parts: Vec<String> = components.iter().map(|v| legacy_float_str(*v)).collect();
80            format!("[{}]", parts.join(", "))
81        }
82        _ => String::new(),
83    }
84}
85
86/// Build a JSON array for [left_x, bottom_y, right_x, top_y].
87fn bbox_array(bbox: &BoundingBox) -> Value {
88    json!([bbox.left_x, bbox.bottom_y, bbox.right_x, bbox.top_y])
89}
90
91/// 1-based page number from a BoundingBox.
92fn page_num(bbox: &BoundingBox) -> u32 {
93    bbox.page_number.unwrap_or(1)
94}
95
96// ---------------------------------------------------------------------------
97// Text extraction
98// ---------------------------------------------------------------------------
99
100/// Concatenated text from a SemanticTextNode (columns → blocks → lines → chunks).
101fn node_text(node: &SemanticTextNode) -> String {
102    node.value().trim().to_string()
103}
104
105/// Extract (font_name, font_size, text_color_str) from a SemanticTextNode.
106/// Falls back to the first non-empty text chunk if the node's fields are None.
107fn text_node_style(node: &SemanticTextNode) -> (String, f64, String) {
108    // Try direct fields first
109    let font_name = node.font_name.clone();
110    let font_size = node.font_size;
111    let text_color = &node.text_color;
112
113    // If any is missing, extract from the first non-empty text chunk
114    if font_name.is_none() || font_size.is_none() || text_color.is_none() {
115        if let Some(first_chunk) = node
116            .columns
117            .iter()
118            .flat_map(|c| c.text_blocks.iter())
119            .flat_map(|b| b.text_lines.iter())
120            .flat_map(|l| l.text_chunks.iter())
121            .find(|c| !c.value.trim().is_empty())
122        {
123            let raw_font = font_name
124                .as_deref()
125                .unwrap_or(first_chunk.font_name.as_str());
126            let resolved_font = strip_font_prefix(raw_font).to_string();
127            let resolved_size = font_size.unwrap_or(first_chunk.font_size);
128            let resolved_color = if text_color.is_some() {
129                text_color_string(text_color)
130            } else {
131                format_font_color(&first_chunk.font_color)
132            };
133            return (resolved_font, resolved_size, resolved_color);
134        }
135    }
136
137    (
138        font_name
139            .as_deref()
140            .map(strip_font_prefix)
141            .unwrap_or("")
142            .to_string(),
143        font_size.unwrap_or(0.0),
144        text_color_string(text_color),
145    )
146}
147
148/// Extract text from a ListItem.
149/// First tries semantic contents (preferred), then falls back to raw body tokens.
150fn list_item_text(item: &ListItem) -> String {
151    if !item.contents.is_empty() {
152        let parts: Vec<String> = item.contents.iter().filter_map(element_text_str).collect();
153        if !parts.is_empty() {
154            return parts.join(" ").trim().to_string();
155        }
156    }
157    // Fallback: raw body content tokens
158    table_tokens_text(&item.body.content)
159}
160
161/// Extract text from raw TableTokenRows.
162fn table_tokens_text(rows: &[Vec<TableToken>]) -> String {
163    rows.iter()
164        .flat_map(|row| row.iter())
165        .map(|t| t.base.value.as_str())
166        .collect::<Vec<_>>()
167        .join(" ")
168        .split_whitespace()
169        .collect::<Vec<_>>()
170        .join(" ")
171}
172
173/// Extract text string from a ContentElement if it's a text-bearing type.
174fn element_text_str(el: &ContentElement) -> Option<String> {
175    match el {
176        ContentElement::Paragraph(p) => Some(node_text(&p.base)),
177        ContentElement::Heading(h) => Some(node_text(&h.base.base)),
178        ContentElement::NumberHeading(nh) => Some(node_text(&nh.base.base.base)),
179        ContentElement::Caption(c) => Some(node_text(&c.base)),
180        // Raw text elements from list_detector and early pipeline stages
181        ContentElement::TextLine(l) => {
182            let s = l.value().trim().to_string();
183            if s.is_empty() {
184                None
185            } else {
186                Some(s)
187            }
188        }
189        ContentElement::TextBlock(b) => {
190            let s = b.value().trim().to_string();
191            if s.is_empty() {
192                None
193            } else {
194                Some(s)
195            }
196        }
197        ContentElement::TextChunk(c) => {
198            let s = c.value.trim().to_string();
199            if s.is_empty() {
200                None
201            } else {
202                Some(s)
203            }
204        }
205        _ => None,
206    }
207}
208
209/// Extract (font_name, font_size, text_color) from the first text-bearing element.
210fn list_item_style(item: &ListItem) -> (String, f64, String) {
211    // Try semantic contents first
212    for el in &item.contents {
213        if let Some((font, size, color)) = element_style(el) {
214            return (font, size, color);
215        }
216    }
217    // Try body tokens first chunk
218    let first_chunk = item.body.content.iter().flat_map(|row| row.iter()).next();
219    if let Some(token) = first_chunk {
220        let color_str = format_font_color(&token.base.font_color);
221        return (
222            strip_font_prefix(&token.base.font_name).to_string(),
223            token.base.font_size,
224            color_str,
225        );
226    }
227    (String::new(), 0.0, "[0.0, 0.0, 0.0]".to_string())
228}
229
230/// Extract style from a ContentElement.
231fn element_style(el: &ContentElement) -> Option<(String, f64, String)> {
232    match el {
233        ContentElement::Paragraph(p) => Some(text_node_style(&p.base)),
234        ContentElement::Heading(h) => Some(text_node_style(&h.base.base)),
235        ContentElement::NumberHeading(nh) => Some(text_node_style(&nh.base.base.base)),
236        ContentElement::Caption(c) => Some(text_node_style(&c.base)),
237        // Raw text elements from list_detector
238        ContentElement::TextLine(l) => {
239            if let Some(chunk) = l.text_chunks.iter().find(|c| !c.value.trim().is_empty()) {
240                let font = strip_font_prefix(&chunk.font_name).to_string();
241                let color = format_font_color(&chunk.font_color);
242                Some((font, chunk.font_size, color))
243            } else {
244                None
245            }
246        }
247        ContentElement::TextBlock(b) => {
248            if let Some(chunk) = b
249                .text_lines
250                .iter()
251                .flat_map(|l| l.text_chunks.iter())
252                .find(|c| !c.value.trim().is_empty())
253            {
254                let font = strip_font_prefix(&chunk.font_name).to_string();
255                let color = format_font_color(&chunk.font_color);
256                Some((font, chunk.font_size, color))
257            } else {
258                None
259            }
260        }
261        ContentElement::TextChunk(c) => {
262            if !c.value.trim().is_empty() {
263                let font = strip_font_prefix(&c.font_name).to_string();
264                let color = format_font_color(&c.font_color);
265                Some((font, c.font_size, color))
266            } else {
267                None
268            }
269        }
270        _ => None,
271    }
272}
273
274/// Convert a font color string to array notation.
275/// Handles both new format "[0.0, 0.0, 0.0]" (pass-through) and legacy hex "#RRGGBB".
276fn format_font_color(color: &str) -> String {
277    // New format: already in array notation
278    if color.starts_with('[') {
279        return color.to_string();
280    }
281    // Legacy hex format
282    let hex = color.trim_start_matches('#');
283    if hex.len() == 6 {
284        if let (Ok(r), Ok(g), Ok(b)) = (
285            u8::from_str_radix(&hex[0..2], 16),
286            u8::from_str_radix(&hex[2..4], 16),
287            u8::from_str_radix(&hex[4..6], 16),
288        ) {
289            let rf = r as f64 / 255.0;
290            let gf = g as f64 / 255.0;
291            let bf = b as f64 / 255.0;
292            return text_color_string(&Some(vec![rf, gf, bf]));
293        }
294    }
295    String::new()
296}
297
298// ---------------------------------------------------------------------------
299// Heading level name
300// ---------------------------------------------------------------------------
301
/// Map a heading level integer to its legacy level name.
///
/// Levels 1–5 map to "Title", "Subtitle", "Heading1", "Heading2" and
/// "Heading3". Every other value, including 0, maps to "Heading4".
fn heading_level_name(level: u32) -> &'static str {
    const NAMED_LEVELS: [&str; 5] = ["Title", "Subtitle", "Heading1", "Heading2", "Heading3"];

    level
        .checked_sub(1)
        .and_then(|idx| NAMED_LEVELS.get(idx as usize))
        .copied()
        .unwrap_or("Heading4")
}
313
314// ---------------------------------------------------------------------------
315// Numbering style mapping
316// ---------------------------------------------------------------------------
317
/// Map a raw numbering style token to a human-readable description.
///
/// The trimmed token is classified in two steps:
///
/// 1. Exact keywords: "bullet" → "bullets", "letter" → "letters",
///    "roman" → "roman numerals". These are checked first because the
///    character heuristics below would otherwise misread them ("roman"
///    contains an 'a' and would be labelled "letters").
/// 2. Character heuristics on marker samples: a bullet-like character
///    ('•', '-', '*', or a lone "–") → "bullets"; an 'a'/'A' → "letters";
///    an 'i'/'I' → "roman numerals".
///
/// Anything else is reported as "arabic numbers".
fn numbering_style_label(raw: &str) -> &str {
    let r = raw.trim();

    // Step 1: explicit style keywords take precedence over heuristics.
    match r {
        "bullet" => return "bullets",
        "letter" => return "letters",
        "roman" => return "roman numerals",
        _ => {}
    }

    // Step 2: infer the style from the characters of a sample marker.
    if r.chars().any(|c| matches!(c, '•' | '-' | '*')) || r == "–" {
        "bullets"
    } else if r.contains('a') || r.contains('A') {
        "letters"
    } else if r.contains('i') || r.contains('I') {
        "roman numerals"
    } else {
        "arabic numbers"
    }
}
331
332// ---------------------------------------------------------------------------
333// Element converters
334// ---------------------------------------------------------------------------
335
336fn paragraph_to_legacy(para: &SemanticParagraph) -> Value {
337    let node = &para.base;
338    let (font, font_size, color) = text_node_style(node);
339    let mut obj = Map::new();
340    obj.insert("type".into(), json!("paragraph"));
341    obj.insert("id".into(), json!(next_id()));
342    obj.insert("page number".into(), json!(page_num(&node.bbox)));
343    obj.insert("bounding box".into(), bbox_array(&node.bbox));
344    obj.insert("font".into(), json!(font));
345    obj.insert("font size".into(), json!(font_size));
346    obj.insert("text color".into(), json!(color));
347    obj.insert("content".into(), json!(node_text(node)));
348    Value::Object(obj)
349}
350
351fn heading_to_legacy(heading: &SemanticHeading) -> Value {
352    let node = &heading.base.base;
353    let (font, font_size, color) = text_node_style(node);
354    let level_num = heading.heading_level.unwrap_or(3);
355    let level_name = heading_level_name(level_num);
356    let mut obj = Map::new();
357    obj.insert("type".into(), json!("heading"));
358    obj.insert("id".into(), json!(next_id()));
359    obj.insert("level".into(), json!(level_name));
360    obj.insert("page number".into(), json!(page_num(&node.bbox)));
361    obj.insert("bounding box".into(), bbox_array(&node.bbox));
362    obj.insert("heading level".into(), json!(level_num));
363    obj.insert("font".into(), json!(font));
364    obj.insert("font size".into(), json!(font_size));
365    obj.insert("text color".into(), json!(color));
366    obj.insert("content".into(), json!(node_text(node)));
367    Value::Object(obj)
368}
369
370fn number_heading_to_legacy(nh: &SemanticNumberHeading) -> Value {
371    heading_to_legacy(&nh.base)
372}
373
374fn caption_to_legacy(cap: &SemanticCaption) -> Value {
375    let node = &cap.base;
376    let (font, font_size, color) = text_node_style(node);
377    let mut obj = Map::new();
378    obj.insert("type".into(), json!("caption"));
379    obj.insert("id".into(), json!(next_id()));
380    obj.insert("page number".into(), json!(page_num(&node.bbox)));
381    obj.insert("bounding box".into(), bbox_array(&node.bbox));
382    if let Some(linked_id) = cap.linked_content_id {
383        obj.insert("linked content id".into(), json!(linked_id));
384    }
385    obj.insert("font".into(), json!(font));
386    obj.insert("font size".into(), json!(font_size));
387    obj.insert("text color".into(), json!(color));
388    obj.insert("content".into(), json!(node_text(node)));
389    Value::Object(obj)
390}
391
392fn header_footer_to_legacy(hf: &SemanticHeaderOrFooter, stem: &str, img_idx: &mut u64) -> Value {
393    let type_str = if hf.semantic_type == SemanticType::Header {
394        "header"
395    } else {
396        "footer"
397    };
398    let kids: Vec<Value> = hf
399        .contents
400        .iter()
401        .flat_map(|el| elements_to_legacy(el, stem, img_idx))
402        .collect();
403    let mut obj = Map::new();
404    obj.insert("type".into(), json!(type_str));
405    obj.insert("id".into(), json!(next_id()));
406    obj.insert("page number".into(), json!(page_num(&hf.bbox)));
407    obj.insert("bounding box".into(), bbox_array(&hf.bbox));
408    obj.insert("kids".into(), json!(kids));
409    Value::Object(obj)
410}
411
412fn image_to_legacy(img: &ImageChunk, stem: &str, img_idx: &mut u64) -> Value {
413    *img_idx += 1;
414    let source = format!("{}_images/imageFile{}.png", stem, img_idx);
415    let mut obj = Map::new();
416    obj.insert("type".into(), json!("image"));
417    obj.insert("id".into(), json!(next_id()));
418    obj.insert("page number".into(), json!(page_num(&img.bbox)));
419    obj.insert("bounding box".into(), bbox_array(&img.bbox));
420    obj.insert("source".into(), json!(source));
421    Value::Object(obj)
422}
423
424fn figure_to_legacy(fig: &SemanticFigure, stem: &str, img_idx: &mut u64) -> Vec<Value> {
425    if fig.images.is_empty() {
426        // No embedded images — emit a single image using the figure bbox
427        *img_idx += 1;
428        let source = format!("{}_images/imageFile{}.png", stem, img_idx);
429        let mut obj = Map::new();
430        obj.insert("type".into(), json!("image"));
431        obj.insert("id".into(), json!(next_id()));
432        obj.insert("page number".into(), json!(page_num(&fig.bbox)));
433        obj.insert("bounding box".into(), bbox_array(&fig.bbox));
434        obj.insert("source".into(), json!(source));
435        vec![Value::Object(obj)]
436    } else {
437        // Each image chunk uses the figure's page bbox (since ImageChunk bbox is pixel-space)
438        fig.images
439            .iter()
440            .map(|_img| {
441                *img_idx += 1;
442                let source = format!("{}_images/imageFile{}.png", stem, img_idx);
443                let mut obj = Map::new();
444                obj.insert("type".into(), json!("image"));
445                obj.insert("id".into(), json!(next_id()));
446                obj.insert("page number".into(), json!(page_num(&fig.bbox)));
447                obj.insert("bounding box".into(), bbox_array(&fig.bbox));
448                obj.insert("source".into(), json!(source));
449                Value::Object(obj)
450            })
451            .collect()
452    }
453}
454
455fn list_item_to_legacy(item: &ListItem, stem: &str, img_idx: &mut u64) -> Value {
456    let (font, font_size, color) = list_item_style(item);
457    let text = list_item_text(item);
458    // Nested list items go in kids
459    let kids: Vec<Value> = item
460        .contents
461        .iter()
462        .filter(|e| matches!(e, ContentElement::List(_)))
463        .flat_map(|el| elements_to_legacy(el, stem, img_idx))
464        .collect();
465    let mut obj = Map::new();
466    obj.insert("type".into(), json!("list item"));
467    obj.insert("id".into(), json!(next_id()));
468    obj.insert("page number".into(), json!(page_num(&item.bbox)));
469    obj.insert("bounding box".into(), bbox_array(&item.bbox));
470    obj.insert("font".into(), json!(font));
471    obj.insert("font size".into(), json!(font_size));
472    obj.insert("text color".into(), json!(color));
473    obj.insert("content".into(), json!(text));
474    obj.insert("kids".into(), json!(kids));
475    Value::Object(obj)
476}
477
478fn list_to_legacy(list: &PDFList, stem: &str, img_idx: &mut u64) -> Value {
479    let numbering = list
480        .numbering_style
481        .as_deref()
482        .map(numbering_style_label)
483        .unwrap_or("arabic numbers");
484    let num_items = list.list_items.len();
485    let next_list_id_val = list.next_list_id.unwrap_or(0);
486    let prev_list_id_val = list.previous_list_id.unwrap_or(0);
487    // Use "1" as the default nesting level (convention for top-level lists)
488    let level = "1".to_string();
489
490    let list_items: Vec<Value> = list
491        .list_items
492        .iter()
493        .map(|item| list_item_to_legacy(item, stem, img_idx))
494        .collect();
495
496    let mut obj = Map::new();
497    obj.insert("type".into(), json!("list"));
498    obj.insert("id".into(), json!(next_id()));
499    obj.insert("level".into(), json!(level));
500    obj.insert("page number".into(), json!(page_num(&list.bbox)));
501    obj.insert("bounding box".into(), bbox_array(&list.bbox));
502    obj.insert("numbering style".into(), json!(numbering));
503    obj.insert("number of list items".into(), json!(num_items));
504    obj.insert("next list id".into(), json!(next_list_id_val));
505    obj.insert("previous list id".into(), json!(prev_list_id_val));
506    obj.insert("list items".into(), json!(list_items));
507    Value::Object(obj)
508}
509
510fn table_cell_to_legacy(cell: &TableBorderCell, stem: &str, img_idx: &mut u64) -> Value {
511    let kids: Vec<Value> = cell
512        .contents
513        .iter()
514        .flat_map(|el| elements_to_legacy(el, stem, img_idx))
515        .collect();
516    let mut obj = Map::new();
517    obj.insert("type".into(), json!("table cell"));
518    obj.insert("page number".into(), json!(page_num(&cell.bbox)));
519    obj.insert("bounding box".into(), bbox_array(&cell.bbox));
520    obj.insert("row number".into(), json!(cell.row_number + 1));
521    obj.insert("column number".into(), json!(cell.col_number + 1));
522    obj.insert("row span".into(), json!(cell.row_span));
523    obj.insert("column span".into(), json!(cell.col_span));
524    obj.insert("kids".into(), json!(kids));
525    Value::Object(obj)
526}
527
528fn table_row_to_legacy(row: &TableBorderRow, stem: &str, img_idx: &mut u64) -> Value {
529    let cells: Vec<Value> = row
530        .cells
531        .iter()
532        .map(|cell| table_cell_to_legacy(cell, stem, img_idx))
533        .collect();
534    let mut obj = Map::new();
535    obj.insert("type".into(), json!("table row"));
536    obj.insert("row number".into(), json!(row.row_number + 1));
537    obj.insert("cells".into(), json!(cells));
538    Value::Object(obj)
539}
540
541/// Check whether a detected TableBorder is likely a false positive.
542///
543/// The reference table detector uses tagged-PDF structure, so it
544/// avoids outputting container rectangles, thin rule lines, and other
545/// geometric noise that the geometric border detector picks up.  We apply
546/// three conservative heuristic filters to match the reference behaviour:
547///
548/// 1. **Large single-cell table**: a 1×1 grid whose bounding box is much
549///    bigger than a typical checkbox marker (≈15×8 pt) is a container rect.
550/// 2. **Small multi-cell table**: a multi-cell grid whose height is < 30 pt
551///    is almost certainly noise inside a figure or code block.
552/// 3. **Thin horizontal strip**: width/height ratio > 8 with height < 25 pt
553///    indicates a horizontal rule that accidentally forms a closed rectangle.
554fn is_false_positive_table(tb: &TableBorder) -> bool {
555    let width = tb.bbox.right_x - tb.bbox.left_x;
556    let height = tb.bbox.top_y - tb.bbox.bottom_y;
557    let num_cells = tb.num_rows * tb.num_columns;
558
559    // Rule 1: Large single-cell container
560    if num_cells == 1 && (width > 25.0 || height > 14.5) {
561        return true;
562    }
563    // Rule 2: Small multi-cell table (noise from figures/code blocks)
564    if num_cells > 1 && height < 30.0 {
565        return true;
566    }
567    // Rule 3: Thin horizontal strip (horizontal rule detected as table)
568    if height > 0.0 && width / height > 8.0 && height < 25.0 {
569        return true;
570    }
571    false
572}
573
574fn table_border_to_legacy(tb: &TableBorder, stem: &str, img_idx: &mut u64) -> Option<Value> {
575    if is_false_positive_table(tb) {
576        return None;
577    }
578
579    let level = tb.level.clone().unwrap_or_else(|| "0".to_string());
580    let next_table_id: u64 = 0; // Rust doesn't have next_table_id linkage in schema
581    let rows: Vec<Value> = tb
582        .rows
583        .iter()
584        .map(|row| table_row_to_legacy(row, stem, img_idx))
585        .collect();
586
587    let mut obj = Map::new();
588    obj.insert("type".into(), json!("table"));
589    obj.insert("id".into(), json!(next_id()));
590    obj.insert("level".into(), json!(level));
591    obj.insert("page number".into(), json!(page_num(&tb.bbox)));
592    obj.insert("bounding box".into(), bbox_array(&tb.bbox));
593    obj.insert("number of rows".into(), json!(tb.num_rows));
594    obj.insert("number of columns".into(), json!(tb.num_columns));
595    obj.insert("next table id".into(), json!(next_table_id));
596    obj.insert("rows".into(), json!(rows));
597    Some(Value::Object(obj))
598}
599
600fn semantic_table_to_legacy(st: &SemanticTable, stem: &str, img_idx: &mut u64) -> Option<Value> {
601    table_border_to_legacy(&st.table_border, stem, img_idx)
602}
603
604/// Convert a Paragraph marked as Caption to legacy JSON caption format.
605fn paragraph_as_caption_to_legacy(para: &SemanticParagraph) -> Value {
606    let node = &para.base;
607    let (font, font_size, color) = text_node_style(node);
608    let mut obj = Map::new();
609    obj.insert("type".into(), json!("caption"));
610    obj.insert("id".into(), json!(next_id()));
611    obj.insert("page number".into(), json!(page_num(&node.bbox)));
612    obj.insert("bounding box".into(), bbox_array(&node.bbox));
613    obj.insert("font".into(), json!(font));
614    obj.insert("font size".into(), json!(font_size));
615    obj.insert("text color".into(), json!(color));
616    obj.insert("content".into(), json!(node_text(node)));
617    Value::Object(obj)
618}
619
620/// Convert a Heading marked as Caption to legacy JSON caption format.
621fn heading_as_caption_to_legacy(heading: &SemanticHeading) -> Value {
622    let node = &heading.base.base;
623    let (font, font_size, color) = text_node_style(node);
624    let mut obj = Map::new();
625    obj.insert("type".into(), json!("caption"));
626    obj.insert("id".into(), json!(next_id()));
627    obj.insert("page number".into(), json!(page_num(&node.bbox)));
628    obj.insert("bounding box".into(), bbox_array(&node.bbox));
629    obj.insert("font".into(), json!(font));
630    obj.insert("font size".into(), json!(font_size));
631    obj.insert("text color".into(), json!(color));
632    obj.insert("content".into(), json!(node_text(node)));
633    Value::Object(obj)
634}
635
636// ---------------------------------------------------------------------------
637// Dispatcher: ContentElement → Vec<Value>
638// ---------------------------------------------------------------------------
639
640/// Convert a single ContentElement to zero or more legacy JSON values.
641/// Returns a Vec because a Figure may expand to multiple images.
642fn elements_to_legacy(el: &ContentElement, stem: &str, img_idx: &mut u64) -> Vec<Value> {
643    match el {
644        ContentElement::Paragraph(p) => {
645            // Skip empty/whitespace-only paragraphs (never emitted in legacy output).
646            if p.base.is_empty() {
647                return vec![];
648            }
649            // Check if this paragraph has been marked as a Caption by caption_linker
650            if p.base.semantic_type == SemanticType::Caption {
651                vec![paragraph_as_caption_to_legacy(p)]
652            } else {
653                vec![paragraph_to_legacy(p)]
654            }
655        }
656        ContentElement::Heading(h) => {
657            // Skip empty/whitespace-only headings.
658            if h.base.base.is_empty() {
659                return vec![];
660            }
661            // Check if heading was marked as Caption
662            if h.base.base.semantic_type == SemanticType::Caption {
663                vec![heading_as_caption_to_legacy(h)]
664            } else {
665                vec![heading_to_legacy(h)]
666            }
667        }
668        ContentElement::NumberHeading(nh) => vec![number_heading_to_legacy(nh)],
669        ContentElement::Caption(c) => vec![caption_to_legacy(c)],
670        ContentElement::HeaderFooter(hf) => vec![header_footer_to_legacy(hf, stem, img_idx)],
671        ContentElement::Figure(fig) => figure_to_legacy(fig, stem, img_idx),
672        ContentElement::Image(img) => vec![image_to_legacy(img, stem, img_idx)],
673        ContentElement::List(l) => vec![list_to_legacy(l, stem, img_idx)],
674        ContentElement::Table(st) => semantic_table_to_legacy(st, stem, img_idx)
675            .into_iter()
676            .collect(),
677        ContentElement::TableBorder(tb) => table_border_to_legacy(tb, stem, img_idx)
678            .into_iter()
679            .collect(),
680        // Skip raw low-level elements not exposed in legacy output
681        ContentElement::TextChunk(_)
682        | ContentElement::TextLine(_)
683        | ContentElement::TextBlock(_)
684        | ContentElement::Line(_)
685        | ContentElement::LineArt(_)
686        | ContentElement::Formula(_)
687        | ContentElement::Picture(_) => vec![],
688    }
689}
690
691// ---------------------------------------------------------------------------
692// Top-level serializer
693// ---------------------------------------------------------------------------
694
695/// Convert a PdfDocument to a legacy-schema JSON `Value`.
696///
697/// `stem` is the PDF file stem (without extension), used for constructing
698/// image source paths like `"{stem}_images/imageFile1.png"`.
699pub fn to_legacy_json_value(doc: &PdfDocument, stem: &str) -> Value {
700    reset_ids();
701    let mut img_idx: u64 = 0;
702
703    let kids: Vec<Value> = doc
704        .kids
705        .iter()
706        .flat_map(|el| elements_to_legacy(el, stem, &mut img_idx))
707        .collect();
708
709    let mut obj = Map::new();
710    obj.insert("file name".into(), json!(doc.file_name));
711    obj.insert("number of pages".into(), json!(doc.number_of_pages));
712    obj.insert(
713        "author".into(),
714        doc.author.as_deref().map_or(Value::Null, |s| json!(s)),
715    );
716    obj.insert(
717        "title".into(),
718        doc.title.as_deref().map_or(Value::Null, |s| json!(s)),
719    );
720    obj.insert(
721        "creation date".into(),
722        doc.creation_date
723            .as_deref()
724            .map_or(Value::Null, |s| json!(s)),
725    );
726    obj.insert(
727        "modification date".into(),
728        doc.modification_date
729            .as_deref()
730            .map_or(Value::Null, |s| json!(s)),
731    );
732    obj.insert("kids".into(), json!(kids));
733
734    Value::Object(obj)
735}
736
737/// Serialize a PdfDocument to a legacy-compatible JSON string.
738///
739/// # Errors
740/// Returns `EdgePdfError::OutputError` if serialization fails.
741pub fn to_legacy_json_string(doc: &PdfDocument, stem: &str) -> Result<String, EdgePdfError> {
742    let value = to_legacy_json_value(doc, stem);
743    serde_json::to_string_pretty(&value)
744        .map_err(|e| EdgePdfError::OutputError(format!("Legacy JSON serialization failed: {}", e)))
745}
746
747// ---------------------------------------------------------------------------
748// Tests
749// ---------------------------------------------------------------------------
750
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::bbox::BoundingBox;
    use crate::models::enums::SemanticType;
    use crate::models::semantic::{SemanticParagraph, SemanticTextNode};
    use crate::models::text::TextColumn;

    /// Shorthand for a `BoundingBox` placed on the given page.
    fn make_bbox(page: u32, left: f64, bottom: f64, right: f64, top: f64) -> BoundingBox {
        BoundingBox::new(Some(page), left, bottom, right, top)
    }

    /// Build a paragraph-typed text node whose single column holds one block,
    /// one line and one chunk containing `text`; every level shares `bbox`.
    fn make_text_node(bbox: BoundingBox, text: &str) -> SemanticTextNode {
        use crate::models::chunks::TextChunk;
        use crate::models::enums::{PdfLayer, TextFormat, TextType};
        use crate::models::text::{TextBlock, TextLine};

        // Innermost level: the single chunk carrying the text itself.
        let only_chunk = TextChunk {
            value: text.to_owned(),
            bbox: bbox.clone(),
            font_name: String::from("TestFont"),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: String::from("#000000"),
            contrast_ratio: 21.0,
            symbol_ends: Vec::new(),
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        };

        // One line wrapping that chunk.
        let only_line = TextLine {
            bbox: bbox.clone(),
            index: None,
            level: None,
            font_size: 12.0,
            base_line: 2.0,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_chunks: vec![only_chunk],
            is_line_start: true,
            is_line_end: true,
            is_list_line: false,
            connected_line_art_label: None,
        };

        // One block wrapping that line.
        let only_block = TextBlock {
            bbox: bbox.clone(),
            index: None,
            level: None,
            font_size: 12.0,
            base_line: 2.0,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_lines: vec![only_line],
            has_start_line: true,
            has_end_line: true,
            text_alignment: None,
        };

        // One column wrapping that block.
        let only_column = TextColumn {
            bbox: bbox.clone(),
            index: None,
            level: None,
            font_size: 12.0,
            base_line: 2.0,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_blocks: vec![only_block],
        };

        // The node takes ownership of the original bbox; the nested levels hold clones.
        SemanticTextNode {
            columns: vec![only_column],
            bbox,
            index: None,
            level: None,
            semantic_type: SemanticType::Paragraph,
            correct_semantic_score: None,
            font_weight: Some(400.0),
            font_size: Some(12.0),
            text_color: Some(vec![0.0, 0.0, 0.0]),
            italic_angle: None,
            font_name: Some(String::from("TestFont")),
            text_format: None,
            max_font_size: None,
            background_color: None,
            is_hidden_text: false,
        }
    }

    /// An empty document still emits the space-separated top-level keys.
    #[test]
    fn test_empty_document() {
        let empty = PdfDocument::new(String::from("test.pdf"));
        let rendered = to_legacy_json_string(&empty, "test").unwrap();

        for key in ["\"file name\"", "\"number of pages\"", "\"kids\""] {
            assert!(rendered.contains(key), "missing key {}", key);
        }
        // The snake_case spelling must not leak into the legacy output.
        assert!(!rendered.contains("number_of_pages"));
    }

    /// A paragraph serializes with the legacy key names and color format.
    #[test]
    fn test_paragraph_has_legacy_keys() {
        let para = SemanticParagraph {
            base: make_text_node(make_bbox(1, 54.0, 100.0, 300.0, 120.0), "Hello world"),
            enclosed_top: false,
            enclosed_bottom: false,
            indentation: 0,
        };
        let rendered = serde_json::to_string_pretty(&paragraph_to_legacy(&para)).unwrap();

        let expected_fragments = [
            "\"type\"",
            "\"page number\"",
            "\"bounding box\"",
            "\"font size\"",
            "\"text color\"",
            "\"content\"",
            "\"paragraph\"",
            "[0.0, 0.0, 0.0]",
        ];
        for fragment in expected_fragments {
            assert!(rendered.contains(fragment), "missing fragment {}", fragment);
        }
    }

    /// Black, white and absent colors map to their expected string forms.
    #[test]
    fn test_text_color_grayscale() {
        let cases = [
            (Some(vec![0.0, 0.0, 0.0]), "[0.0, 0.0, 0.0]"),
            (Some(vec![1.0, 1.0, 1.0]), "[1.0, 1.0, 1.0]"),
            (None, ""),
        ];
        for (color, expected) in &cases {
            assert_eq!(text_color_string(color), *expected);
        }
    }

    /// A pure-red color renders as a bracketed list containing its components.
    #[test]
    fn test_text_color_rgb() {
        let rendered = text_color_string(&Some(vec![1.0, 0.0, 0.0]));
        for component in ["1.0", "0.0"] {
            assert!(rendered.contains(component));
        }
        assert!(rendered.starts_with('['));
        assert!(rendered.ends_with(']'));
    }

    /// The bbox array is ordered [left_x, bottom_y, right_x, top_y].
    #[test]
    fn test_bbox_array_order() {
        let bbox = make_bbox(1, 10.0, 20.0, 300.0, 400.0);
        let coords = match bbox_array(&bbox) {
            Value::Array(items) => items,
            _ => panic!("Expected array"),
        };

        // [left_x, bottom_y, right_x, top_y]
        let expected = [10.0, 20.0, 300.0, 400.0];
        for (idx, want) in expected.iter().enumerate() {
            assert_eq!(coords[idx].as_f64().unwrap(), *want);
        }
    }
}