1use std::cell::Cell;
11
12use serde_json::{json, Map, Value};
13
14use crate::models::bbox::BoundingBox;
15use crate::models::chunks::ImageChunk;
16use crate::models::content::ContentElement;
17use crate::models::document::PdfDocument;
18use crate::models::enums::SemanticType;
19use crate::models::list::{ListItem, PDFList};
20use crate::models::semantic::{
21 SemanticCaption, SemanticFigure, SemanticHeaderOrFooter, SemanticHeading,
22 SemanticNumberHeading, SemanticParagraph, SemanticTable, SemanticTextNode,
23};
24use crate::models::table::{TableBorder, TableBorderCell, TableBorderRow, TableToken};
25use crate::EdgePdfError;
26
// Per-thread counter backing the `id` field of emitted legacy JSON objects.
// It starts at 1.
thread_local!(static NEXT_ID: Cell<u64> = const { Cell::new(1) });
34
35fn next_id() -> u64 {
36 NEXT_ID.with(|c| {
37 let id = c.get();
38 c.set(id + 1);
39 id
40 })
41}
42
43fn reset_ids() {
44 NEXT_ID.with(|c| c.set(1));
45}
46
/// Drops a leading `XXXXXX+` tag (exactly six ASCII uppercase letters followed
/// by `+`) from a font name.
///
/// The name is returned unchanged when it does not start with such a tag, or
/// when nothing follows the `+`.
fn strip_font_prefix(name: &str) -> &str {
    match name.split_once('+') {
        Some((tag, rest))
            if tag.len() == 6
                && !rest.is_empty()
                && tag.bytes().all(|b| b.is_ascii_uppercase()) =>
        {
            rest
        }
        _ => name,
    }
}
61
/// Formats a float for the legacy output.
///
/// Values without a fractional part are printed with exactly one decimal
/// (`1.0`); every other value uses the default `Display` rendering.
fn legacy_float_str(v: f64) -> String {
    if v.fract() != 0.0 {
        return v.to_string();
    }
    format!("{:.1}", v)
}
73
74fn text_color_string(color: &Option<Vec<f64>>) -> String {
77 match color {
78 Some(components) if !components.is_empty() => {
79 let parts: Vec<String> = components.iter().map(|v| legacy_float_str(*v)).collect();
80 format!("[{}]", parts.join(", "))
81 }
82 _ => String::new(),
83 }
84}
85
86fn bbox_array(bbox: &BoundingBox) -> Value {
88 json!([bbox.left_x, bbox.bottom_y, bbox.right_x, bbox.top_y])
89}
90
91fn page_num(bbox: &BoundingBox) -> u32 {
93 bbox.page_number.unwrap_or(1)
94}
95
96fn node_text(node: &SemanticTextNode) -> String {
102 node.value().trim().to_string()
103}
104
105fn text_node_style(node: &SemanticTextNode) -> (String, f64, String) {
108 let font_name = node.font_name.clone();
110 let font_size = node.font_size;
111 let text_color = &node.text_color;
112
113 if font_name.is_none() || font_size.is_none() || text_color.is_none() {
115 if let Some(first_chunk) = node
116 .columns
117 .iter()
118 .flat_map(|c| c.text_blocks.iter())
119 .flat_map(|b| b.text_lines.iter())
120 .flat_map(|l| l.text_chunks.iter())
121 .find(|c| !c.value.trim().is_empty())
122 {
123 let raw_font = font_name
124 .as_deref()
125 .unwrap_or(first_chunk.font_name.as_str());
126 let resolved_font = strip_font_prefix(raw_font).to_string();
127 let resolved_size = font_size.unwrap_or(first_chunk.font_size);
128 let resolved_color = if text_color.is_some() {
129 text_color_string(text_color)
130 } else {
131 format_font_color(&first_chunk.font_color)
132 };
133 return (resolved_font, resolved_size, resolved_color);
134 }
135 }
136
137 (
138 font_name
139 .as_deref()
140 .map(strip_font_prefix)
141 .unwrap_or("")
142 .to_string(),
143 font_size.unwrap_or(0.0),
144 text_color_string(text_color),
145 )
146}
147
148fn list_item_text(item: &ListItem) -> String {
151 if !item.contents.is_empty() {
152 let parts: Vec<String> = item.contents.iter().filter_map(element_text_str).collect();
153 if !parts.is_empty() {
154 return parts.join(" ").trim().to_string();
155 }
156 }
157 table_tokens_text(&item.body.content)
159}
160
161fn table_tokens_text(rows: &[Vec<TableToken>]) -> String {
163 rows.iter()
164 .flat_map(|row| row.iter())
165 .map(|t| t.base.value.as_str())
166 .collect::<Vec<_>>()
167 .join(" ")
168 .split_whitespace()
169 .collect::<Vec<_>>()
170 .join(" ")
171}
172
173fn element_text_str(el: &ContentElement) -> Option<String> {
175 match el {
176 ContentElement::Paragraph(p) => Some(node_text(&p.base)),
177 ContentElement::Heading(h) => Some(node_text(&h.base.base)),
178 ContentElement::NumberHeading(nh) => Some(node_text(&nh.base.base.base)),
179 ContentElement::Caption(c) => Some(node_text(&c.base)),
180 ContentElement::TextLine(l) => {
182 let s = l.value().trim().to_string();
183 if s.is_empty() {
184 None
185 } else {
186 Some(s)
187 }
188 }
189 ContentElement::TextBlock(b) => {
190 let s = b.value().trim().to_string();
191 if s.is_empty() {
192 None
193 } else {
194 Some(s)
195 }
196 }
197 ContentElement::TextChunk(c) => {
198 let s = c.value.trim().to_string();
199 if s.is_empty() {
200 None
201 } else {
202 Some(s)
203 }
204 }
205 _ => None,
206 }
207}
208
209fn list_item_style(item: &ListItem) -> (String, f64, String) {
211 for el in &item.contents {
213 if let Some((font, size, color)) = element_style(el) {
214 return (font, size, color);
215 }
216 }
217 let first_chunk = item.body.content.iter().flat_map(|row| row.iter()).next();
219 if let Some(token) = first_chunk {
220 let color_str = format_font_color(&token.base.font_color);
221 return (
222 strip_font_prefix(&token.base.font_name).to_string(),
223 token.base.font_size,
224 color_str,
225 );
226 }
227 (String::new(), 0.0, "[0.0, 0.0, 0.0]".to_string())
228}
229
230fn element_style(el: &ContentElement) -> Option<(String, f64, String)> {
232 match el {
233 ContentElement::Paragraph(p) => Some(text_node_style(&p.base)),
234 ContentElement::Heading(h) => Some(text_node_style(&h.base.base)),
235 ContentElement::NumberHeading(nh) => Some(text_node_style(&nh.base.base.base)),
236 ContentElement::Caption(c) => Some(text_node_style(&c.base)),
237 ContentElement::TextLine(l) => {
239 if let Some(chunk) = l.text_chunks.iter().find(|c| !c.value.trim().is_empty()) {
240 let font = strip_font_prefix(&chunk.font_name).to_string();
241 let color = format_font_color(&chunk.font_color);
242 Some((font, chunk.font_size, color))
243 } else {
244 None
245 }
246 }
247 ContentElement::TextBlock(b) => {
248 if let Some(chunk) = b
249 .text_lines
250 .iter()
251 .flat_map(|l| l.text_chunks.iter())
252 .find(|c| !c.value.trim().is_empty())
253 {
254 let font = strip_font_prefix(&chunk.font_name).to_string();
255 let color = format_font_color(&chunk.font_color);
256 Some((font, chunk.font_size, color))
257 } else {
258 None
259 }
260 }
261 ContentElement::TextChunk(c) => {
262 if !c.value.trim().is_empty() {
263 let font = strip_font_prefix(&c.font_name).to_string();
264 let color = format_font_color(&c.font_color);
265 Some((font, c.font_size, color))
266 } else {
267 None
268 }
269 }
270 _ => None,
271 }
272}
273
274fn format_font_color(color: &str) -> String {
277 if color.starts_with('[') {
279 return color.to_string();
280 }
281 let hex = color.trim_start_matches('#');
283 if hex.len() == 6 {
284 if let (Ok(r), Ok(g), Ok(b)) = (
285 u8::from_str_radix(&hex[0..2], 16),
286 u8::from_str_radix(&hex[2..4], 16),
287 u8::from_str_radix(&hex[4..6], 16),
288 ) {
289 let rf = r as f64 / 255.0;
290 let gf = g as f64 / 255.0;
291 let bf = b as f64 / 255.0;
292 return text_color_string(&Some(vec![rf, gf, bf]));
293 }
294 }
295 String::new()
296}
297
/// Maps a numeric heading level to its legacy level name.
///
/// Levels 1 through 5 map to `Title`, `Subtitle`, `Heading1`, `Heading2` and
/// `Heading3`. Every other value (including 0) maps to `Heading4`.
fn heading_level_name(level: u32) -> &'static str {
    const NAMED_LEVELS: [&str; 5] = ["Title", "Subtitle", "Heading1", "Heading2", "Heading3"];

    NAMED_LEVELS
        .iter()
        .zip(1u32..)
        .find(|(_, number)| *number == level)
        .map(|(name, _)| *name)
        .unwrap_or("Heading4")
}
313
/// Classifies a raw numbering-style string into one of the legacy labels
/// `bullets`, `letters`, `roman numerals` or `arabic numbers`.
///
/// The input is trimmed first. The exact keyword names `bullet`, `letter` and
/// `roman` are recognised before any character heuristic runs. Otherwise the
/// label is inferred from the characters present, with `arabic numbers` as
/// the fallback.
fn numbering_style_label(raw: &str) -> &str {
    let marker = raw.trim();

    // Keywords must be matched before the heuristics below: "roman" contains
    // an 'a', so the letter heuristic would otherwise claim it and the roman
    // keyword could never be reached.
    // NOTE(review): other keyword-like names containing 'a' or 'i' (for example
    // "arabic") still go through the heuristics; confirm which names callers
    // actually pass.
    match marker {
        "bullet" => return "bullets",
        "letter" => return "letters",
        "roman" => return "roman numerals",
        _ => {}
    }

    let has_bullet_glyph = marker.contains(|c: char| matches!(c, '•' | '-' | '*'));
    if has_bullet_glyph || marker == "–" {
        "bullets"
    } else if marker.contains(|c: char| matches!(c, 'a' | 'A')) {
        "letters"
    } else if marker.contains(|c: char| matches!(c, 'i' | 'I')) {
        "roman numerals"
    } else {
        "arabic numbers"
    }
}
331
332fn paragraph_to_legacy(para: &SemanticParagraph) -> Value {
337 let node = ¶.base;
338 let (font, font_size, color) = text_node_style(node);
339 let mut obj = Map::new();
340 obj.insert("type".into(), json!("paragraph"));
341 obj.insert("id".into(), json!(next_id()));
342 obj.insert("page number".into(), json!(page_num(&node.bbox)));
343 obj.insert("bounding box".into(), bbox_array(&node.bbox));
344 obj.insert("font".into(), json!(font));
345 obj.insert("font size".into(), json!(font_size));
346 obj.insert("text color".into(), json!(color));
347 obj.insert("content".into(), json!(node_text(node)));
348 Value::Object(obj)
349}
350
351fn heading_to_legacy(heading: &SemanticHeading) -> Value {
352 let node = &heading.base.base;
353 let (font, font_size, color) = text_node_style(node);
354 let level_num = heading.heading_level.unwrap_or(3);
355 let level_name = heading_level_name(level_num);
356 let mut obj = Map::new();
357 obj.insert("type".into(), json!("heading"));
358 obj.insert("id".into(), json!(next_id()));
359 obj.insert("level".into(), json!(level_name));
360 obj.insert("page number".into(), json!(page_num(&node.bbox)));
361 obj.insert("bounding box".into(), bbox_array(&node.bbox));
362 obj.insert("heading level".into(), json!(level_num));
363 obj.insert("font".into(), json!(font));
364 obj.insert("font size".into(), json!(font_size));
365 obj.insert("text color".into(), json!(color));
366 obj.insert("content".into(), json!(node_text(node)));
367 Value::Object(obj)
368}
369
370fn number_heading_to_legacy(nh: &SemanticNumberHeading) -> Value {
371 heading_to_legacy(&nh.base)
372}
373
374fn caption_to_legacy(cap: &SemanticCaption) -> Value {
375 let node = &cap.base;
376 let (font, font_size, color) = text_node_style(node);
377 let mut obj = Map::new();
378 obj.insert("type".into(), json!("caption"));
379 obj.insert("id".into(), json!(next_id()));
380 obj.insert("page number".into(), json!(page_num(&node.bbox)));
381 obj.insert("bounding box".into(), bbox_array(&node.bbox));
382 if let Some(linked_id) = cap.linked_content_id {
383 obj.insert("linked content id".into(), json!(linked_id));
384 }
385 obj.insert("font".into(), json!(font));
386 obj.insert("font size".into(), json!(font_size));
387 obj.insert("text color".into(), json!(color));
388 obj.insert("content".into(), json!(node_text(node)));
389 Value::Object(obj)
390}
391
392fn header_footer_to_legacy(hf: &SemanticHeaderOrFooter, stem: &str, img_idx: &mut u64) -> Value {
393 let type_str = if hf.semantic_type == SemanticType::Header {
394 "header"
395 } else {
396 "footer"
397 };
398 let kids: Vec<Value> = hf
399 .contents
400 .iter()
401 .flat_map(|el| elements_to_legacy(el, stem, img_idx))
402 .collect();
403 let mut obj = Map::new();
404 obj.insert("type".into(), json!(type_str));
405 obj.insert("id".into(), json!(next_id()));
406 obj.insert("page number".into(), json!(page_num(&hf.bbox)));
407 obj.insert("bounding box".into(), bbox_array(&hf.bbox));
408 obj.insert("kids".into(), json!(kids));
409 Value::Object(obj)
410}
411
412fn image_to_legacy(img: &ImageChunk, stem: &str, img_idx: &mut u64) -> Value {
413 *img_idx += 1;
414 let source = format!("{}_images/imageFile{}.png", stem, img_idx);
415 let mut obj = Map::new();
416 obj.insert("type".into(), json!("image"));
417 obj.insert("id".into(), json!(next_id()));
418 obj.insert("page number".into(), json!(page_num(&img.bbox)));
419 obj.insert("bounding box".into(), bbox_array(&img.bbox));
420 obj.insert("source".into(), json!(source));
421 Value::Object(obj)
422}
423
424fn figure_to_legacy(fig: &SemanticFigure, stem: &str, img_idx: &mut u64) -> Vec<Value> {
425 if fig.images.is_empty() {
426 *img_idx += 1;
428 let source = format!("{}_images/imageFile{}.png", stem, img_idx);
429 let mut obj = Map::new();
430 obj.insert("type".into(), json!("image"));
431 obj.insert("id".into(), json!(next_id()));
432 obj.insert("page number".into(), json!(page_num(&fig.bbox)));
433 obj.insert("bounding box".into(), bbox_array(&fig.bbox));
434 obj.insert("source".into(), json!(source));
435 vec![Value::Object(obj)]
436 } else {
437 fig.images
439 .iter()
440 .map(|_img| {
441 *img_idx += 1;
442 let source = format!("{}_images/imageFile{}.png", stem, img_idx);
443 let mut obj = Map::new();
444 obj.insert("type".into(), json!("image"));
445 obj.insert("id".into(), json!(next_id()));
446 obj.insert("page number".into(), json!(page_num(&fig.bbox)));
447 obj.insert("bounding box".into(), bbox_array(&fig.bbox));
448 obj.insert("source".into(), json!(source));
449 Value::Object(obj)
450 })
451 .collect()
452 }
453}
454
455fn list_item_to_legacy(item: &ListItem, stem: &str, img_idx: &mut u64) -> Value {
456 let (font, font_size, color) = list_item_style(item);
457 let text = list_item_text(item);
458 let kids: Vec<Value> = item
460 .contents
461 .iter()
462 .filter(|e| matches!(e, ContentElement::List(_)))
463 .flat_map(|el| elements_to_legacy(el, stem, img_idx))
464 .collect();
465 let mut obj = Map::new();
466 obj.insert("type".into(), json!("list item"));
467 obj.insert("id".into(), json!(next_id()));
468 obj.insert("page number".into(), json!(page_num(&item.bbox)));
469 obj.insert("bounding box".into(), bbox_array(&item.bbox));
470 obj.insert("font".into(), json!(font));
471 obj.insert("font size".into(), json!(font_size));
472 obj.insert("text color".into(), json!(color));
473 obj.insert("content".into(), json!(text));
474 obj.insert("kids".into(), json!(kids));
475 Value::Object(obj)
476}
477
478fn list_to_legacy(list: &PDFList, stem: &str, img_idx: &mut u64) -> Value {
479 let numbering = list
480 .numbering_style
481 .as_deref()
482 .map(numbering_style_label)
483 .unwrap_or("arabic numbers");
484 let num_items = list.list_items.len();
485 let next_list_id_val = list.next_list_id.unwrap_or(0);
486 let prev_list_id_val = list.previous_list_id.unwrap_or(0);
487 let level = "1".to_string();
489
490 let list_items: Vec<Value> = list
491 .list_items
492 .iter()
493 .map(|item| list_item_to_legacy(item, stem, img_idx))
494 .collect();
495
496 let mut obj = Map::new();
497 obj.insert("type".into(), json!("list"));
498 obj.insert("id".into(), json!(next_id()));
499 obj.insert("level".into(), json!(level));
500 obj.insert("page number".into(), json!(page_num(&list.bbox)));
501 obj.insert("bounding box".into(), bbox_array(&list.bbox));
502 obj.insert("numbering style".into(), json!(numbering));
503 obj.insert("number of list items".into(), json!(num_items));
504 obj.insert("next list id".into(), json!(next_list_id_val));
505 obj.insert("previous list id".into(), json!(prev_list_id_val));
506 obj.insert("list items".into(), json!(list_items));
507 Value::Object(obj)
508}
509
510fn table_cell_to_legacy(cell: &TableBorderCell, stem: &str, img_idx: &mut u64) -> Value {
511 let kids: Vec<Value> = cell
512 .contents
513 .iter()
514 .flat_map(|el| elements_to_legacy(el, stem, img_idx))
515 .collect();
516 let mut obj = Map::new();
517 obj.insert("type".into(), json!("table cell"));
518 obj.insert("page number".into(), json!(page_num(&cell.bbox)));
519 obj.insert("bounding box".into(), bbox_array(&cell.bbox));
520 obj.insert("row number".into(), json!(cell.row_number + 1));
521 obj.insert("column number".into(), json!(cell.col_number + 1));
522 obj.insert("row span".into(), json!(cell.row_span));
523 obj.insert("column span".into(), json!(cell.col_span));
524 obj.insert("kids".into(), json!(kids));
525 Value::Object(obj)
526}
527
528fn table_row_to_legacy(row: &TableBorderRow, stem: &str, img_idx: &mut u64) -> Value {
529 let cells: Vec<Value> = row
530 .cells
531 .iter()
532 .map(|cell| table_cell_to_legacy(cell, stem, img_idx))
533 .collect();
534 let mut obj = Map::new();
535 obj.insert("type".into(), json!("table row"));
536 obj.insert("row number".into(), json!(row.row_number + 1));
537 obj.insert("cells".into(), json!(cells));
538 Value::Object(obj)
539}
540
541fn is_false_positive_table(tb: &TableBorder) -> bool {
555 let width = tb.bbox.right_x - tb.bbox.left_x;
556 let height = tb.bbox.top_y - tb.bbox.bottom_y;
557 let num_cells = tb.num_rows * tb.num_columns;
558
559 if num_cells == 1 && (width > 25.0 || height > 14.5) {
561 return true;
562 }
563 if num_cells > 1 && height < 30.0 {
565 return true;
566 }
567 if height > 0.0 && width / height > 8.0 && height < 25.0 {
569 return true;
570 }
571 false
572}
573
574fn table_border_to_legacy(tb: &TableBorder, stem: &str, img_idx: &mut u64) -> Option<Value> {
575 if is_false_positive_table(tb) {
576 return None;
577 }
578
579 let level = tb.level.clone().unwrap_or_else(|| "0".to_string());
580 let next_table_id: u64 = 0; let rows: Vec<Value> = tb
582 .rows
583 .iter()
584 .map(|row| table_row_to_legacy(row, stem, img_idx))
585 .collect();
586
587 let mut obj = Map::new();
588 obj.insert("type".into(), json!("table"));
589 obj.insert("id".into(), json!(next_id()));
590 obj.insert("level".into(), json!(level));
591 obj.insert("page number".into(), json!(page_num(&tb.bbox)));
592 obj.insert("bounding box".into(), bbox_array(&tb.bbox));
593 obj.insert("number of rows".into(), json!(tb.num_rows));
594 obj.insert("number of columns".into(), json!(tb.num_columns));
595 obj.insert("next table id".into(), json!(next_table_id));
596 obj.insert("rows".into(), json!(rows));
597 Some(Value::Object(obj))
598}
599
600fn semantic_table_to_legacy(st: &SemanticTable, stem: &str, img_idx: &mut u64) -> Option<Value> {
601 table_border_to_legacy(&st.table_border, stem, img_idx)
602}
603
604fn paragraph_as_caption_to_legacy(para: &SemanticParagraph) -> Value {
606 let node = ¶.base;
607 let (font, font_size, color) = text_node_style(node);
608 let mut obj = Map::new();
609 obj.insert("type".into(), json!("caption"));
610 obj.insert("id".into(), json!(next_id()));
611 obj.insert("page number".into(), json!(page_num(&node.bbox)));
612 obj.insert("bounding box".into(), bbox_array(&node.bbox));
613 obj.insert("font".into(), json!(font));
614 obj.insert("font size".into(), json!(font_size));
615 obj.insert("text color".into(), json!(color));
616 obj.insert("content".into(), json!(node_text(node)));
617 Value::Object(obj)
618}
619
620fn heading_as_caption_to_legacy(heading: &SemanticHeading) -> Value {
622 let node = &heading.base.base;
623 let (font, font_size, color) = text_node_style(node);
624 let mut obj = Map::new();
625 obj.insert("type".into(), json!("caption"));
626 obj.insert("id".into(), json!(next_id()));
627 obj.insert("page number".into(), json!(page_num(&node.bbox)));
628 obj.insert("bounding box".into(), bbox_array(&node.bbox));
629 obj.insert("font".into(), json!(font));
630 obj.insert("font size".into(), json!(font_size));
631 obj.insert("text color".into(), json!(color));
632 obj.insert("content".into(), json!(node_text(node)));
633 Value::Object(obj)
634}
635
636fn elements_to_legacy(el: &ContentElement, stem: &str, img_idx: &mut u64) -> Vec<Value> {
643 match el {
644 ContentElement::Paragraph(p) => {
645 if p.base.is_empty() {
647 return vec![];
648 }
649 if p.base.semantic_type == SemanticType::Caption {
651 vec![paragraph_as_caption_to_legacy(p)]
652 } else {
653 vec![paragraph_to_legacy(p)]
654 }
655 }
656 ContentElement::Heading(h) => {
657 if h.base.base.is_empty() {
659 return vec![];
660 }
661 if h.base.base.semantic_type == SemanticType::Caption {
663 vec![heading_as_caption_to_legacy(h)]
664 } else {
665 vec![heading_to_legacy(h)]
666 }
667 }
668 ContentElement::NumberHeading(nh) => vec![number_heading_to_legacy(nh)],
669 ContentElement::Caption(c) => vec![caption_to_legacy(c)],
670 ContentElement::HeaderFooter(hf) => vec![header_footer_to_legacy(hf, stem, img_idx)],
671 ContentElement::Figure(fig) => figure_to_legacy(fig, stem, img_idx),
672 ContentElement::Image(img) => vec![image_to_legacy(img, stem, img_idx)],
673 ContentElement::List(l) => vec![list_to_legacy(l, stem, img_idx)],
674 ContentElement::Table(st) => semantic_table_to_legacy(st, stem, img_idx)
675 .into_iter()
676 .collect(),
677 ContentElement::TableBorder(tb) => table_border_to_legacy(tb, stem, img_idx)
678 .into_iter()
679 .collect(),
680 ContentElement::TextChunk(_)
682 | ContentElement::TextLine(_)
683 | ContentElement::TextBlock(_)
684 | ContentElement::Line(_)
685 | ContentElement::LineArt(_)
686 | ContentElement::Formula(_)
687 | ContentElement::Picture(_) => vec![],
688 }
689}
690
691pub fn to_legacy_json_value(doc: &PdfDocument, stem: &str) -> Value {
700 reset_ids();
701 let mut img_idx: u64 = 0;
702
703 let kids: Vec<Value> = doc
704 .kids
705 .iter()
706 .flat_map(|el| elements_to_legacy(el, stem, &mut img_idx))
707 .collect();
708
709 let mut obj = Map::new();
710 obj.insert("file name".into(), json!(doc.file_name));
711 obj.insert("number of pages".into(), json!(doc.number_of_pages));
712 obj.insert(
713 "author".into(),
714 doc.author.as_deref().map_or(Value::Null, |s| json!(s)),
715 );
716 obj.insert(
717 "title".into(),
718 doc.title.as_deref().map_or(Value::Null, |s| json!(s)),
719 );
720 obj.insert(
721 "creation date".into(),
722 doc.creation_date
723 .as_deref()
724 .map_or(Value::Null, |s| json!(s)),
725 );
726 obj.insert(
727 "modification date".into(),
728 doc.modification_date
729 .as_deref()
730 .map_or(Value::Null, |s| json!(s)),
731 );
732 obj.insert("kids".into(), json!(kids));
733
734 Value::Object(obj)
735}
736
737pub fn to_legacy_json_string(doc: &PdfDocument, stem: &str) -> Result<String, EdgePdfError> {
742 let value = to_legacy_json_value(doc, stem);
743 serde_json::to_string_pretty(&value)
744 .map_err(|e| EdgePdfError::OutputError(format!("Legacy JSON serialization failed: {}", e)))
745}
746
#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::bbox::BoundingBox;
    use crate::models::enums::SemanticType;
    use crate::models::semantic::{SemanticParagraph, SemanticTextNode};
    use crate::models::text::TextColumn;

    /// Builds a bounding box on the given page.
    fn make_bbox(page: u32, left: f64, bottom: f64, right: f64, top: f64) -> BoundingBox {
        BoundingBox::new(Some(page), left, bottom, right, top)
    }

    /// Builds a paragraph-typed text node holding one column, one block, one
    /// line and one chunk with the given text, using font `TestFont` at 12pt
    /// and black text.
    fn make_text_node(bbox: BoundingBox, text: &str) -> SemanticTextNode {
        use crate::models::chunks::TextChunk;
        use crate::models::enums::{PdfLayer, TextFormat, TextType};
        use crate::models::text::{TextBlock, TextLine};
        let chunk = TextChunk {
            value: text.to_string(),
            bbox: bbox.clone(),
            font_name: "TestFont".to_string(),
            font_size: 12.0,
            font_weight: 400.0,
            italic_angle: 0.0,
            font_color: "#000000".to_string(),
            contrast_ratio: 21.0,
            symbol_ends: vec![],
            text_format: TextFormat::Normal,
            text_type: TextType::Regular,
            pdf_layer: PdfLayer::Main,
            ocg_visible: true,
            index: None,
            page_number: Some(1),
            level: None,
            mcid: None,
        };
        let line = TextLine {
            bbox: bbox.clone(),
            index: None,
            level: None,
            font_size: 12.0,
            base_line: 2.0,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_chunks: vec![chunk],
            is_line_start: true,
            is_line_end: true,
            is_list_line: false,
            connected_line_art_label: None,
        };
        let block = TextBlock {
            bbox: bbox.clone(),
            index: None,
            level: None,
            font_size: 12.0,
            base_line: 2.0,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_lines: vec![line],
            has_start_line: true,
            has_end_line: true,
            text_alignment: None,
        };
        let col = TextColumn {
            bbox: bbox.clone(),
            index: None,
            level: None,
            font_size: 12.0,
            base_line: 2.0,
            slant_degree: 0.0,
            is_hidden_text: false,
            text_blocks: vec![block],
        };
        SemanticTextNode {
            bbox,
            index: None,
            level: None,
            semantic_type: SemanticType::Paragraph,
            correct_semantic_score: None,
            columns: vec![col],
            font_weight: Some(400.0),
            font_size: Some(12.0),
            text_color: Some(vec![0.0, 0.0, 0.0]),
            italic_angle: None,
            font_name: Some("TestFont".to_string()),
            text_format: None,
            max_font_size: None,
            background_color: None,
            is_hidden_text: false,
        }
    }

    /// An empty document still emits the space-separated legacy keys.
    #[test]
    fn test_empty_document() {
        let doc = PdfDocument::new("test.pdf".to_string());
        let json = to_legacy_json_string(&doc, "test").unwrap();
        assert!(json.contains("\"file name\""));
        assert!(json.contains("\"number of pages\""));
        assert!(json.contains("\"kids\""));
        // Legacy keys use spaces, never snake_case.
        assert!(!json.contains("number_of_pages"));
    }

    /// A paragraph object carries every expected legacy key and value.
    #[test]
    fn test_paragraph_has_legacy_keys() {
        let bbox = make_bbox(1, 54.0, 100.0, 300.0, 120.0);
        let node = make_text_node(bbox, "Hello world");
        let para = SemanticParagraph {
            base: node,
            enclosed_top: false,
            enclosed_bottom: false,
            indentation: 0,
        };
        let val = paragraph_to_legacy(&para);
        let s = serde_json::to_string_pretty(&val).unwrap();
        assert!(s.contains("\"type\""));
        assert!(s.contains("\"page number\""));
        assert!(s.contains("\"bounding box\""));
        assert!(s.contains("\"font size\""));
        assert!(s.contains("\"text color\""));
        assert!(s.contains("\"content\""));
        assert!(s.contains("\"paragraph\""));
        assert!(s.contains("[0.0, 0.0, 0.0]"));
    }

    /// Whole-number components keep one decimal; `None` renders as empty.
    #[test]
    fn test_text_color_grayscale() {
        assert_eq!(
            text_color_string(&Some(vec![0.0, 0.0, 0.0])),
            "[0.0, 0.0, 0.0]"
        );
        assert_eq!(
            text_color_string(&Some(vec![1.0, 1.0, 1.0])),
            "[1.0, 1.0, 1.0]"
        );
        assert_eq!(text_color_string(&None), "");
    }

    /// Colour strings are bracketed and contain each component.
    #[test]
    fn test_text_color_rgb() {
        let result = text_color_string(&Some(vec![1.0, 0.0, 0.0]));
        assert!(result.contains("1.0") && result.contains("0.0"));
        assert!(result.starts_with('[') && result.ends_with(']'));
    }

    /// The bounding-box array order is left, bottom, right, top.
    #[test]
    fn test_bbox_array_order() {
        let bbox = make_bbox(1, 10.0, 20.0, 300.0, 400.0);
        let arr = bbox_array(&bbox);
        if let Value::Array(v) = arr {
            assert_eq!(v[0].as_f64().unwrap(), 10.0);
            assert_eq!(v[1].as_f64().unwrap(), 20.0);
            assert_eq!(v[2].as_f64().unwrap(), 300.0);
            assert_eq!(v[3].as_f64().unwrap(), 400.0);
        } else {
            panic!("Expected array");
        }
    }
}