1#[cfg(not(target_arch = "wasm32"))]
4use regex::Regex;
5use std::collections::{HashMap, HashSet};
6#[cfg(not(target_arch = "wasm32"))]
7use std::path::Path;
8#[cfg(not(target_arch = "wasm32"))]
9use std::process::Command;
10
11use crate::models::bbox::BoundingBox;
12use crate::models::chunks::TextChunk;
13use crate::models::content::ContentElement;
14use crate::models::document::PdfDocument;
15use crate::models::enums::SemanticType;
16use crate::models::semantic::SemanticTextNode;
17use crate::models::table::TableTokenRow;
18use crate::EdgePdfError;
19
20#[cfg(not(target_arch = "wasm32"))]
21struct CachedBBoxLayout {
22 page_width: f64,
23 lines: Vec<BBoxLayoutLine>,
24 blocks: Vec<BBoxLayoutBlock>,
25}
26
27#[cfg(not(target_arch = "wasm32"))]
28#[derive(Default)]
29struct LayoutSourceCache {
30 bbox_layout: Option<Option<CachedBBoxLayout>>,
31 layout_lines: Option<Option<Vec<String>>>,
32}
33
34#[cfg(not(target_arch = "wasm32"))]
35impl LayoutSourceCache {
36 fn bbox_layout(&mut self, doc: &PdfDocument) -> Option<&CachedBBoxLayout> {
37 if self.bbox_layout.is_none() {
38 let loaded = doc.source_path.as_deref().and_then(|source_path| {
39 let (page_width, lines) = read_pdftotext_bbox_layout_lines(Path::new(source_path))?;
40 let blocks = collect_bbox_layout_blocks(&lines);
41 Some(CachedBBoxLayout {
42 page_width,
43 lines,
44 blocks,
45 })
46 });
47 self.bbox_layout = Some(loaded);
48 }
49 self.bbox_layout.as_ref().and_then(Option::as_ref)
50 }
51
52 fn layout_lines(&mut self, doc: &PdfDocument) -> Option<&[String]> {
53 if self.layout_lines.is_none() {
54 let loaded = doc
55 .source_path
56 .as_deref()
57 .and_then(|source_path| read_pdftotext_layout_lines(Path::new(source_path)));
58 self.layout_lines = Some(loaded);
59 }
60 self.layout_lines
61 .as_ref()
62 .and_then(Option::as_ref)
63 .map(Vec::as_slice)
64 }
65}
66
67pub fn to_markdown(doc: &PdfDocument) -> Result<String, EdgePdfError> {
72 #[cfg(not(target_arch = "wasm32"))]
73 let mut layout_cache = LayoutSourceCache::default();
74 #[cfg(not(target_arch = "wasm32"))]
75 if let Some(rendered) = render_layout_open_plate_document_cached(doc, &mut layout_cache) {
76 return Ok(rendered);
77 }
78 #[cfg(not(target_arch = "wasm32"))]
79 if let Some(rendered) =
80 render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
81 {
82 return Ok(rendered);
83 }
84 #[cfg(not(target_arch = "wasm32"))]
85 if let Some(rendered) = render_layout_captioned_media_document_cached(doc, &mut layout_cache) {
86 return Ok(rendered);
87 }
88 #[cfg(not(target_arch = "wasm32"))]
89 if let Some(rendered) =
90 render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
91 {
92 return Ok(rendered);
93 }
94 #[cfg(not(target_arch = "wasm32"))]
95 if let Some(rendered) = render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
96 {
97 return Ok(rendered);
98 }
99 #[cfg(not(target_arch = "wasm32"))]
100 if let Some(rendered) = render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
101 {
102 return Ok(rendered);
103 }
104 #[cfg(not(target_arch = "wasm32"))]
105 if let Some(rendered) =
106 render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
107 {
108 return Ok(rendered);
109 }
110 #[cfg(not(target_arch = "wasm32"))]
111 if let Some(rendered) = render_layout_toc_document_cached(doc, &mut layout_cache) {
112 return Ok(rendered);
113 }
114 if looks_like_contents_document(doc) {
115 return Ok(render_contents_document(doc));
116 }
117 if looks_like_compact_toc_document(doc) {
118 return Ok(render_compact_toc_document(doc));
119 }
120 #[cfg(not(target_arch = "wasm32"))]
121 if let Some(rendered) = render_layout_projection_sheet_document_cached(doc, &mut layout_cache) {
122 return Ok(rendered);
123 }
124 #[cfg(not(target_arch = "wasm32"))]
125 if let Some(rendered) = render_layout_appendix_tables_document_cached(doc, &mut layout_cache) {
126 return Ok(rendered);
127 }
128 #[cfg(not(target_arch = "wasm32"))]
129 if let Some(rendered) = render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
130 {
131 return Ok(rendered);
132 }
133 #[cfg(not(target_arch = "wasm32"))]
134 if let Some(rendered) = render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
135 {
136 return Ok(rendered);
137 }
138 #[cfg(not(target_arch = "wasm32"))]
139 if let Some(rendered) =
140 render_layout_registration_report_document_cached(doc, &mut layout_cache)
141 {
142 return Ok(rendered);
143 }
144 if let Some(rendered) = render_top_table_plate_document(doc) {
145 return Ok(rendered);
146 }
147 if let Some(rendered) = render_single_table_report_document(doc) {
148 return Ok(rendered);
149 }
150 if let Some(rendered) = render_late_section_boundary_document(doc) {
151 return Ok(rendered);
152 }
153 #[cfg(not(target_arch = "wasm32"))]
154 if let Some(rendered) = render_layout_matrix_document_cached(doc, &mut layout_cache) {
155 return Ok(rendered);
156 }
157 #[cfg(not(target_arch = "wasm32"))]
158 if let Some(rendered) = render_layout_panel_stub_document_cached(doc, &mut layout_cache) {
159 return Ok(rendered);
160 }
161
162 Ok(render_markdown_core(doc))
163}
164
165fn render_markdown_core(doc: &PdfDocument) -> String {
166 let mut output = String::new();
167
168 if let Some(ref title) = doc.title {
170 let trimmed = title.trim();
171 if !trimmed.is_empty() && !should_skip_document_title(doc, trimmed) {
172 if should_render_document_title_as_plaintext(doc, trimmed) {
173 output.push_str(trimmed);
174 output.push_str("\n\n");
175 } else {
176 output.push_str(&format!("# {}\n\n", trimmed));
177 }
178 }
179 }
180
181 if doc.kids.is_empty() {
182 output.push_str("*No content extracted.*\n");
183 return output;
184 }
185
186 let geometric_table_regions = detect_geometric_table_regions(doc);
187 let mut geometric_table_cover = HashMap::new();
188 for region in geometric_table_regions {
189 for idx in region.start_idx..=region.end_idx {
190 geometric_table_cover.insert(idx, region.clone());
191 }
192 }
193
194 let mut i = 0usize;
195 while i < doc.kids.len() {
196 if let Some(region) = geometric_table_cover.get(&i) {
197 output.push_str(®ion.rendered);
198 i = region.end_idx + 1;
199 continue;
200 }
201
202 match &doc.kids[i] {
203 ContentElement::Heading(h) => {
204 let text = h.base.base.value();
205 let trimmed = text.trim();
206 if trimmed.is_empty() || should_skip_heading_text(trimmed) {
207 i += 1;
208 continue;
209 }
210
211 if looks_like_table_header_duplicate_heading(doc, i, trimmed) {
214 output.push_str(&escape_md_line_start(trimmed));
215 output.push_str("\n\n");
216 i += 1;
217 continue;
218 }
219
220 if looks_like_bottom_margin_heading(doc, i) {
223 output.push_str(&escape_md_line_start(trimmed));
224 output.push_str("\n\n");
225 i += 1;
226 continue;
227 }
228
229 if should_demote_period_heading(trimmed) {
232 output.push_str(&escape_md_line_start(trimmed));
233 output.push_str("\n\n");
234 i += 1;
235 continue;
236 }
237
238 if should_demote_comma_heading(trimmed) {
240 output.push_str(&escape_md_line_start(trimmed));
241 output.push_str("\n\n");
242 i += 1;
243 continue;
244 }
245
246 if should_demote_math_heading(trimmed) {
248 output.push_str(&escape_md_line_start(trimmed));
249 output.push_str("\n\n");
250 i += 1;
251 continue;
252 }
253
254 if should_demote_percentage_heading(trimmed) {
256 output.push_str(&escape_md_line_start(trimmed));
257 output.push_str("\n\n");
258 i += 1;
259 continue;
260 }
261
262 if starts_with_caption_prefix(trimmed) {
266 output.push_str(&escape_md_line_start(trimmed));
267 output.push_str("\n\n");
268 i += 1;
269 continue;
270 }
271
272 if should_demote_bibliography_heading(trimmed) {
275 output.push_str(&escape_md_line_start(trimmed));
276 output.push_str("\n\n");
277 i += 1;
278 continue;
279 }
280
281 if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
282 if should_demote_heading_to_paragraph(trimmed, &next_text) {
283 let mut merged = trimmed.to_string();
284 merge_paragraph_text(&mut merged, &next_text);
285 output.push_str(&escape_md_line_start(merged.trim()));
286 output.push_str("\n\n");
287 i += 2;
288 continue;
289 }
290 }
291
292 let mut merged_heading = trimmed.to_string();
296 while let Some(ContentElement::Heading(next_h)) = doc.kids.get(i + 1) {
297 let next_text = next_h.base.base.value();
298 let next_trimmed = next_text.trim();
299 if next_trimmed.is_empty() || should_skip_heading_text(next_trimmed) {
300 i += 1;
301 continue;
302 }
303 if merged_heading.len() + 1 + next_trimmed.len() > 200 {
305 break;
306 }
307 merge_paragraph_text(&mut merged_heading, next_trimmed);
308 i += 1;
309 }
310
311 let cleaned_heading = strip_trailing_page_number(merged_heading.trim());
312
313 if let Some(split_pos) = find_merged_subsection_split(cleaned_heading) {
315 let first = cleaned_heading[..split_pos].trim();
316 let second = cleaned_heading[split_pos..].trim();
317 output.push_str(&format!("# {}\n\n", first));
318 output.push_str(&format!("# {}\n\n", second));
319 } else {
320 output.push_str(&format!("# {}\n\n", cleaned_heading));
321 }
322 }
323 ContentElement::NumberHeading(nh) => {
324 let text = nh.base.base.base.value();
325 let trimmed = text.trim();
326 if trimmed.is_empty() || should_skip_heading_text(trimmed) {
327 i += 1;
328 continue;
329 }
330
331 if should_demote_comma_heading(trimmed) {
333 output.push_str(&escape_md_line_start(trimmed));
334 output.push_str("\n\n");
335 i += 1;
336 continue;
337 }
338
339 if should_demote_math_heading(trimmed) {
341 output.push_str(&escape_md_line_start(trimmed));
342 output.push_str("\n\n");
343 i += 1;
344 continue;
345 }
346
347 if should_demote_percentage_heading(trimmed) {
349 output.push_str(&escape_md_line_start(trimmed));
350 output.push_str("\n\n");
351 i += 1;
352 continue;
353 }
354
355 if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
356 if should_demote_heading_to_paragraph(trimmed, &next_text) {
357 let mut merged = trimmed.to_string();
358 merge_paragraph_text(&mut merged, &next_text);
359 output.push_str(&escape_md_line_start(merged.trim()));
360 output.push_str("\n\n");
361 i += 2;
362 continue;
363 }
364 }
365
366 let cleaned = strip_trailing_page_number(trimmed);
367
368 if let Some(split_pos) = find_merged_subsection_split(cleaned) {
370 let first = cleaned[..split_pos].trim();
371 let second = cleaned[split_pos..].trim();
372 output.push_str(&format!("# {}\n\n", first));
373 output.push_str(&format!("# {}\n\n", second));
374 } else {
375 output.push_str(&format!("# {}\n\n", cleaned));
376 }
377 }
378 ContentElement::Paragraph(_)
379 | ContentElement::TextBlock(_)
380 | ContentElement::TextLine(_) => {
381 let element = &doc.kids[i];
382 let text = match &doc.kids[i] {
383 ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
384 ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
385 ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
386 _ => unreachable!(),
387 };
388 let trimmed = text.trim();
389 if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
390 i += 1;
391 continue;
392 }
393 if should_skip_leading_figure_carryover(doc, i, trimmed) {
394 i += 1;
395 continue;
396 }
397
398 if should_render_paragraph_as_heading(doc, i, trimmed, doc.kids.get(i + 1)) {
399 let cleaned = strip_trailing_page_number(trimmed);
400 if let Some(split_pos) = find_merged_subsection_split(cleaned) {
402 let first = cleaned[..split_pos].trim();
403 let second = cleaned[split_pos..].trim();
404 output.push_str(&format!("# {}\n\n", first));
405 output.push_str(&format!("# {}\n\n", second));
406 } else {
407 output.push_str(&format!("# {}\n\n", cleaned));
408 }
409 i += 1;
410 continue;
411 }
412
413 if matches!(element, ContentElement::Paragraph(p) if p.base.semantic_type == SemanticType::TableOfContent)
414 {
415 output.push_str(&escape_md_line_start(trimmed));
416 output.push('\n');
417 i += 1;
418 continue;
419 }
420
421 if is_short_caption_label(trimmed) {
422 if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
423 if let Some((caption_tail, body)) =
424 split_following_caption_tail_and_body(&next_text)
425 {
426 let mut caption = trimmed.to_string();
427 caption.push('\n');
428 caption.push_str(caption_tail);
429 output.push_str(&escape_md_line_start(caption.trim()));
430 output.push_str("\n\n");
431 output.push_str(&escape_md_line_start(body));
432 output.push_str("\n\n");
433 i += 2;
434 continue;
435 }
436
437 if looks_like_caption_tail(&next_text) {
438 let mut caption = trimmed.to_string();
439 caption.push('\n');
440 caption.push_str(next_text.trim());
441
442 if let Some(year_text) =
443 next_mergeable_paragraph_text(doc.kids.get(i + 2))
444 {
445 if looks_like_caption_year(&year_text) {
446 caption.push('\n');
447 caption.push_str(year_text.trim());
448 i += 1;
449 }
450 }
451
452 output.push_str(&escape_md_line_start(caption.trim()));
453 output.push_str("\n\n");
454 i += 2;
455 continue;
456 }
457 }
458 }
459
460 if let Some((caption, body)) = split_leading_caption_and_body(trimmed) {
461 output.push_str(&escape_md_line_start(caption));
462 output.push_str("\n\n");
463 output.push_str(&escape_md_line_start(body));
464 output.push_str("\n\n");
465 i += 1;
466 continue;
467 }
468
469 let mut merged = trimmed.to_string();
470 while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
471 let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
472 should_merge_adjacent_semantic_paragraphs(&merged, &next_text)
473 } else {
474 should_merge_paragraph_text(&merged, &next_text)
475 };
476 if !can_merge {
477 break;
478 }
479 merge_paragraph_text(&mut merged, &next_text);
480 i += 1;
481 }
482
483 output.push_str(&escape_md_line_start(merged.trim()));
484 output.push_str("\n\n");
485 }
486 other => render_element(&mut output, other),
487 }
488 i += 1;
489 }
490
491 let output = merge_adjacent_pipe_tables(&output);
495 let output = normalize_chart_like_markdown(&output);
496 drop_isolated_noise_lines(&output)
497}
498
499fn cmp_banded_reading_order(
500 left: &BoundingBox,
501 right: &BoundingBox,
502 band_height: f64,
503) -> std::cmp::Ordering {
504 let safe_band = band_height.max(1.0);
505 let left_band = (left.top_y / safe_band).round() as i64;
506 let right_band = (right.top_y / safe_band).round() as i64;
507 right_band
508 .cmp(&left_band)
509 .then_with(|| {
510 left.left_x
511 .partial_cmp(&right.left_x)
512 .unwrap_or(std::cmp::Ordering::Equal)
513 })
514 .then_with(|| {
515 right
516 .top_y
517 .partial_cmp(&left.top_y)
518 .unwrap_or(std::cmp::Ordering::Equal)
519 })
520 .then_with(|| {
521 right
522 .bottom_y
523 .partial_cmp(&left.bottom_y)
524 .unwrap_or(std::cmp::Ordering::Equal)
525 })
526 .then_with(|| {
527 left.right_x
528 .partial_cmp(&right.right_x)
529 .unwrap_or(std::cmp::Ordering::Equal)
530 })
531}
532
533fn should_skip_document_title(doc: &PdfDocument, title: &str) -> bool {
534 first_heading_like_text(doc)
535 .filter(|first| !equivalent_heading_text(first, title))
536 .is_some()
537}
538
539fn should_render_document_title_as_plaintext(doc: &PdfDocument, title: &str) -> bool {
540 if title.split_whitespace().count() > 6 {
541 return false;
542 }
543
544 let mut early = doc.kids.iter().take(6);
545 let has_explicit_heading = early.clone().any(|element| {
546 matches!(
547 element,
548 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
549 )
550 });
551 let has_tableish_content = early.any(|element| {
552 matches!(
553 element,
554 ContentElement::List(_) | ContentElement::Table(_) | ContentElement::TableBorder(_)
555 )
556 });
557
558 has_tableish_content && !has_explicit_heading
559}
560
561fn render_top_table_plate_document(doc: &PdfDocument) -> Option<String> {
562 if doc.number_of_pages != 1 {
563 return None;
564 }
565
566 let (table_idx, table) =
567 doc.kids.iter().enumerate().find_map(|(idx, element)| {
568 table_border_from_element(element).map(|table| (idx, table))
569 })?;
570 if table.num_columns < 5 || table.rows.len() < 4 {
571 return None;
572 }
573
574 let mut header_probe = collect_table_border_rows(table);
575 if header_probe.len() < 3 || !preserve_grouped_header_rows(&mut header_probe) {
576 return None;
577 }
578
579 let table_top = table.bbox.top_y;
580 let table_bottom = table.bbox.bottom_y;
581 let table_height = table.bbox.height().max(1.0);
582 let page_top = doc
583 .kids
584 .iter()
585 .map(|element| element.bbox().top_y)
586 .fold(f64::NEG_INFINITY, f64::max);
587 if !page_top.is_finite() || page_top - table_top > table_height * 3.0 {
588 return None;
589 }
590
591 let caption_gap_limit = (table_height * 2.2).clamp(48.0, 132.0);
592 let mut caption_indices = Vec::new();
593 for idx in table_idx + 1..doc.kids.len() {
594 let element = &doc.kids[idx];
595 if !is_geometric_text_candidate(element) {
596 if table_bottom - element.bbox().top_y > caption_gap_limit {
597 break;
598 }
599 continue;
600 }
601
602 let text = extract_element_text(element);
603 if text.trim().is_empty() || looks_like_margin_page_number(doc, element, &text) {
604 continue;
605 }
606
607 let gap = table_bottom - element.bbox().top_y;
608 if gap < -6.0 {
609 break;
610 }
611 if gap > caption_gap_limit {
612 break;
613 }
614 caption_indices.push(idx);
615 }
616 if caption_indices.is_empty() {
617 return None;
618 }
619
620 let has_body_below = doc
621 .kids
622 .iter()
623 .enumerate()
624 .skip(caption_indices.last().copied()? + 1)
625 .any(|(_, element)| {
626 is_geometric_text_candidate(element)
627 && !extract_element_text(element).trim().is_empty()
628 && table_bottom - element.bbox().top_y > caption_gap_limit
629 });
630 if !has_body_below {
631 return None;
632 }
633
634 let mut output = String::new();
635 render_table_border(&mut output, table);
636
637 let mut caption = String::new();
638 for idx in &caption_indices {
639 let text = extract_element_text(&doc.kids[*idx]);
640 if text.trim().is_empty() {
641 continue;
642 }
643 merge_paragraph_text(&mut caption, &text);
644 }
645 let trimmed = caption.trim();
646 if trimmed.is_empty() {
647 return None;
648 }
649 output.push_str(&escape_md_line_start(trimmed));
650 output.push_str("\n\n");
651 Some(output)
652}
653
654fn render_single_table_report_document(doc: &PdfDocument) -> Option<String> {
655 if doc.number_of_pages != 1 || !(2..=4).contains(&doc.kids.len()) {
656 return None;
657 }
658
659 let title = &doc.kids[0];
660 if !is_geometric_text_candidate(title) {
661 return None;
662 }
663 let title_text = extract_element_text(title);
664 if title_text.trim().is_empty() || title_text.split_whitespace().count() < 4 {
665 return None;
666 }
667
668 let table = table_border_from_element(&doc.kids[1])?;
669 if table.num_columns < 4 || table.rows.len() < 4 {
670 return None;
671 }
672
673 let page_top = doc
674 .kids
675 .iter()
676 .map(|element| element.bbox().top_y)
677 .fold(f64::NEG_INFINITY, f64::max);
678 if !page_top.is_finite() {
679 return None;
680 }
681
682 let title_bbox = title.bbox();
683 let table_bbox = &table.bbox;
684 if page_top - title_bbox.top_y > 24.0 {
685 return None;
686 }
687
688 let vertical_gap = title_bbox.bottom_y - table_bbox.top_y;
689 if !(8.0..=40.0).contains(&vertical_gap) {
690 return None;
691 }
692
693 if (title_bbox.center_x() - table_bbox.center_x()).abs() > table_bbox.width() * 0.12 {
694 return None;
695 }
696
697 if doc.kids.iter().skip(2).any(|element| {
698 let text = extract_element_text(element);
699 let trimmed = text.trim();
700 !trimmed.is_empty()
701 && !looks_like_footer_banner(trimmed)
702 && !looks_like_margin_page_number(doc, element, trimmed)
703 }) {
704 return None;
705 }
706
707 let mut rows = collect_table_border_rows(table);
708 if rows.is_empty() {
709 return None;
710 }
711 merge_continuation_rows(&mut rows);
712 trim_leading_table_carryover_rows(&mut rows);
713 if rows.len() < 2 {
714 return None;
715 }
716
717 let mut output = String::new();
718 output.push_str("# ");
719 output.push_str(title_text.trim());
720 output.push_str("\n\n");
721 output.push_str(&render_pipe_rows(&rows));
722 Some(output)
723}
724
725fn render_late_section_boundary_document(doc: &PdfDocument) -> Option<String> {
726 if doc.number_of_pages != 1 || doc.kids.len() < 8 {
727 return None;
728 }
729
730 let page_top = doc
731 .kids
732 .iter()
733 .map(|element| element.bbox().top_y)
734 .fold(f64::NEG_INFINITY, f64::max);
735 if !page_top.is_finite() {
736 return None;
737 }
738
739 let heading_idx = doc.kids.iter().position(|element| {
740 matches!(
741 element,
742 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
743 )
744 })?;
745 if heading_idx < 5 {
746 return None;
747 }
748
749 let heading = &doc.kids[heading_idx];
750 let heading_text = extract_element_text(heading);
751 if heading_text.trim().is_empty() {
752 return None;
753 }
754
755 let heading_top = heading.bbox().top_y;
756 if page_top - heading_top < 240.0 {
757 return None;
758 }
759
760 let leading_text_indices = (0..heading_idx)
761 .filter(|idx| is_geometric_text_candidate(&doc.kids[*idx]))
762 .collect::<Vec<_>>();
763 if leading_text_indices.len() < 5 {
764 return None;
765 }
766
767 let colon_ended = leading_text_indices
768 .iter()
769 .filter(|idx| {
770 extract_element_text(&doc.kids[**idx])
771 .trim_end()
772 .ends_with(':')
773 })
774 .count();
775 if colon_ended * 2 < leading_text_indices.len() {
776 return None;
777 }
778
779 let trailing_indices = (heading_idx + 1..doc.kids.len())
780 .filter(|idx| is_geometric_text_candidate(&doc.kids[*idx]))
781 .filter(|idx| {
782 let text = extract_element_text(&doc.kids[*idx]);
783 !text.trim().is_empty() && !looks_like_margin_page_number(doc, &doc.kids[*idx], &text)
784 })
785 .collect::<Vec<_>>();
786 if trailing_indices.is_empty() || trailing_indices.len() > 5 {
787 return None;
788 }
789
790 let mut footer_count = 0usize;
791 let content_indices = trailing_indices
792 .into_iter()
793 .filter(|idx| {
794 let text = extract_element_text(&doc.kids[*idx]);
795 let is_footerish =
796 doc.kids[*idx].bbox().top_y < 96.0 && text.split_whitespace().count() >= 4;
797 footer_count += usize::from(is_footerish);
798 !is_footerish
799 })
800 .collect::<Vec<_>>();
801 if content_indices.is_empty() || footer_count == 0 {
802 return None;
803 }
804
805 let mut fragments = content_indices
806 .iter()
807 .map(|idx| (*idx, &doc.kids[*idx]))
808 .collect::<Vec<_>>();
809 fragments.sort_by(|left, right| cmp_banded_reading_order(left.1.bbox(), right.1.bbox(), 6.0));
810
811 let mut paragraph = String::new();
812 for (_, element) in fragments {
813 let text = extract_element_text(element);
814 if text.trim().is_empty() {
815 continue;
816 }
817 merge_paragraph_text(&mut paragraph, &text);
818 }
819 let trimmed_paragraph = paragraph.trim();
820 if trimmed_paragraph.is_empty() {
821 return None;
822 }
823
824 let mut output = String::new();
825 output.push_str("# ");
826 output.push_str(heading_text.trim());
827 output.push_str("\n\n");
828 output.push_str(&escape_md_line_start(trimmed_paragraph));
829 output.push_str("\n\n");
830 Some(output)
831}
832
/// A candidate table header row found in layout text.
///
/// NOTE(review): the code using this type is outside this section; the field
/// notes below are read from the names only — confirm against the call sites.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutHeaderCandidate {
    /// Presumably the index of the layout line holding the header.
    line_idx: usize,
    /// Header cell texts.
    headers: Vec<String>,
    /// Presumably the start offset of each header cell within the line.
    starts: Vec<usize>,
}
840
/// One row of cell texts taken from a layout line.
///
/// NOTE(review): usage is outside this section; `line_idx` is presumably the
/// index of the originating layout line — confirm.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutEntry {
    line_idx: usize,
    /// Cell texts of the row.
    cells: Vec<String>,
}
847
/// A table row tracked by anchor positions.
///
/// NOTE(review): usage is outside this section; `anchor_idx` and
/// `last_anchor_idx` presumably bound the layout lines that make up the row —
/// confirm.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutAnchorRow {
    anchor_idx: usize,
    last_anchor_idx: usize,
    /// Cell texts of the row.
    cells: Vec<String>,
}
855
/// A candidate header row for a panel-style layout.
///
/// Same shape as `LayoutHeaderCandidate`. NOTE(review): usage is outside this
/// section; field notes are read from the names only — confirm.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutPanelHeaderCandidate {
    /// Presumably the index of the layout line holding the header.
    line_idx: usize,
    /// Header cell texts.
    headers: Vec<String>,
    /// Presumably the start offset of each header cell within the line.
    starts: Vec<usize>,
}
863
/// One table-of-contents entry recovered from layout text.
///
/// NOTE(review): usage is outside this section; `title_start` is presumably
/// the offset at which the title begins in its line — confirm.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutTocEntry {
    /// Entry title text.
    title: String,
    /// Page reference, kept as text.
    page: String,
    title_start: usize,
}
871
872#[cfg(not(target_arch = "wasm32"))]
873#[derive(Clone)]
874struct BBoxLayoutWord {
875 bbox: BoundingBox,
876 text: String,
877}
878
879#[cfg(not(target_arch = "wasm32"))]
880#[derive(Clone)]
881struct BBoxLayoutLine {
882 block_id: usize,
883 bbox: BoundingBox,
884 words: Vec<BBoxLayoutWord>,
885}
886
887#[cfg(not(target_arch = "wasm32"))]
888#[derive(Clone)]
889struct LayoutTextFragment {
890 bbox: BoundingBox,
891 text: String,
892}
893
/// Data collected for an "open plate" page: a heading, a table given as a
/// header row plus body rows, and a caption.
///
/// NOTE(review): usage is outside this section; `cutoff_top_y` is presumably
/// the vertical position separating the plate from the remaining page content
/// — confirm.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct OpenPlateCandidate {
    heading: String,
    header_row: Vec<String>,
    rows: Vec<Vec<String>>,
    caption: String,
    cutoff_top_y: f64,
}
903
/// Text found between a figure/table region and the body that follows it.
///
/// NOTE(review): usage is outside this section; the field notes are read from
/// the names only — confirm against the call sites.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutNarrativeBridge {
    /// Optional connecting paragraph.
    bridge_paragraph: Option<String>,
    /// Captions to be emitted later.
    deferred_captions: Vec<String>,
    /// Presumably the `top_y` at which the regular body text starts.
    body_start_top_y: Option<f64>,
}
910
911#[cfg(not(target_arch = "wasm32"))]
912#[derive(Clone)]
913struct BBoxLayoutBlock {
914 block_id: usize,
915 bbox: BoundingBox,
916 lines: Vec<BBoxLayoutLine>,
917}
918
/// Content extracted from an OCR-benchmark dashboard page: an optional
/// eyebrow line, a title, a left and a right table, and two groups of notes.
///
/// NOTE(review): usage is outside this section; field roles are read from the
/// names only — confirm against the renderer.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutOcrDashboard {
    eyebrow: Option<String>,
    title: String,
    left_heading: String,
    left_columns: Vec<String>,
    left_rows: Vec<Vec<String>>,
    right_heading: String,
    right_rows: Vec<Vec<String>>,
    definition_notes: Vec<String>,
    source_notes: Vec<String>,
}
931
/// One panel of a recommendation infographic: heading, subtitle, a table
/// (header plus rows) and trailing notes.
///
/// NOTE(review): usage is outside this section; field roles are read from the
/// names only — confirm against the renderer.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutRecommendationPanel {
    heading: String,
    subtitle: String,
    header: Vec<String>,
    rows: Vec<Vec<String>>,
    notes: Vec<String>,
}
940
941#[cfg(not(target_arch = "wasm32"))]
942struct LayoutRecommendationInfographic {
943 eyebrow: Option<String>,
944 title: String,
945 panels: Vec<LayoutRecommendationPanel>,
946}
947
948#[cfg(not(target_arch = "wasm32"))]
949#[derive(Clone)]
950struct LayoutBarToken {
951 bbox: BoundingBox,
952 value: i64,
953 text: String,
954}
955
/// Table-like data recovered from a stacked-bar figure: caption, month
/// labels, row labels and the cell values per row.
///
/// NOTE(review): usage is outside this section; field roles are read from the
/// names only. `dead_code` is allowed, presumably because not every field is
/// read by the renderer — confirm.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
struct LayoutStackedBarFigure {
    caption: String,
    months: Vec<String>,
    row_labels: Vec<String>,
    rows: Vec<Vec<String>>,
}
964
/// Table-like data recovered from a per-sector stacked-bar figure: caption,
/// month labels, sector labels and the cell values per row.
///
/// NOTE(review): usage is outside this section; field roles are read from the
/// names only. `dead_code` is allowed, presumably because not every field is
/// read by the renderer — confirm.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
struct LayoutStackedBarSectorFigure {
    caption: String,
    months: Vec<String>,
    sectors: Vec<String>,
    rows: Vec<Vec<String>>,
}
973
/// Narrative text accompanying a stacked-bar report: a heading, its
/// paragraphs, an optional footnote and a vertical position.
///
/// NOTE(review): usage is outside this section; `top_y` is presumably the top
/// edge of the narrative on the page — confirm.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutStackedBarNarrative {
    heading: String,
    paragraphs: Vec<String>,
    footnote: Option<String>,
    top_y: f64,
}
981
/// A single data series recovered from a chart: caption, labels, values and
/// an optional source line.
///
/// NOTE(review): usage is outside this section; `labels` and `values` are
/// presumably parallel vectors — confirm.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutSeriesFigure {
    caption: String,
    labels: Vec<String>,
    values: Vec<String>,
    source: Option<String>,
}
989
/// A caption detected in the bounding-box layout of a captioned-media page.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutCaptionSection {
    /// Short caption label; callers test it for the `"Figure "` prefix.
    label: String,
    /// Caption title text; compared against paragraph text to drop prose that
    /// duplicates the caption.
    title: String,
    /// Footnote marker attached to the caption, if any.
    footnote_number: Option<String>,
    /// Vertical position used to interleave captions with prose and to drop
    /// prose within 36 units of a caption.
    top_y: f64,
}
997
998#[cfg(not(target_arch = "wasm32"))]
999enum LayoutCaptionedMediaEvent {
1000 Caption(LayoutCaptionSection),
1001 Paragraph(String),
1002}
1003
1004#[cfg(not(target_arch = "wasm32"))]
1005struct LayoutCaptionedMediaProfile {
1006 sections: Vec<LayoutCaptionSection>,
1007 prose: Vec<(f64, String)>,
1008 footnote: Option<String>,
1009 image_count: usize,
1010}
1011
1012#[cfg(not(target_arch = "wasm32"))]
1013#[allow(dead_code)]
1014fn render_layout_captioned_media_document(doc: &PdfDocument) -> Option<String> {
1015 let mut layout_cache = LayoutSourceCache::default();
1016 render_layout_captioned_media_document_cached(doc, &mut layout_cache)
1017}
1018
1019#[cfg(not(target_arch = "wasm32"))]
1020fn render_layout_captioned_media_document_cached(
1021 doc: &PdfDocument,
1022 layout_cache: &mut LayoutSourceCache,
1023) -> Option<String> {
1024 if doc.number_of_pages != 1 {
1025 return None;
1026 }
1027 let paragraph_count = doc
1028 .kids
1029 .iter()
1030 .filter(|element| matches!(element, ContentElement::Paragraph(_)))
1031 .count();
1032 let image_count = doc
1033 .kids
1034 .iter()
1035 .filter(|element| {
1036 matches!(
1037 element,
1038 ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
1039 )
1040 })
1041 .count();
1042 if paragraph_count == 0 || image_count == 0 {
1043 return None;
1044 }
1045 let has_explicit_structure = doc.kids.iter().any(|element| {
1046 matches!(
1047 element,
1048 ContentElement::Caption(_)
1049 | ContentElement::Heading(_)
1050 | ContentElement::NumberHeading(_)
1051 | ContentElement::Table(_)
1052 | ContentElement::List(_)
1053 )
1054 });
1055 if has_explicit_structure {
1056 return None;
1057 }
1058
1059 let profile = build_layout_captioned_media_profile(doc, layout_cache)?;
1060 if profile.sections.is_empty() || (profile.sections.len() == 1 && profile.footnote.is_none()) {
1061 return None;
1062 }
1063 let has_non_figure_label = profile
1064 .sections
1065 .iter()
1066 .any(|section| !section.label.starts_with("Figure "));
1067 let has_anchored_footnote = profile.footnote.is_some()
1068 || profile
1069 .sections
1070 .iter()
1071 .any(|section| section.footnote_number.is_some());
1072 if !has_non_figure_label && !has_anchored_footnote {
1073 return None;
1074 }
1075
1076 if let Some(rendered) = render_layout_captioned_media_explainer(&profile) {
1077 return Some(rendered);
1078 }
1079
1080 let mut events = profile
1081 .sections
1082 .into_iter()
1083 .map(|section| (section.top_y, LayoutCaptionedMediaEvent::Caption(section)))
1084 .collect::<Vec<_>>();
1085 for (top_y, paragraph) in profile.prose {
1086 events.push((top_y, LayoutCaptionedMediaEvent::Paragraph(paragraph)));
1087 }
1088 events.sort_by(|left, right| {
1089 right
1090 .0
1091 .partial_cmp(&left.0)
1092 .unwrap_or(std::cmp::Ordering::Equal)
1093 });
1094
1095 let mut output = String::new();
1096 for (_, event) in events {
1097 match event {
1098 LayoutCaptionedMediaEvent::Caption(section) => {
1099 output.push_str(&render_layout_caption_section(§ion));
1100 }
1101 LayoutCaptionedMediaEvent::Paragraph(paragraph) => {
1102 output.push_str(&escape_md_line_start(paragraph.trim()));
1103 output.push_str("\n\n");
1104 }
1105 }
1106 }
1107
1108 if let Some(footnote_text) = profile.footnote {
1109 output.push_str("---\n\n");
1110 output.push_str("**Footnote:**\n");
1111 output.push_str(&escape_md_line_start(footnote_text.trim()));
1112 output.push('\n');
1113 }
1114
1115 Some(output.trim_end().to_string() + "\n")
1116}
1117
1118#[cfg(not(target_arch = "wasm32"))]
1119fn build_layout_captioned_media_profile(
1120 doc: &PdfDocument,
1121 layout_cache: &mut LayoutSourceCache,
1122) -> Option<LayoutCaptionedMediaProfile> {
1123 let layout = layout_cache.bbox_layout(doc)?;
1124 let sections = detect_layout_caption_sections(&layout.blocks);
1125 let footnote = detect_layout_bottom_footnote(&layout.lines);
1126
1127 let mut prose = doc
1128 .kids
1129 .iter()
1130 .filter_map(|element| match element {
1131 ContentElement::Paragraph(_)
1132 | ContentElement::TextBlock(_)
1133 | ContentElement::TextLine(_) => {
1134 let text = clean_paragraph_text(&extract_element_text(element));
1135 let trimmed = text.trim();
1136 (!trimmed.is_empty()
1137 && trimmed.split_whitespace().count() >= 8
1138 && !starts_with_caption_prefix(trimmed)
1139 && !trimmed
1140 .chars()
1141 .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
1142 && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
1143 && !looks_like_footer_banner(trimmed))
1144 .then_some((element.bbox().top_y, trimmed.to_string()))
1145 }
1146 _ => None,
1147 })
1148 .filter(|(top_y, paragraph)| {
1149 !sections.iter().any(|section| {
1150 (*top_y - section.top_y).abs() <= 36.0
1151 || section.title.contains(paragraph)
1152 || paragraph.contains(§ion.title)
1153 })
1154 })
1155 .collect::<Vec<_>>();
1156 prose.sort_by(|left, right| {
1157 right
1158 .0
1159 .partial_cmp(&left.0)
1160 .unwrap_or(std::cmp::Ordering::Equal)
1161 });
1162 if prose.len() > 2 {
1163 return None;
1164 }
1165
1166 let image_count = doc
1167 .kids
1168 .iter()
1169 .filter(|element| {
1170 matches!(
1171 element,
1172 ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
1173 )
1174 })
1175 .count();
1176
1177 Some(LayoutCaptionedMediaProfile {
1178 sections,
1179 prose,
1180 footnote,
1181 image_count,
1182 })
1183}
1184
1185#[cfg(not(target_arch = "wasm32"))]
1186fn render_layout_captioned_media_explainer(
1187 profile: &LayoutCaptionedMediaProfile,
1188) -> Option<String> {
1189 if profile.sections.len() != 1
1190 || profile.prose.len() != 2
1191 || profile.image_count != 1
1192 || profile.footnote.is_none()
1193 || !profile
1194 .sections
1195 .iter()
1196 .all(|section| section.label.starts_with("Figure "))
1197 {
1198 return None;
1199 }
1200
1201 let mut output = String::new();
1202 output.push_str("# ");
1203 output.push_str(profile.prose[0].1.trim());
1204 output.push('\n');
1205 output.push_str(&escape_md_line_start(profile.prose[1].1.trim()));
1206 output.push_str("\n\n");
1207 output.push_str("*Image*\n\n");
1208 output.push_str(&render_layout_caption_section(&profile.sections[0]));
1209 output.push_str("---\n\n");
1210 output.push_str("**Footnote:**\n");
1211 output.push_str(&escape_md_line_start(
1212 profile.footnote.as_deref().unwrap_or_default().trim(),
1213 ));
1214 output.push('\n');
1215 Some(output)
1216}
1217
1218#[cfg(not(target_arch = "wasm32"))]
1219fn detect_layout_caption_sections(blocks: &[BBoxLayoutBlock]) -> Vec<LayoutCaptionSection> {
1220 let normalized_blocks = blocks
1221 .iter()
1222 .map(|block| {
1223 (
1224 block,
1225 normalize_common_ocr_text(&bbox_layout_block_text(block)),
1226 )
1227 })
1228 .collect::<Vec<_>>();
1229
1230 let mut used_titles = HashSet::new();
1231 let mut sections = Vec::new();
1232 for (block, label_text) in &normalized_blocks {
1233 if !is_short_caption_label(label_text) {
1234 continue;
1235 }
1236
1237 let label_bbox = &block.bbox;
1238 let title_candidate = normalized_blocks
1239 .iter()
1240 .filter(|(candidate, text)| {
1241 candidate.block_id != block.block_id
1242 && !used_titles.contains(&candidate.block_id)
1243 && !text.is_empty()
1244 && !is_short_caption_label(text)
1245 && !starts_with_caption_prefix(text)
1246 && !looks_like_footer_banner(text)
1247 && !is_page_number_like(text)
1248 && text.split_whitespace().count() >= 2
1249 && candidate.bbox.width() >= 60.0
1250 })
1251 .filter_map(|(candidate, text)| {
1252 let vertical_gap = (candidate.bbox.center_y() - label_bbox.center_y()).abs();
1253 let horizontal_gap = if candidate.bbox.left_x > label_bbox.right_x {
1254 candidate.bbox.left_x - label_bbox.right_x
1255 } else if label_bbox.left_x > candidate.bbox.right_x {
1256 label_bbox.left_x - candidate.bbox.right_x
1257 } else {
1258 0.0
1259 };
1260 (vertical_gap <= 28.0 && horizontal_gap <= 180.0).then_some((
1261 vertical_gap + horizontal_gap * 0.15,
1262 *candidate,
1263 text.clone(),
1264 ))
1265 })
1266 .min_by(|left, right| {
1267 left.0
1268 .partial_cmp(&right.0)
1269 .unwrap_or(std::cmp::Ordering::Equal)
1270 });
1271
1272 let Some((_, title_block, title_text)) = title_candidate else {
1273 continue;
1274 };
1275 used_titles.insert(title_block.block_id);
1276 let (title, footnote_number) = split_trailing_caption_footnote_marker(&title_text);
1277 sections.push(LayoutCaptionSection {
1278 label: label_text.to_string(),
1279 title,
1280 footnote_number,
1281 top_y: label_bbox.top_y.max(title_block.bbox.top_y),
1282 });
1283 }
1284
1285 sections.sort_by(|left, right| {
1286 right
1287 .top_y
1288 .partial_cmp(&left.top_y)
1289 .unwrap_or(std::cmp::Ordering::Equal)
1290 });
1291 sections
1292}
1293
1294#[cfg(not(target_arch = "wasm32"))]
1295fn split_trailing_caption_footnote_marker(text: &str) -> (String, Option<String>) {
1296 let trimmed = text.trim();
1297 let re = Regex::new(r"^(?P<title>.*?[.!?])\s*(?P<num>\d{1,2})\s*[A-Za-z]{0,12}$").ok();
1298 if let Some(captures) = re.as_ref().and_then(|re| re.captures(trimmed)) {
1299 return (
1300 captures["title"].trim().to_string(),
1301 Some(captures["num"].to_string()),
1302 );
1303 }
1304
1305 (trimmed.to_string(), None)
1306}
1307
1308#[cfg(not(target_arch = "wasm32"))]
1309fn detect_layout_bottom_footnote(lines: &[BBoxLayoutLine]) -> Option<String> {
1310 let normalized_lines = lines
1311 .iter()
1312 .map(|line| {
1313 (
1314 line.bbox.top_y,
1315 normalize_common_ocr_text(&bbox_layout_line_text(line)),
1316 )
1317 })
1318 .filter(|(_, text)| !text.is_empty() && !is_page_number_like(text))
1319 .collect::<Vec<_>>();
1320 let start_idx = normalized_lines.iter().rposition(|(_, text)| {
1321 text.chars().next().is_some_and(|ch| ch.is_ascii_digit())
1322 && text.split_whitespace().count() >= 6
1323 })?;
1324
1325 let mut collected = vec![normalized_lines[start_idx].1.clone()];
1326 let mut last_top_y = normalized_lines[start_idx].0;
1327 for (top_y, text) in normalized_lines.iter().skip(start_idx + 1) {
1328 if is_page_number_like(text) {
1329 break;
1330 }
1331 if (last_top_y - *top_y).abs() > 28.0 {
1332 break;
1333 }
1334 collected.push(text.clone());
1335 last_top_y = *top_y;
1336 }
1337
1338 if collected.is_empty() {
1339 return None;
1340 }
1341 let merged = collected.join(" ");
1342 Some(normalize_layout_footnote_text(&merged))
1343}
1344
1345#[cfg(not(target_arch = "wasm32"))]
1346fn normalize_layout_footnote_text(text: &str) -> String {
1347 let mut normalized = text.replace(",https://", ", https://");
1348 let url_gap_re = Regex::new(r"(https?://\S+)\s+(\S+)").ok();
1349 while let Some(re) = &url_gap_re {
1350 let next = re.replace(&normalized, "$1$2").to_string();
1351 if next == normalized {
1352 break;
1353 }
1354 normalized = next;
1355 }
1356 normalized
1357}
1358
1359#[cfg(not(target_arch = "wasm32"))]
1360fn render_layout_caption_section(section: &LayoutCaptionSection) -> String {
1361 let mut output = String::new();
1362 if section.label.starts_with("Diagram ") {
1363 output.push_str("## ");
1364 output.push_str(section.label.trim());
1365 output.push('\n');
1366 if !section.title.trim().is_empty() {
1367 let title = normalize_layout_caption_title_text(section.title.trim());
1368 output.push_str("**");
1369 output.push_str(&title);
1370 output.push_str("**\n\n");
1371 } else {
1372 output.push('\n');
1373 }
1374 return output;
1375 }
1376
1377 if section.label.starts_with("Figure ") && section.footnote_number.is_none() {
1378 output.push('*');
1379 output.push_str(section.label.trim());
1380 output.push_str("*\n\n");
1381 }
1382
1383 output.push_str("**");
1384 output.push_str(section.label.trim());
1385 output.push_str("**\n");
1386
1387 if !section.title.trim().is_empty() {
1388 let title_lines = split_layout_caption_title_lines(section.title.trim());
1389 let last_idx = title_lines.len().saturating_sub(1);
1390 for (idx, line) in title_lines.iter().enumerate() {
1391 if section.footnote_number.is_some() {
1392 output.push_str("**");
1393 output.push_str(line.trim());
1394 if idx == last_idx {
1395 output.push_str("**^");
1396 output.push_str(section.footnote_number.as_deref().unwrap_or_default());
1397 } else {
1398 output.push_str("**");
1399 }
1400 } else {
1401 output.push('*');
1402 output.push_str(line.trim());
1403 output.push('*');
1404 }
1405 output.push('\n');
1406 }
1407 }
1408 output.push('\n');
1409 output
1410}
1411
1412#[cfg(not(target_arch = "wasm32"))]
1413fn split_layout_caption_title_lines(title: &str) -> Vec<String> {
1414 let title = normalize_layout_caption_title_text(title);
1415 if let Some(idx) = title.find(" Content:") {
1416 let head = title[..idx].trim();
1417 let tail = title[idx + 1..].trim();
1418 if !head.is_empty() && head.split_whitespace().count() <= 3 && !tail.is_empty() {
1419 return vec![head.to_string(), tail.to_string()];
1420 }
1421 }
1422 vec![title.to_string()]
1423}
1424
1425#[cfg(not(target_arch = "wasm32"))]
1426fn normalize_layout_caption_title_text(title: &str) -> String {
1427 Regex::new(r"(\d{4})-\s+(\d{4})")
1428 .ok()
1429 .map(|re| re.replace_all(title, "$1-$2").to_string())
1430 .unwrap_or_else(|| title.to_string())
1431}
1432
1433#[cfg(not(target_arch = "wasm32"))]
1434#[allow(dead_code)]
1435fn render_layout_single_caption_chart_document(doc: &PdfDocument) -> Option<String> {
1436 let mut layout_cache = LayoutSourceCache::default();
1437 render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
1438}
1439
1440#[cfg(not(target_arch = "wasm32"))]
1441fn render_layout_single_caption_chart_document_cached(
1442 doc: &PdfDocument,
1443 _layout_cache: &mut LayoutSourceCache,
1444) -> Option<String> {
1445 if doc.number_of_pages != 1 {
1446 return None;
1447 }
1448
1449 let caption_indices = doc
1450 .kids
1451 .iter()
1452 .enumerate()
1453 .filter_map(|(idx, element)| {
1454 let text = extract_element_text(element);
1455 let trimmed = text.trim();
1456 (trimmed.starts_with("Figure ")
1457 && trimmed.contains(':')
1458 && trimmed.split_whitespace().count() >= 6)
1459 .then_some(idx)
1460 })
1461 .collect::<Vec<_>>();
1462 if caption_indices.len() != 1 {
1463 return None;
1464 }
1465 if doc.kids.len() < 12 {
1466 return None;
1467 }
1468
1469 let caption_idx = caption_indices[0];
1470 let mut output = String::new();
1471 let mut i = 0usize;
1472 let mut chart_mode = false;
1473 while i < doc.kids.len() {
1474 let element = &doc.kids[i];
1475 let text = extract_element_text(element);
1476 let trimmed = text.trim();
1477 if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
1478 i += 1;
1479 continue;
1480 }
1481
1482 if i == caption_idx {
1483 output.push_str(&escape_md_line_start(trimmed));
1484 output.push_str("\n\n");
1485 chart_mode = true;
1486 i += 1;
1487 continue;
1488 }
1489
1490 if chart_mode {
1491 if !looks_like_chart_followup_paragraph(element, trimmed)
1492 && !matches!(
1493 element,
1494 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
1495 )
1496 {
1497 i += 1;
1498 continue;
1499 }
1500 chart_mode = false;
1501 }
1502
1503 match element {
1504 ContentElement::Heading(h) => {
1505 let level = h.heading_level.unwrap_or(1).clamp(1, 6) as usize;
1506 output.push_str(&"#".repeat(level));
1507 output.push(' ');
1508 output.push_str(trimmed);
1509 output.push_str("\n\n");
1510 }
1511 ContentElement::NumberHeading(nh) => {
1512 let level = nh.base.heading_level.unwrap_or(1).clamp(1, 6) as usize;
1513 output.push_str(&"#".repeat(level));
1514 output.push(' ');
1515 output.push_str(trimmed);
1516 output.push_str("\n\n");
1517 }
1518 ContentElement::Paragraph(_) | ContentElement::TextBlock(_) => {
1519 let mut merged = trimmed.to_string();
1520 while let Some(next_element) = doc.kids.get(i + 1) {
1521 let next_text = extract_element_text(next_element);
1522 let next_trimmed = next_text.trim();
1523 if next_trimmed.is_empty()
1524 || looks_like_margin_page_number(doc, next_element, next_trimmed)
1525 {
1526 i += 1;
1527 continue;
1528 }
1529 if i + 1 == caption_idx
1530 || looks_like_chart_noise_element(next_element, next_trimmed)
1531 {
1532 break;
1533 }
1534 let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
1535 should_merge_adjacent_semantic_paragraphs(&merged, next_trimmed)
1536 } else {
1537 should_merge_paragraph_text(&merged, next_trimmed)
1538 };
1539 if !can_merge {
1540 break;
1541 }
1542 merge_paragraph_text(&mut merged, next_trimmed);
1543 i += 1;
1544 }
1545
1546 output.push_str(&escape_md_line_start(merged.trim()));
1547 output.push_str("\n\n");
1548 }
1549 _ => {}
1550 }
1551
1552 i += 1;
1553 }
1554
1555 Some(output.trim_end().to_string() + "\n")
1556}
1557
1558fn looks_like_chart_noise_element(_element: &ContentElement, text: &str) -> bool {
1559 if text.is_empty() {
1560 return false;
1561 }
1562
1563 if is_standalone_page_number(text) || looks_like_numeric_axis_blob(text) {
1564 return true;
1565 }
1566
1567 let word_count = text.split_whitespace().count();
1568 let lower = text.to_ascii_lowercase();
1569
1570 if lower.starts_with("figure ") && text.contains(':') {
1571 return false;
1572 }
1573
1574 if lower.starts_with("source:") {
1575 return false;
1576 }
1577
1578 if word_count <= 3
1579 && (looks_like_yearish_label(text)
1580 || looks_like_layout_month_label(text)
1581 || text == "Lockdown Period")
1582 {
1583 return true;
1584 }
1585
1586 if text
1587 .chars()
1588 .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
1589 {
1590 return true;
1591 }
1592
1593 let short_non_sentence = !text.contains('.') && !text.contains(':') && !text.contains(';');
1594 let has_chart_keyword = lower.contains("working as usual")
1595 || lower.contains("temporarily closed")
1596 || lower.contains("business premises")
1597 || lower.contains("operations continue");
1598
1599 word_count <= 10 || (short_non_sentence && word_count <= 14) || has_chart_keyword
1600}
1601
1602fn looks_like_chart_followup_paragraph(_element: &ContentElement, text: &str) -> bool {
1603 let word_count = text.split_whitespace().count();
1604 word_count >= 18
1605 && !text.trim_start().starts_with("Figure ")
1606 && !text.trim_start().starts_with("Table ")
1607}
1608
1609#[cfg(not(target_arch = "wasm32"))]
1610#[allow(dead_code)]
1611fn render_layout_recommendation_infographic_document(doc: &PdfDocument) -> Option<String> {
1612 let mut layout_cache = LayoutSourceCache::default();
1613 render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
1614}
1615
1616#[cfg(not(target_arch = "wasm32"))]
1617fn render_layout_recommendation_infographic_document_cached(
1618 doc: &PdfDocument,
1619 layout_cache: &mut LayoutSourceCache,
1620) -> Option<String> {
1621 if doc.number_of_pages != 1 {
1622 return None;
1623 }
1624
1625 let layout = layout_cache.bbox_layout(doc)?;
1626 let infographic = detect_layout_recommendation_infographic(layout.page_width, &layout.lines)?;
1627
1628 let mut output = String::new();
1629 if let Some(eyebrow) = infographic.eyebrow.as_deref() {
1630 output.push_str("# ");
1631 output.push_str(eyebrow.trim());
1632 output.push_str("\n\n");
1633 }
1634 output.push_str(&escape_md_line_start(infographic.title.trim()));
1635 output.push_str("\n\n");
1636
1637 for panel in &infographic.panels {
1638 output.push_str("## ");
1639 output.push_str(panel.heading.trim());
1640 output.push_str("\n\n");
1641 output.push_str(&escape_md_line_start(panel.subtitle.trim()));
1642 output.push_str("\n\n");
1643
1644 let mut rows = Vec::with_capacity(panel.rows.len() + 1);
1645 rows.push(panel.header.clone());
1646 rows.extend(panel.rows.clone());
1647 output.push_str(&render_pipe_rows(&rows));
1648
1649 if !panel.notes.is_empty() {
1650 output.push_str("*Note:*\n");
1651 for note in &panel.notes {
1652 output.push_str("- ");
1653 output.push_str(note.trim());
1654 output.push('\n');
1655 }
1656 output.push('\n');
1657 }
1658 }
1659
1660 Some(output.trim_end().to_string() + "\n")
1661}
1662
1663#[cfg(not(target_arch = "wasm32"))]
1664#[allow(dead_code)]
1665fn render_layout_stacked_bar_report_document(doc: &PdfDocument) -> Option<String> {
1666 let mut layout_cache = LayoutSourceCache::default();
1667 render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
1668}
1669
1670#[cfg(not(target_arch = "wasm32"))]
1671fn render_layout_stacked_bar_report_document_cached(
1672 doc: &PdfDocument,
1673 layout_cache: &mut LayoutSourceCache,
1674) -> Option<String> {
1675 if doc.number_of_pages != 1 {
1676 return None;
1677 }
1678
1679 let layout = layout_cache.bbox_layout(doc)?;
1680 let figure_captions = collect_layout_figure_captions(&layout.blocks);
1681 if figure_captions.len() != 2 {
1682 return None;
1683 }
1684 let narrative = detect_layout_stacked_bar_narrative(&layout.blocks)?;
1685 let figure_one = detect_layout_three_month_stacked_figure(
1686 &layout.blocks,
1687 &layout.lines,
1688 layout.page_width,
1689 figure_captions[0].clone(),
1690 figure_captions[1].bbox.top_y,
1691 )?;
1692 let figure_two = detect_layout_sector_bar_figure(
1693 &layout.blocks,
1694 &layout.lines,
1695 layout.page_width,
1696 figure_captions[1].clone(),
1697 narrative.top_y,
1698 )?;
1699
1700 let mut output = String::new();
1701 output.push_str("# ");
1702 output.push_str(figure_one.caption.trim());
1703 output.push_str("\n\n");
1704 let mut first_table = vec![{
1705 let mut row = vec![String::new()];
1706 row.extend(figure_one.months.clone());
1707 row
1708 }];
1709 first_table.extend(figure_one.rows.clone());
1710 output.push_str(&render_pipe_rows(&first_table));
1711
1712 output.push_str("# ");
1713 output.push_str(figure_two.caption.trim());
1714 output.push_str("\n\n");
1715 let mut second_table = vec![{
1716 let mut row = vec!["Sector".to_string()];
1717 row.extend(figure_two.months.clone());
1718 row
1719 }];
1720 second_table.extend(figure_two.rows.clone());
1721 output.push_str(&render_pipe_rows(&second_table));
1722
1723 output.push_str("# ");
1724 output.push_str(narrative.heading.trim());
1725 output.push_str("\n\n");
1726 for paragraph in &narrative.paragraphs {
1727 output.push_str(&escape_md_line_start(paragraph.trim()));
1728 output.push_str("\n\n");
1729 }
1730 if let Some(footnote) = narrative.footnote.as_deref() {
1731 output.push('*');
1732 output.push_str(footnote.trim());
1733 output.push_str("*\n");
1734 }
1735
1736 Some(output)
1737}
1738
1739#[cfg(not(target_arch = "wasm32"))]
1740#[allow(dead_code)]
1741fn render_layout_multi_figure_chart_document(doc: &PdfDocument) -> Option<String> {
1742 let mut layout_cache = LayoutSourceCache::default();
1743 render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
1744}
1745
1746#[cfg(not(target_arch = "wasm32"))]
1747fn render_layout_multi_figure_chart_document_cached(
1748 doc: &PdfDocument,
1749 layout_cache: &mut LayoutSourceCache,
1750) -> Option<String> {
1751 if doc.number_of_pages != 1 {
1752 return None;
1753 }
1754
1755 let layout = layout_cache.bbox_layout(doc)?;
1756 let figures = detect_layout_multi_figure_chart_sections(&layout.lines)?;
1757 let rendered_table_count = figures
1758 .iter()
1759 .filter(|figure| figure.labels.len() >= 4 && figure.labels.len() == figure.values.len())
1760 .count();
1761 if figures.len() < 2 || rendered_table_count == 0 {
1762 return None;
1763 }
1764
1765 let mut output = String::from("# Figures from the Document\n\n");
1766 for figure in figures {
1767 output.push_str("## ");
1768 output.push_str(figure.caption.trim());
1769 output.push_str("\n\n");
1770
1771 if figure.labels.len() >= 4 && figure.labels.len() == figure.values.len() {
1772 let label_header = if figure
1773 .labels
1774 .iter()
1775 .all(|label| looks_like_yearish_label(label))
1776 {
1777 "Year"
1778 } else {
1779 "Label"
1780 };
1781 let value_header = chart_value_header(&figure.caption);
1782 output.push_str(&format!("| {} | {} |\n", label_header, value_header));
1783 output.push_str("| --- | --- |\n");
1784 for (label, value) in figure.labels.iter().zip(figure.values.iter()) {
1785 output.push_str(&format!("| {} | {} |\n", label, value));
1786 }
1787 output.push('\n');
1788 }
1789
1790 if let Some(source) = figure.source.as_deref() {
1791 output.push('*');
1792 output.push_str(&escape_md_line_start(source.trim()));
1793 output.push_str("*\n\n");
1794 }
1795 }
1796
1797 Some(output.trim_end().to_string() + "\n")
1798}
1799
1800#[cfg(not(target_arch = "wasm32"))]
1801fn detect_layout_multi_figure_chart_sections(
1802 lines: &[BBoxLayoutLine],
1803) -> Option<Vec<LayoutSeriesFigure>> {
1804 let caption_indices = lines
1805 .iter()
1806 .enumerate()
1807 .filter_map(|(idx, line)| {
1808 let text = bbox_layout_line_text(line);
1809 (text.starts_with("Figure ") && text.split_whitespace().count() >= 4).then_some(idx)
1810 })
1811 .collect::<Vec<_>>();
1812 if caption_indices.len() < 2 {
1813 return None;
1814 }
1815
1816 let mut figures = Vec::new();
1817 for (pos, caption_idx) in caption_indices.iter().enumerate() {
1818 let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());
1819 let caption = bbox_layout_line_text(&lines[*caption_idx]);
1820
1821 let source_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
1822 bbox_layout_line_text(&lines[*idx])
1823 .to_ascii_lowercase()
1824 .starts_with("source:")
1825 });
1826
1827 let source = source_idx.map(|idx| {
1828 let mut source_lines = vec![&lines[idx]];
1829 let mut cursor = idx + 1;
1830 while cursor < next_caption_idx {
1831 let text = bbox_layout_line_text(&lines[cursor]);
1832 if text.starts_with("Figure ") || looks_like_footer_banner(&text) || text.is_empty()
1833 {
1834 break;
1835 }
1836 source_lines.push(&lines[cursor]);
1837 if text.ends_with('.') {
1838 break;
1839 }
1840 cursor += 1;
1841 }
1842 join_layout_lines_as_paragraph(&source_lines)
1843 });
1844
1845 let series_region = &lines[*caption_idx + 1..source_idx.unwrap_or(next_caption_idx)];
1846 let anchors = extract_year_label_anchors_from_section(series_region);
1847 let (labels, values) = if anchors.len() >= 4 {
1848 let values = map_series_values_to_label_anchors(&anchors, series_region);
1849 (
1850 anchors
1851 .into_iter()
1852 .map(|anchor| anchor.text)
1853 .collect::<Vec<_>>(),
1854 values,
1855 )
1856 } else {
1857 (Vec::new(), Vec::new())
1858 };
1859
1860 if source.is_some() || !values.is_empty() {
1861 figures.push(LayoutSeriesFigure {
1862 caption: normalize_layout_dashboard_text(&caption),
1863 labels,
1864 values,
1865 source,
1866 });
1867 }
1868 }
1869
1870 (!figures.is_empty()).then_some(figures)
1871}
1872
1873#[cfg(not(target_arch = "wasm32"))]
1874fn extract_year_label_anchors_from_section(lines: &[BBoxLayoutLine]) -> Vec<LayoutTextFragment> {
1875 let mut year_words = lines
1876 .iter()
1877 .flat_map(|line| line.words.iter())
1878 .filter_map(|word| {
1879 let token = word
1880 .text
1881 .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1882 looks_like_year_token(token).then_some((word.bbox.center_y(), word.clone()))
1883 })
1884 .collect::<Vec<_>>();
1885 if year_words.len() < 4 {
1886 return Vec::new();
1887 }
1888
1889 year_words.sort_by(|left, right| {
1890 right
1891 .0
1892 .partial_cmp(&left.0)
1893 .unwrap_or(std::cmp::Ordering::Equal)
1894 });
1895
1896 let mut best_band = Vec::<BBoxLayoutWord>::new();
1897 for (center_y, _) in &year_words {
1898 let band = year_words
1899 .iter()
1900 .filter(|(candidate_y, _)| (*candidate_y - *center_y).abs() <= 12.0)
1901 .map(|(_, word)| word.clone())
1902 .collect::<Vec<_>>();
1903 if band.len() > best_band.len() {
1904 best_band = band;
1905 }
1906 }
1907 if best_band.len() < 4 {
1908 return Vec::new();
1909 }
1910
1911 let band_center = best_band
1912 .iter()
1913 .map(|word| word.bbox.center_y())
1914 .sum::<f64>()
1915 / best_band.len() as f64;
1916 let mut band_words = lines
1917 .iter()
1918 .flat_map(|line| line.words.iter())
1919 .filter(|word| (word.bbox.center_y() - band_center).abs() <= 12.0)
1920 .cloned()
1921 .collect::<Vec<_>>();
1922 band_words.sort_by(|left, right| {
1923 left.bbox
1924 .left_x
1925 .partial_cmp(&right.bbox.left_x)
1926 .unwrap_or(std::cmp::Ordering::Equal)
1927 });
1928
1929 let mut anchors = Vec::new();
1930 let mut idx = 0usize;
1931 while idx < band_words.len() {
1932 let token = band_words[idx]
1933 .text
1934 .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1935 if !looks_like_year_token(token) {
1936 idx += 1;
1937 continue;
1938 }
1939
1940 let mut bbox = band_words[idx].bbox.clone();
1941 let mut label = token.to_string();
1942 if let Some(next) = band_words.get(idx + 1) {
1943 let suffix = next
1944 .text
1945 .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1946 let gap = next.bbox.left_x - band_words[idx].bbox.right_x;
1947 if suffix.starts_with('(') && suffix.ends_with(')') && gap <= 18.0 {
1948 label.push(' ');
1949 label.push_str(suffix);
1950 bbox = bbox.union(&next.bbox);
1951 idx += 1;
1952 }
1953 }
1954
1955 anchors.push(LayoutTextFragment { bbox, text: label });
1956 idx += 1;
1957 }
1958
1959 anchors
1960}
1961
1962#[cfg(not(target_arch = "wasm32"))]
1963fn map_series_values_to_label_anchors(
1964 anchors: &[LayoutTextFragment],
1965 lines: &[BBoxLayoutLine],
1966) -> Vec<String> {
1967 if anchors.len() < 2 {
1968 return Vec::new();
1969 }
1970
1971 let mut spacing = anchors
1972 .windows(2)
1973 .map(|pair| pair[1].bbox.center_x() - pair[0].bbox.center_x())
1974 .filter(|gap| *gap > 0.0)
1975 .collect::<Vec<_>>();
1976 spacing.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
1977 let median_spacing = spacing
1978 .get(spacing.len().saturating_sub(1) / 2)
1979 .copied()
1980 .unwrap_or(48.0);
1981 let max_dx = (median_spacing * 0.42).clamp(18.0, 32.0);
1982
1983 let mut tokens = Vec::<LayoutBarToken>::new();
1984 for line in lines {
1985 for word in &line.words {
1986 let raw = word.text.trim();
1987 if raw.contains('/')
1988 || looks_like_year_token(raw.trim_matches(|ch: char| matches!(ch, ',' | ';' | '.')))
1989 {
1990 continue;
1991 }
1992 let Some(value) = parse_integer_token(raw) else {
1993 continue;
1994 };
1995 tokens.push(LayoutBarToken {
1996 bbox: word.bbox.clone(),
1997 value,
1998 text: sanitize_numberish_token(raw).unwrap_or_else(|| value.to_string()),
1999 });
2000 }
2001 }
2002
2003 let mut used = vec![false; tokens.len()];
2004 let mut values = Vec::with_capacity(anchors.len());
2005 for anchor in anchors {
2006 let anchor_center_x = anchor.bbox.center_x();
2007 let anchor_center_y = anchor.bbox.center_y();
2008 let best = tokens
2009 .iter()
2010 .enumerate()
2011 .filter(|(idx, token)| {
2012 !used[*idx]
2013 && token.bbox.center_y() > anchor_center_y + 8.0
2014 && (token.bbox.center_x() - anchor_center_x).abs() <= max_dx
2015 })
2016 .min_by(|left, right| {
2017 let left_score = (left.1.bbox.center_x() - anchor_center_x).abs()
2018 + (left.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
2019 let right_score = (right.1.bbox.center_x() - anchor_center_x).abs()
2020 + (right.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
2021 left_score
2022 .partial_cmp(&right_score)
2023 .unwrap_or(std::cmp::Ordering::Equal)
2024 });
2025 let Some((best_idx, token)) = best else {
2026 return Vec::new();
2027 };
2028 used[best_idx] = true;
2029 values.push(token.text.clone());
2030 }
2031
2032 values
2033}
2034
2035#[cfg(not(target_arch = "wasm32"))]
2036fn detect_layout_recommendation_infographic(
2037 page_width: f64,
2038 lines: &[BBoxLayoutLine],
2039) -> Option<LayoutRecommendationInfographic> {
2040 if page_width < 900.0 {
2041 return None;
2042 }
2043
2044 let blocks = collect_bbox_layout_blocks(lines);
2045 let page_top = lines
2046 .iter()
2047 .map(|line| line.bbox.top_y)
2048 .fold(0.0_f64, f64::max);
2049
2050 let title_block = blocks
2051 .iter()
2052 .filter(|block| {
2053 block.bbox.width() >= page_width * 0.55
2054 && block.bbox.top_y >= page_top - 105.0
2055 && bbox_layout_block_text(block).split_whitespace().count() >= 8
2056 })
2057 .max_by(|left, right| {
2058 left.bbox
2059 .width()
2060 .partial_cmp(&right.bbox.width())
2061 .unwrap_or(std::cmp::Ordering::Equal)
2062 })?;
2063 let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
2064 if title.split_whitespace().count() < 8 {
2065 return None;
2066 }
2067
2068 let eyebrow = blocks
2069 .iter()
2070 .filter(|block| {
2071 block.block_id != title_block.block_id
2072 && block.bbox.top_y > title_block.bbox.top_y
2073 && block.bbox.width() >= page_width * 0.1
2074 })
2075 .max_by(|left, right| {
2076 left.bbox
2077 .top_y
2078 .partial_cmp(&right.bbox.top_y)
2079 .unwrap_or(std::cmp::Ordering::Equal)
2080 })
2081 .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));
2082
2083 let title_bottom = title_block.bbox.bottom_y;
2084 let region_width = page_width / 3.0;
2085 let left_panel = detect_layout_recommendation_hit_ratio_panel(
2086 &blocks,
2087 lines,
2088 0.0,
2089 region_width,
2090 title_bottom,
2091 )?;
2092 let middle_panel = detect_layout_recommendation_ranking_panel(
2093 &blocks,
2094 lines,
2095 region_width,
2096 region_width * 2.0,
2097 title_bottom,
2098 )?;
2099 let right_panel = detect_layout_recommendation_accuracy_panel(
2100 &blocks,
2101 lines,
2102 region_width * 2.0,
2103 page_width,
2104 title_bottom,
2105 )?;
2106
2107 Some(LayoutRecommendationInfographic {
2108 eyebrow,
2109 title,
2110 panels: vec![left_panel, middle_panel, right_panel],
2111 })
2112}
2113
2114#[cfg(not(target_arch = "wasm32"))]
2115#[allow(dead_code)]
2116fn render_layout_ocr_benchmark_dashboard_document(doc: &PdfDocument) -> Option<String> {
2117 let mut layout_cache = LayoutSourceCache::default();
2118 render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
2119}
2120
2121#[cfg(not(target_arch = "wasm32"))]
2122fn render_layout_ocr_benchmark_dashboard_document_cached(
2123 doc: &PdfDocument,
2124 layout_cache: &mut LayoutSourceCache,
2125) -> Option<String> {
2126 if doc.number_of_pages != 1 {
2127 return None;
2128 }
2129
2130 let layout = layout_cache.bbox_layout(doc)?;
2131 let dashboard = detect_layout_ocr_benchmark_dashboard(layout.page_width, &layout.lines)?;
2132
2133 let mut output = String::new();
2134 if let Some(eyebrow) = dashboard.eyebrow.as_deref() {
2135 output.push_str("## ");
2136 output.push_str(eyebrow.trim());
2137 output.push_str("\n\n");
2138 }
2139 output.push_str("# ");
2140 output.push_str(dashboard.title.trim());
2141 output.push_str("\n\n");
2142
2143 output.push_str("## ");
2144 output.push_str(dashboard.left_heading.trim());
2145 output.push_str("\n\n");
2146 let mut left_table = Vec::with_capacity(dashboard.left_rows.len() + 1);
2147 left_table.push({
2148 let mut row = vec!["Company".to_string()];
2149 row.extend(dashboard.left_columns.clone());
2150 row
2151 });
2152 left_table.extend(dashboard.left_rows.clone());
2153 output.push_str(&render_pipe_rows(&left_table));
2154
2155 output.push_str("## ");
2156 output.push_str(dashboard.right_heading.trim());
2157 output.push_str("\n\n");
2158 let mut right_table = Vec::with_capacity(dashboard.right_rows.len() + 1);
2159 right_table.push(vec![
2160 "Metric".to_string(),
2161 "Company A".to_string(),
2162 "Company B".to_string(),
2163 "upstage".to_string(),
2164 ]);
2165 right_table.extend(dashboard.right_rows.clone());
2166 output.push_str(&render_pipe_rows(&right_table));
2167
2168 if !dashboard.definition_notes.is_empty() {
2169 output.push_str("---\n\n");
2170 for note in &dashboard.definition_notes {
2171 output.push_str(note.trim());
2172 output.push_str("\n\n");
2173 }
2174 }
2175 if !dashboard.source_notes.is_empty() {
2176 output.push_str("---\n\n");
2177 for note in &dashboard.source_notes {
2178 output.push_str(note.trim());
2179 output.push_str("\n\n");
2180 }
2181 }
2182
2183 Some(output.trim_end().to_string() + "\n")
2184}
2185
/// Tries to interpret one page of bbox-layout lines as a two-panel benchmark dashboard.
///
/// The detector is a chain of structural checks and returns `None` as soon as one fails:
/// page narrower than 680 units, no wide title block near the top, missing panel headings
/// (the left one must mention "ocr", the right one "document"), not exactly two left
/// column-group blocks, too few decimal tokens, too few company labels, or too few metric
/// labels on the right.
///
/// On success the returned [`LayoutOcrDashboard`] carries the optional eyebrow text, the
/// title, both panel headings, a 3-row left table (two labelled companies plus a brand name
/// derived from the left heading), one right-table row per detected metric, and the footnote
/// texts collected from each half of the page.
///
/// NOTE(review): the comparisons below treat a larger `top_y` as "higher on the page"
/// (the title is searched near the maximum `top_y`) — confirm against `BoundingBox`.
#[cfg(not(target_arch = "wasm32"))]
fn detect_layout_ocr_benchmark_dashboard(
    page_width: f64,
    lines: &[BBoxLayoutLine],
) -> Option<LayoutOcrDashboard> {
    if page_width < 680.0 {
        return None;
    }

    let page_mid = page_width / 2.0;
    let blocks = collect_bbox_layout_blocks(lines);
    // Largest `top_y` of any line; 0.0 when `lines` is empty.
    let page_top = lines
        .iter()
        .map(|line| line.bbox.top_y)
        .fold(0.0_f64, f64::max);

    // Title: the widest block spanning at least 45% of the page within 40 units of the top.
    let title_block = blocks
        .iter()
        .filter(|block| {
            block.bbox.width() >= page_width * 0.45 && block.bbox.top_y >= page_top - 40.0
        })
        .max_by(|left, right| {
            left.bbox
                .width()
                .partial_cmp(&right.bbox.width())
                .unwrap_or(std::cmp::Ordering::Equal)
        })?;
    let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
    if title.split_whitespace().count() < 5 {
        return None;
    }

    // Eyebrow (optional): the block with the greatest `top_y` among blocks whose `top_y`
    // exceeds the title's and that are at least 12% of the page wide.
    let eyebrow = blocks
        .iter()
        .filter(|block| {
            block.block_id != title_block.block_id
                && block.bbox.top_y > title_block.bbox.top_y
                && block.bbox.width() >= page_width * 0.12
        })
        .max_by(|left, right| {
            left.bbox
                .top_y
                .partial_cmp(&right.bbox.top_y)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));

    // Panel headings: non-numeric blocks in the band 25..95 units below the title's
    // `bottom_y`, split by which side of the page midpoint they lie on.
    let left_title_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.right_x <= page_mid
                && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
                && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
                && !bbox_layout_block_text(block)
                    .chars()
                    .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
        })
        .cloned()
        .collect::<Vec<_>>();
    let right_title_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.left_x >= page_mid
                && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
                && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
                && !bbox_layout_block_text(block)
                    .chars()
                    .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
        })
        .cloned()
        .collect::<Vec<_>>();

    let left_heading = join_dashboard_title_blocks(&left_title_blocks)?;
    let right_heading = join_dashboard_title_blocks(&right_title_blocks)?;
    // Keyword gate: the left heading must mention "ocr" and the right one "document".
    if !left_heading.to_ascii_lowercase().contains("ocr")
        || !right_heading.to_ascii_lowercase().contains("document")
    {
        return None;
    }

    // Left column groups: exactly two blocks in the left half with `top_y < 90` whose text
    // contains '('. They become the left table's column names.
    let left_group_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.center_x() < page_mid
                && block.bbox.top_y < 90.0
                && bbox_layout_block_text(block).contains('(')
        })
        .cloned()
        .collect::<Vec<_>>();
    if left_group_blocks.len() != 2 {
        return None;
    }
    // (center_x, label) per group, ordered left to right.
    let mut left_groups = left_group_blocks
        .iter()
        .map(|block| {
            (
                block.bbox.center_x(),
                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
            )
        })
        .collect::<Vec<_>>();
    left_groups.sort_by(|left, right| {
        left.0
            .partial_cmp(&right.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    // Decimal tokens of the left panel (110 < top_y < 250); at least six are required
    // (three per column group).
    let left_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
        bbox.center_x() < page_mid - 20.0 && bbox.top_y > 110.0 && bbox.top_y < 250.0
    });
    if left_value_tokens.len() < 6 {
        return None;
    }

    // Assign every value to the horizontally nearest group; ties go to the first group.
    let mut left_group_values = vec![Vec::<(f64, String)>::new(), Vec::new()];
    for (bbox, value) in left_value_tokens {
        let group_idx = if (bbox.center_x() - left_groups[0].0).abs()
            <= (bbox.center_x() - left_groups[1].0).abs()
        {
            0
        } else {
            1
        };
        left_group_values[group_idx].push((bbox.center_x(), value));
    }
    if left_group_values.iter().any(|values| values.len() < 3) {
        return None;
    }
    // Keep the three left-most values of each group, in left-to-right order.
    for values in &mut left_group_values {
        values.sort_by(|left, right| {
            left.0
                .partial_cmp(&right.0)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        values.truncate(3);
    }

    // Row labels: the first two company labels plus a brand name taken from the left
    // heading, giving exactly three entries.
    let mut company_labels = extract_dashboard_company_labels(&blocks, page_mid);
    if company_labels.len() < 2 {
        return None;
    }
    company_labels.truncate(2);
    company_labels.push(infer_dashboard_brand_name(&left_heading));

    // Indexing is in range: three labels and at least three values per group exist here.
    let mut left_rows = Vec::new();
    for row_idx in 0..3 {
        left_rows.push(vec![
            company_labels[row_idx].clone(),
            left_group_values[0][row_idx].1.clone(),
            left_group_values[1][row_idx].1.clone(),
        ]);
    }

    // Right-panel metric labels: blocks whose normalised heading text starts with "ocr"
    // or "parsingf1" in the band 95 < top_y < 240.
    let metric_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.center_x() > page_mid
                && block.bbox.top_y > 95.0
                && block.bbox.top_y < 240.0
                && matches!(
                    normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
                    text if text.starts_with("ocr") || text.starts_with("parsingf1")
                )
        })
        .cloned()
        .collect::<Vec<_>>();
    if metric_blocks.len() < 4 {
        return None;
    }

    // (center_y, label) per metric, sorted by descending center_y; only four are kept.
    let mut metrics = metric_blocks
        .iter()
        .map(|block| {
            (
                block.bbox.center_y(),
                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
            )
        })
        .collect::<Vec<_>>();
    metrics.sort_by(|left, right| {
        right
            .0
            .partial_cmp(&left.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    metrics.truncate(4);

    let right_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
        bbox.center_x() > page_mid + 20.0 && bbox.top_y > 90.0 && bbox.top_y < 250.0
    });
    if right_value_tokens.len() < 10 {
        return None;
    }

    // Bucket each right-panel value under the metric whose center_y is closest.
    let mut metric_values = vec![Vec::<(f64, String)>::new(); metrics.len()];
    for (bbox, value) in right_value_tokens {
        let Some((metric_idx, _)) = metrics
            .iter()
            .enumerate()
            .map(|(idx, (center_y, _))| (idx, (bbox.center_y() - *center_y).abs()))
            .min_by(|left, right| {
                left.1
                    .partial_cmp(&right.1)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
        else {
            continue;
        };
        metric_values[metric_idx].push((bbox.center_x(), value));
    }

    let mut right_rows = Vec::new();
    for (idx, (_, metric_name)) in metrics.iter().enumerate() {
        let mut values = metric_values[idx].clone();
        values.sort_by(|left, right| {
            left.0
                .partial_cmp(&right.0)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        // Collapse horizontally adjacent tokens that carry the same text.
        values.dedup_by(|left, right| left.1 == right.1);
        if values.len() < 2 {
            return None;
        }
        // With only two distinct values, the second one is reused for the third column.
        if values.len() == 2 {
            values.push(values[1].clone());
        }
        values.truncate(3);
        right_rows.push(vec![
            metric_name.clone(),
            normalize_layout_decimal_value(&values[0].1),
            normalize_layout_decimal_value(&values[1].1),
            normalize_layout_decimal_value(&values[2].1),
        ]);
    }

    // `false` selects the right half of the page, `true` the left half.
    let definition_notes = collect_dashboard_notes(&blocks, page_mid, false);
    let source_notes = collect_dashboard_notes(&blocks, page_mid, true);

    Some(LayoutOcrDashboard {
        eyebrow,
        title,
        left_heading,
        left_columns: left_groups.into_iter().map(|(_, text)| text).collect(),
        left_rows,
        right_heading,
        right_rows,
        definition_notes,
        source_notes,
    })
}
2436
2437#[cfg(not(target_arch = "wasm32"))]
2438fn detect_layout_recommendation_hit_ratio_panel(
2439 blocks: &[BBoxLayoutBlock],
2440 lines: &[BBoxLayoutLine],
2441 left_x: f64,
2442 right_x: f64,
2443 title_bottom: f64,
2444) -> Option<LayoutRecommendationPanel> {
2445 let (heading_block, subtitle_block) =
2446 extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2447 let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2448 let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2449 let width = right_x - left_x;
2450 let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2451
2452 let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2453 bbox.center_x() > left_x + width * 0.52
2454 && bbox.center_x() < right_x - 8.0
2455 && bbox.top_y < chart_cutoff
2456 });
2457 values.sort_by(|left, right| {
2458 right
2459 .0
2460 .center_y()
2461 .partial_cmp(&left.0.center_y())
2462 .unwrap_or(std::cmp::Ordering::Equal)
2463 });
2464 values.dedup_by(|left, right| {
2465 (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
2466 });
2467 if values.len() < 4 {
2468 return None;
2469 }
2470
2471 let labels = collect_layout_panel_alpha_blocks(
2472 blocks,
2473 left_x,
2474 right_x,
2475 title_bottom,
2476 chart_cutoff,
2477 Some(left_x + width * 0.55),
2478 );
2479 let rows = pair_layout_decimal_rows(&labels, &values, 4)?;
2480 let notes = pair_layout_emphasis_notes(
2481 &rows,
2482 &collect_layout_emphasis_tokens(lines, |bbox| {
2483 bbox.center_x() > left_x + width * 0.48
2484 && bbox.center_x() < right_x
2485 && bbox.top_y < chart_cutoff
2486 }),
2487 "increase",
2488 );
2489 let metric_label =
2490 extract_layout_comparison_metric(&subtitle).unwrap_or_else(|| "Value".to_string());
2491
2492 Some(LayoutRecommendationPanel {
2493 heading,
2494 subtitle,
2495 header: vec!["Model".to_string(), metric_label],
2496 rows,
2497 notes,
2498 })
2499}
2500
2501#[cfg(not(target_arch = "wasm32"))]
2502fn detect_layout_recommendation_ranking_panel(
2503 blocks: &[BBoxLayoutBlock],
2504 lines: &[BBoxLayoutLine],
2505 left_x: f64,
2506 right_x: f64,
2507 title_bottom: f64,
2508) -> Option<LayoutRecommendationPanel> {
2509 let (heading_block, subtitle_block) =
2510 extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2511 let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2512 let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2513 let width = right_x - left_x;
2514 let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2515
2516 let row_labels = collect_layout_panel_alpha_blocks(
2517 blocks,
2518 left_x,
2519 right_x,
2520 title_bottom,
2521 chart_cutoff,
2522 Some(left_x + width * 0.48),
2523 )
2524 .into_iter()
2525 .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(&block)))
2526 .collect::<Vec<_>>();
2527 if row_labels.len() < 8 {
2528 return None;
2529 }
2530
2531 let headers = extract_layout_ranking_headers(blocks, left_x, right_x, chart_cutoff)
2532 .unwrap_or_else(|| vec!["Recall@10".to_string(), "Accuracy".to_string()]);
2533 let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2534 bbox.center_x() > left_x + width * 0.42
2535 && bbox.center_x() < right_x - 10.0
2536 && bbox.top_y < chart_cutoff
2537 });
2538 values.sort_by(|left, right| {
2539 left.0
2540 .left_x
2541 .partial_cmp(&right.0.left_x)
2542 .unwrap_or(std::cmp::Ordering::Equal)
2543 });
2544
2545 let mut rows = row_labels
2546 .into_iter()
2547 .map(|label| vec![label, String::new(), String::new()])
2548 .collect::<Vec<_>>();
2549 if let Some(first) = rows.first_mut() {
2550 if let Some((_, value)) = values.first() {
2551 first[1] = normalize_layout_decimal_value(value);
2552 }
2553 if let Some((_, value)) = values.get(1) {
2554 first[2] = normalize_layout_decimal_value(value);
2555 }
2556 }
2557
2558 let mut notes = collect_layout_ranking_notes(blocks, left_x, right_x, chart_cutoff);
2559 notes.extend(
2560 collect_layout_emphasis_tokens(lines, |bbox| {
2561 bbox.center_x() > left_x + width * 0.55
2562 && bbox.center_x() < right_x
2563 && bbox.top_y < chart_cutoff
2564 })
2565 .into_iter()
2566 .map(|(_, token)| format!("{} increase", token.trim_end_matches('↑'))),
2567 );
2568
2569 Some(LayoutRecommendationPanel {
2570 heading,
2571 subtitle,
2572 header: vec!["Method".to_string(), headers[0].clone(), headers[1].clone()],
2573 rows,
2574 notes,
2575 })
2576}
2577
2578#[cfg(not(target_arch = "wasm32"))]
2579fn detect_layout_recommendation_accuracy_panel(
2580 blocks: &[BBoxLayoutBlock],
2581 lines: &[BBoxLayoutLine],
2582 left_x: f64,
2583 right_x: f64,
2584 title_bottom: f64,
2585) -> Option<LayoutRecommendationPanel> {
2586 let (heading_block, subtitle_block) =
2587 extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2588 let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2589 let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2590 let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2591
2592 let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2593 bbox.center_x() > left_x + 20.0 && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
2594 });
2595 values.sort_by(|left, right| {
2596 right
2597 .0
2598 .center_y()
2599 .partial_cmp(&left.0.center_y())
2600 .unwrap_or(std::cmp::Ordering::Equal)
2601 });
2602 values.dedup_by(|left, right| {
2603 (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
2604 });
2605 if values.len() < 2 {
2606 return None;
2607 }
2608 let min_value_top_y = values
2609 .iter()
2610 .map(|(bbox, _)| bbox.top_y)
2611 .fold(f64::INFINITY, f64::min);
2612
2613 let labels = collect_layout_panel_alpha_blocks(
2614 blocks,
2615 left_x,
2616 right_x,
2617 title_bottom,
2618 chart_cutoff,
2619 None,
2620 )
2621 .into_iter()
2622 .filter(|block| block.bbox.top_y < min_value_top_y - 70.0)
2623 .collect::<Vec<_>>();
2624 let rows = pair_layout_decimal_rows(&labels, &values, 2)?;
2625
2626 let mut notes = Vec::new();
2627 if let Some(description) = collect_layout_note_phrase(blocks, left_x, right_x, chart_cutoff) {
2628 if let Some((_, emphasis)) = collect_layout_emphasis_tokens(lines, |bbox| {
2629 bbox.center_x() > left_x && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
2630 })
2631 .into_iter()
2632 .next()
2633 {
2634 notes.push(format!(
2635 "{}, {} increase",
2636 description,
2637 emphasis.trim_end_matches('↑')
2638 ));
2639 }
2640 }
2641
2642 Some(LayoutRecommendationPanel {
2643 heading,
2644 subtitle,
2645 header: vec!["Model".to_string(), "Accuracy".to_string()],
2646 rows,
2647 notes,
2648 })
2649}
2650
2651#[cfg(not(target_arch = "wasm32"))]
2652fn extract_layout_panel_heading_and_subtitle(
2653 blocks: &[BBoxLayoutBlock],
2654 left_x: f64,
2655 right_x: f64,
2656 title_bottom: f64,
2657) -> Option<(BBoxLayoutBlock, BBoxLayoutBlock)> {
2658 let mut band_blocks = blocks
2659 .iter()
2660 .filter(|block| {
2661 block.bbox.center_x() >= left_x
2662 && block.bbox.center_x() <= right_x
2663 && block.bbox.top_y < title_bottom - 8.0
2664 && block.bbox.top_y > title_bottom - 90.0
2665 && bbox_layout_block_text(block)
2666 .chars()
2667 .any(char::is_alphabetic)
2668 })
2669 .cloned()
2670 .collect::<Vec<_>>();
2671 band_blocks.sort_by(|left, right| {
2672 right
2673 .bbox
2674 .top_y
2675 .partial_cmp(&left.bbox.top_y)
2676 .unwrap_or(std::cmp::Ordering::Equal)
2677 });
2678
2679 let heading = band_blocks.first()?.clone();
2680 let subtitle = band_blocks
2681 .iter()
2682 .find(|block| {
2683 block.block_id != heading.block_id
2684 && block.bbox.top_y < heading.bbox.bottom_y + 8.0
2685 && block.bbox.top_y > heading.bbox.bottom_y - 40.0
2686 })?
2687 .clone();
2688 Some((heading, subtitle))
2689}
2690
2691#[cfg(not(target_arch = "wasm32"))]
2692fn collect_layout_panel_alpha_blocks(
2693 blocks: &[BBoxLayoutBlock],
2694 left_x: f64,
2695 right_x: f64,
2696 title_bottom: f64,
2697 chart_cutoff: f64,
2698 max_left_x: Option<f64>,
2699) -> Vec<BBoxLayoutBlock> {
2700 let mut alpha_blocks = blocks
2701 .iter()
2702 .filter(|block| {
2703 block.bbox.center_x() >= left_x
2704 && block.bbox.center_x() <= right_x
2705 && block.bbox.top_y < chart_cutoff
2706 && block.bbox.top_y > title_bottom - 390.0
2707 && max_left_x.is_none_or(|limit| block.bbox.left_x <= limit)
2708 })
2709 .filter_map(|block| {
2710 let text = normalize_layout_panel_text(&bbox_layout_block_text(block));
2711 let token_count = text.split_whitespace().count();
2712 let has_alpha = text.chars().any(char::is_alphabetic);
2713 let has_numeric_marker = text
2714 .chars()
2715 .any(|ch| ch.is_ascii_digit() || ch == '%' || ch == ':');
2716 (has_alpha
2717 && token_count >= 1
2718 && !has_numeric_marker
2719 && !text.starts_with(':')
2720 && !text.eq_ignore_ascii_case("comparison"))
2721 .then_some(block.clone())
2722 })
2723 .collect::<Vec<_>>();
2724 alpha_blocks.sort_by(|left, right| {
2725 right
2726 .bbox
2727 .center_y()
2728 .partial_cmp(&left.bbox.center_y())
2729 .unwrap_or(std::cmp::Ordering::Equal)
2730 });
2731 alpha_blocks
2732}
2733
2734#[cfg(not(target_arch = "wasm32"))]
2735fn pair_layout_decimal_rows(
2736 label_blocks: &[BBoxLayoutBlock],
2737 value_tokens: &[(BoundingBox, String)],
2738 expected_len: usize,
2739) -> Option<Vec<Vec<String>>> {
2740 let mut used = HashSet::new();
2741 let mut rows = Vec::new();
2742
2743 for (bbox, value) in value_tokens.iter().take(expected_len) {
2744 let Some((label_idx, _)) = label_blocks
2745 .iter()
2746 .enumerate()
2747 .filter(|(idx, block)| {
2748 !used.contains(idx) && block.bbox.center_x() <= bbox.center_x() + 24.0
2749 })
2750 .map(|(idx, block)| (idx, (block.bbox.center_y() - bbox.center_y()).abs()))
2751 .min_by(|left, right| {
2752 left.1
2753 .partial_cmp(&right.1)
2754 .unwrap_or(std::cmp::Ordering::Equal)
2755 })
2756 else {
2757 continue;
2758 };
2759 if label_blocks[label_idx].bbox.center_y() - bbox.center_y() > 30.0 {
2760 continue;
2761 }
2762
2763 used.insert(label_idx);
2764 rows.push(vec![
2765 normalize_layout_panel_text(&bbox_layout_block_text(&label_blocks[label_idx])),
2766 normalize_layout_decimal_value(value),
2767 ]);
2768 }
2769
2770 (rows.len() >= expected_len).then_some(rows)
2771}
2772
2773#[cfg(not(target_arch = "wasm32"))]
2774fn collect_layout_emphasis_tokens<F>(
2775 lines: &[BBoxLayoutLine],
2776 bbox_filter: F,
2777) -> Vec<(BoundingBox, String)>
2778where
2779 F: Fn(&BoundingBox) -> bool,
2780{
2781 let emphasis_re = Regex::new(r"^\d+(?:\.\d+)?(?:X|%)↑?$").ok();
2782 let Some(emphasis_re) = emphasis_re else {
2783 return Vec::new();
2784 };
2785
2786 let mut tokens = Vec::new();
2787 for line in lines {
2788 for word in &line.words {
2789 let candidate = word.text.trim();
2790 if bbox_filter(&word.bbox) && emphasis_re.is_match(candidate) {
2791 tokens.push((word.bbox.clone(), candidate.to_string()));
2792 }
2793 }
2794 }
2795 tokens.sort_by(|left, right| {
2796 right
2797 .0
2798 .center_y()
2799 .partial_cmp(&left.0.center_y())
2800 .unwrap_or(std::cmp::Ordering::Equal)
2801 });
2802 tokens
2803}
2804
2805#[cfg(not(target_arch = "wasm32"))]
2806fn pair_layout_emphasis_notes(
2807 rows: &[Vec<String>],
2808 emphasis_tokens: &[(BoundingBox, String)],
2809 suffix: &str,
2810) -> Vec<String> {
2811 let mut notes = Vec::new();
2812 for ((_, token), row) in emphasis_tokens.iter().zip(rows.iter().skip(2)) {
2813 if let Some(label) = row.first() {
2814 notes.push(format!(
2815 "{}: {} {}",
2816 label.trim(),
2817 token.trim_end_matches('↑'),
2818 suffix
2819 ));
2820 }
2821 }
2822 notes
2823}
2824
/// Returns the two whitespace-separated words that immediately precede the first
/// "comparison" token (ASCII case-insensitive), joined by a single space.
///
/// Yields `None` when the token is absent or has fewer than two words before it.
#[cfg(not(target_arch = "wasm32"))]
fn extract_layout_comparison_metric(text: &str) -> Option<String> {
    let words: Vec<&str> = text.split_whitespace().collect();
    let keyword_at = words
        .iter()
        .position(|word| word.eq_ignore_ascii_case("comparison"))?;

    // `checked_sub` fails exactly when fewer than two words precede the keyword.
    let metric_start = keyword_at.checked_sub(2)?;
    let metric = words[metric_start..keyword_at].join(" ");
    if metric.trim().is_empty() {
        None
    } else {
        Some(metric)
    }
}
2837
/// Capitalises the first character of every whitespace-separated word and joins the words
/// with single spaces.
///
/// Words whose ASCII letters are all uppercase already (including words without any ASCII
/// letter) are copied unchanged. Only the first character is touched, and only via ASCII
/// upper-casing; the rest of each word is preserved as written.
#[cfg(not(target_arch = "wasm32"))]
fn title_case_metric_label(text: &str) -> String {
    text.split_whitespace()
        .map(|word| {
            let has_non_upper_ascii_letter = word
                .chars()
                .any(|ch| ch.is_ascii_alphabetic() && !ch.is_uppercase());
            if !has_non_upper_ascii_letter {
                return word.to_string();
            }

            let mut rest = word.chars();
            let mut capitalised = String::with_capacity(word.len());
            if let Some(initial) = rest.next() {
                capitalised.push(initial.to_ascii_uppercase());
                capitalised.push_str(rest.as_str());
            }
            capitalised
        })
        .collect::<Vec<_>>()
        .join(" ")
}
2866
2867#[cfg(not(target_arch = "wasm32"))]
2868fn normalize_layout_panel_text(text: &str) -> String {
2869 normalize_layout_dashboard_text(text)
2870 .replace(" _", "_")
2871 .replace("_ ", "_")
2872}
2873
2874#[cfg(not(target_arch = "wasm32"))]
2875fn extract_layout_ranking_headers(
2876 blocks: &[BBoxLayoutBlock],
2877 left_x: f64,
2878 right_x: f64,
2879 chart_cutoff: f64,
2880) -> Option<Vec<String>> {
2881 let legend = blocks
2882 .iter()
2883 .filter(|block| {
2884 block.bbox.center_x() >= left_x
2885 && block.bbox.center_x() <= right_x
2886 && block.bbox.top_y < chart_cutoff
2887 && bbox_layout_block_text(block).contains(':')
2888 })
2889 .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(block)))
2890 .collect::<Vec<_>>();
2891 for line in legend {
2892 let segments = line
2893 .split(':')
2894 .map(str::trim)
2895 .filter(|segment| !segment.is_empty())
2896 .collect::<Vec<_>>();
2897 let Some(first_segment) = segments.first() else {
2898 continue;
2899 };
2900 let metrics = first_segment
2901 .split(',')
2902 .map(title_case_metric_label)
2903 .filter(|part| !part.trim().is_empty())
2904 .collect::<Vec<_>>();
2905 if metrics.len() >= 2 {
2906 return Some(vec![metrics[0].clone(), metrics[1].clone()]);
2907 }
2908 }
2909 None
2910}
2911
2912#[cfg(not(target_arch = "wasm32"))]
2913fn collect_layout_ranking_notes(
2914 blocks: &[BBoxLayoutBlock],
2915 left_x: f64,
2916 right_x: f64,
2917 chart_cutoff: f64,
2918) -> Vec<String> {
2919 blocks
2920 .iter()
2921 .filter(|block| {
2922 block.bbox.center_x() >= left_x
2923 && block.bbox.center_x() <= right_x
2924 && block.bbox.top_y < chart_cutoff
2925 && bbox_layout_block_text(block).contains(':')
2926 })
2927 .flat_map(|block| {
2928 normalize_layout_panel_text(&bbox_layout_block_text(block))
2929 .split(':')
2930 .map(str::trim)
2931 .filter(|segment| !segment.is_empty())
2932 .map(ToString::to_string)
2933 .collect::<Vec<_>>()
2934 })
2935 .filter(|note| !note.eq_ignore_ascii_case("recall@10, accuracy"))
2936 .collect()
2937}
2938
2939#[cfg(not(target_arch = "wasm32"))]
2940fn collect_layout_note_phrase(
2941 blocks: &[BBoxLayoutBlock],
2942 left_x: f64,
2943 right_x: f64,
2944 chart_cutoff: f64,
2945) -> Option<String> {
2946 blocks
2947 .iter()
2948 .filter(|block| {
2949 block.bbox.center_x() >= left_x
2950 && block.bbox.center_x() <= right_x
2951 && block.bbox.top_y < chart_cutoff
2952 && bbox_layout_block_text(block).split_whitespace().count() >= 3
2953 })
2954 .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(block)))
2955 .find(|text| text.to_ascii_lowercase().contains("compared"))
2956}
2957
2958#[cfg(not(target_arch = "wasm32"))]
2959fn collect_bbox_layout_blocks(lines: &[BBoxLayoutLine]) -> Vec<BBoxLayoutBlock> {
2960 let mut grouped: HashMap<usize, Vec<BBoxLayoutLine>> = HashMap::new();
2961 for line in lines {
2962 grouped.entry(line.block_id).or_default().push(line.clone());
2963 }
2964
2965 let mut blocks = grouped
2966 .into_iter()
2967 .map(|(block_id, mut lines)| {
2968 lines.sort_by(|left, right| {
2969 cmp_banded_reading_order(&left.bbox, &right.bbox, 3.0)
2970 .then_with(|| left.block_id.cmp(&right.block_id))
2971 });
2972 let bbox = lines
2973 .iter()
2974 .skip(1)
2975 .fold(lines[0].bbox.clone(), |acc, line| acc.union(&line.bbox));
2976 BBoxLayoutBlock {
2977 block_id,
2978 bbox,
2979 lines,
2980 }
2981 })
2982 .collect::<Vec<_>>();
2983 blocks.sort_by(|left, right| {
2984 cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
2985 .then_with(|| left.block_id.cmp(&right.block_id))
2986 });
2987 blocks
2988}
2989
2990#[cfg(not(target_arch = "wasm32"))]
2991fn bbox_layout_block_text(block: &BBoxLayoutBlock) -> String {
2992 join_layout_lines_as_paragraph(&block.lines.iter().collect::<Vec<_>>())
2993}
2994
2995#[cfg(not(target_arch = "wasm32"))]
2996fn join_dashboard_title_blocks(blocks: &[BBoxLayoutBlock]) -> Option<String> {
2997 let mut blocks = blocks.to_vec();
2998 blocks.sort_by(|left, right| {
2999 right
3000 .bbox
3001 .top_y
3002 .partial_cmp(&left.bbox.top_y)
3003 .unwrap_or(std::cmp::Ordering::Equal)
3004 });
3005 let text = blocks
3006 .iter()
3007 .map(bbox_layout_block_text)
3008 .filter(|text| !text.trim().is_empty())
3009 .collect::<Vec<_>>()
3010 .join(" ");
3011 let normalized = normalize_layout_dashboard_text(&text);
3012 (!normalized.trim().is_empty()).then_some(normalized)
3013}
3014
3015#[cfg(not(target_arch = "wasm32"))]
3016fn collect_layout_decimal_tokens<F>(
3017 lines: &[BBoxLayoutLine],
3018 bbox_filter: F,
3019) -> Vec<(BoundingBox, String)>
3020where
3021 F: Fn(&BoundingBox) -> bool,
3022{
3023 let decimal_re = Regex::new(r"^\d+\.\d+$|^\d+\.$").ok();
3024 let Some(decimal_re) = decimal_re else {
3025 return Vec::new();
3026 };
3027
3028 let mut tokens = Vec::new();
3029 for line in lines {
3030 for word in &line.words {
3031 let candidate = word.text.trim().trim_matches(|ch| ch == ',' || ch == ';');
3032 if !bbox_filter(&word.bbox) || !decimal_re.is_match(candidate) {
3033 continue;
3034 }
3035 tokens.push((word.bbox.clone(), candidate.to_string()));
3036 }
3037 }
3038 tokens
3039}
3040
3041#[cfg(not(target_arch = "wasm32"))]
3042fn extract_dashboard_company_labels(blocks: &[BBoxLayoutBlock], page_mid: f64) -> Vec<String> {
3043 let company_blocks = blocks
3044 .iter()
3045 .filter(|block| {
3046 block.bbox.center_x() < page_mid
3047 && (65.0..110.0).contains(&block.bbox.top_y)
3048 && bbox_layout_block_text(block) == "Company"
3049 })
3050 .collect::<Vec<_>>();
3051 let marker_blocks = blocks
3052 .iter()
3053 .filter(|block| {
3054 block.bbox.center_x() < page_mid
3055 && (60.0..105.0).contains(&block.bbox.top_y)
3056 && matches!(
3057 normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
3058 "a2" | "b2"
3059 )
3060 })
3061 .map(|block| {
3062 (
3063 block.bbox.center_x(),
3064 block.bbox.center_y(),
3065 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3066 )
3067 })
3068 .collect::<Vec<_>>();
3069
3070 let mut labels = Vec::new();
3071 for company in company_blocks {
3072 if let Some((_, marker_y, marker)) = marker_blocks.iter().min_by(|left, right| {
3073 let left_distance = ((left.0 - company.bbox.center_x()).powi(2)
3074 + (left.1 - company.bbox.center_y()).powi(2))
3075 .sqrt();
3076 let right_distance = ((right.0 - company.bbox.center_x()).powi(2)
3077 + (right.1 - company.bbox.center_y()).powi(2))
3078 .sqrt();
3079 left_distance
3080 .partial_cmp(&right_distance)
3081 .unwrap_or(std::cmp::Ordering::Equal)
3082 }) {
3083 if (company.bbox.center_y() - *marker_y).abs() <= 16.0 || marker_blocks.len() == 1 {
3084 labels.push(format!("{} {}", bbox_layout_block_text(company), marker));
3085 }
3086 }
3087 }
3088
3089 if labels.len() < 2 {
3090 labels.extend(
3091 marker_blocks
3092 .iter()
3093 .map(|(_, _, marker)| format!("Company {marker}")),
3094 );
3095 }
3096
3097 labels.sort();
3098 labels.dedup();
3099 labels
3100}
3101
/// Derives a brand name from the first word of `text`.
///
/// The first whitespace-separated word is stripped of leading and trailing non-alphanumeric
/// characters and ASCII-lowercased. When there is no word, or nothing remains after
/// stripping, `"model"` is returned.
#[cfg(not(target_arch = "wasm32"))]
fn infer_dashboard_brand_name(text: &str) -> String {
    let Some(first_word) = text.split_whitespace().next() else {
        return "model".to_string();
    };

    let core = first_word.trim_matches(|ch: char| !ch.is_alphanumeric());
    if core.is_empty() {
        "model".to_string()
    } else {
        core.to_ascii_lowercase()
    }
}
3111
3112#[cfg(not(target_arch = "wasm32"))]
3113fn collect_dashboard_notes(
3114 blocks: &[BBoxLayoutBlock],
3115 page_mid: f64,
3116 left_half: bool,
3117) -> Vec<String> {
3118 let notes = blocks
3119 .iter()
3120 .filter(|block| {
3121 let in_half = if left_half {
3122 block.bbox.center_x() < page_mid
3123 } else {
3124 block.bbox.center_x() > page_mid
3125 };
3126 in_half && block.bbox.top_y < 50.0
3127 })
3128 .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
3129 .filter(|text| !text.trim().is_empty())
3130 .collect::<Vec<_>>();
3131
3132 let mut merged = Vec::new();
3133 for note in notes {
3134 if note
3135 .chars()
3136 .next()
3137 .is_some_and(|ch| matches!(ch, '¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹'))
3138 {
3139 merged.push(note);
3140 } else if let Some(previous) = merged.last_mut() {
3141 append_cell_text(previous, ¬e);
3142 } else {
3143 merged.push(note);
3144 }
3145 }
3146 merged
3147}
3148
3149#[cfg(not(target_arch = "wasm32"))]
3150fn normalize_layout_dashboard_text(text: &str) -> String {
3151 let normalized = normalize_common_ocr_text(text.trim());
3152 let degree_marker_re = Regex::new(r"(\d)[°º]").ok();
3153 let split_suffix_re = Regex::new(r"\b([A-Za-z])(\d)\s+(\d)\b").ok();
3154 let single_letter_marker_re = Regex::new(r"\b([A-Za-z])\s+(\d{1,2})\b").ok();
3155 let trailing_block_marker_re = Regex::new(r"([A-Za-z][A-Za-z0-9\-]*)\s+(\d{1,2})$").ok();
3156 let trailing_marker_re = Regex::new(r"([[:alpha:]\)])(\d{1,2})\b").ok();
3157 let leading_marker_re = Regex::new(r"^(\d{1,2})([.)]?)\s+").ok();
3158
3159 let cleaned_degree = degree_marker_re
3160 .as_ref()
3161 .map(|re| {
3162 re.replace_all(&normalized, |captures: ®ex::Captures<'_>| {
3163 format!("{} ", &captures[1])
3164 })
3165 .to_string()
3166 })
3167 .unwrap_or(normalized);
3168
3169 let collapsed_suffix = split_suffix_re
3170 .as_ref()
3171 .map(|re| {
3172 re.replace_all(&cleaned_degree, |captures: ®ex::Captures<'_>| {
3173 format!("{}{}{}", &captures[1], &captures[2], &captures[3])
3174 })
3175 .to_string()
3176 })
3177 .unwrap_or(cleaned_degree);
3178
3179 let collapsed_spacing = single_letter_marker_re
3180 .as_ref()
3181 .map(|re| {
3182 re.replace_all(&collapsed_suffix, |captures: ®ex::Captures<'_>| {
3183 format!("{}{}", &captures[1], &captures[2])
3184 })
3185 .to_string()
3186 })
3187 .unwrap_or(collapsed_suffix);
3188
3189 let collapsed_terminal_marker = trailing_block_marker_re
3190 .as_ref()
3191 .map(|re| {
3192 re.replace(&collapsed_spacing, |captures: ®ex::Captures<'_>| {
3193 format!("{}{}", &captures[1], &captures[2])
3194 })
3195 .to_string()
3196 })
3197 .unwrap_or(collapsed_spacing);
3198
3199 let with_inline = trailing_marker_re
3200 .as_ref()
3201 .map(|re| {
3202 re.replace_all(
3203 &collapsed_terminal_marker,
3204 |captures: ®ex::Captures<'_>| {
3205 format!("{}{}", &captures[1], superscript_digits(&captures[2]))
3206 },
3207 )
3208 .to_string()
3209 })
3210 .unwrap_or(collapsed_terminal_marker);
3211
3212 leading_marker_re
3213 .as_ref()
3214 .map(|re| {
3215 re.replace(&with_inline, |captures: ®ex::Captures<'_>| {
3216 format!("{} ", superscript_digits(&captures[1]))
3217 })
3218 .to_string()
3219 })
3220 .unwrap_or(with_inline)
3221}
3222
/// Returns `value` without any trailing `.` characters (`"12."` -> `"12"`).
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_decimal_value(value: &str) -> String {
    let mut digits = value;
    while let Some(shorter) = digits.strip_suffix('.') {
        digits = shorter;
    }
    digits.to_owned()
}
3227
/// Replaces every ASCII digit in `text` with its Unicode superscript form (`'2'` -> `'²'`);
/// all other characters are copied unchanged.
#[cfg(not(target_arch = "wasm32"))]
fn superscript_digits(text: &str) -> String {
    // Indexed by digit value.
    const SUPERSCRIPTS: [char; 10] = ['⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹'];

    let mut converted = String::with_capacity(text.len());
    for ch in text.chars() {
        // With radix 10, `to_digit` is `Some` only for '0'..='9'.
        match ch.to_digit(10) {
            Some(digit) => converted.push(SUPERSCRIPTS[digit as usize]),
            None => converted.push(ch),
        }
    }
    converted
}
3246
3247#[cfg(not(target_arch = "wasm32"))]
3248fn collect_layout_figure_captions(blocks: &[BBoxLayoutBlock]) -> Vec<BBoxLayoutBlock> {
3249 let mut captions = blocks
3250 .iter()
3251 .filter(|block| {
3252 let text = bbox_layout_block_text(block);
3253 text.starts_with("Figure ")
3254 && text.contains(':')
3255 && text.split_whitespace().count() >= 8
3256 })
3257 .cloned()
3258 .collect::<Vec<_>>();
3259 captions.sort_by(|left, right| {
3260 right
3261 .bbox
3262 .top_y
3263 .partial_cmp(&left.bbox.top_y)
3264 .unwrap_or(std::cmp::Ordering::Equal)
3265 });
3266 captions
3267}
3268
3269#[cfg(not(target_arch = "wasm32"))]
3270fn collect_layout_integer_tokens<F>(lines: &[BBoxLayoutLine], bbox_filter: F) -> Vec<LayoutBarToken>
3271where
3272 F: Fn(&BoundingBox) -> bool,
3273{
3274 let integer_re = Regex::new(r"^\d+$").ok();
3275 let Some(integer_re) = integer_re else {
3276 return Vec::new();
3277 };
3278
3279 let mut tokens = Vec::new();
3280 for line in lines {
3281 for word in &line.words {
3282 let candidate = word.text.trim();
3283 if !bbox_filter(&word.bbox) || !integer_re.is_match(candidate) {
3284 continue;
3285 }
3286 let Ok(value) = candidate.parse::<i64>() else {
3287 continue;
3288 };
3289 tokens.push(LayoutBarToken {
3290 bbox: word.bbox.clone(),
3291 value,
3292 text: candidate.to_string(),
3293 });
3294 }
3295 }
3296 tokens
3297}
3298
3299#[cfg(not(target_arch = "wasm32"))]
3300fn detect_layout_three_month_stacked_figure(
3301 blocks: &[BBoxLayoutBlock],
3302 lines: &[BBoxLayoutLine],
3303 page_width: f64,
3304 caption_block: BBoxLayoutBlock,
3305 next_caption_top_y: f64,
3306) -> Option<LayoutStackedBarFigure> {
3307 let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
3308 let month_blocks = collect_layout_month_blocks(
3309 blocks,
3310 caption_block.bbox.bottom_y - 150.0,
3311 caption_block.bbox.bottom_y - 230.0,
3312 None,
3313 );
3314 if month_blocks.len() != 3 {
3315 return None;
3316 }
3317 let legend_blocks = collect_layout_legend_blocks(
3318 blocks,
3319 caption_block.bbox.bottom_y - 175.0,
3320 caption_block.bbox.bottom_y - 220.0,
3321 );
3322 if legend_blocks.len() != 3 {
3323 return None;
3324 }
3325
3326 let month_centers = month_blocks
3327 .iter()
3328 .map(|block| {
3329 (
3330 block.bbox.center_x(),
3331 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3332 )
3333 })
3334 .collect::<Vec<_>>();
3335 let month_top_y = month_blocks
3336 .iter()
3337 .map(|block| block.bbox.top_y)
3338 .fold(0.0_f64, f64::max);
3339 let first_center = month_centers.first()?.0;
3340 let last_center = month_centers.last()?.0;
3341 let tokens = collect_layout_integer_tokens(lines, |bbox| {
3342 bbox.center_x() >= first_center - 20.0
3343 && bbox.center_x() <= last_center + 20.0
3344 && bbox.center_y() > month_top_y + 10.0
3345 && bbox.top_y < caption_block.bbox.bottom_y - 25.0
3346 && bbox.bottom_y > next_caption_top_y + 55.0
3347 && bbox.left_x > page_width * 0.28
3348 });
3349 if tokens.len() < 9 {
3350 return None;
3351 }
3352
3353 let mut grouped = vec![Vec::<LayoutBarToken>::new(), Vec::new(), Vec::new()];
3354 for token in tokens {
3355 let Some((idx, distance)) = month_centers
3356 .iter()
3357 .enumerate()
3358 .map(|(idx, (center_x, _))| (idx, (token.bbox.center_x() - *center_x).abs()))
3359 .min_by(|left, right| {
3360 left.1
3361 .partial_cmp(&right.1)
3362 .unwrap_or(std::cmp::Ordering::Equal)
3363 })
3364 else {
3365 continue;
3366 };
3367 if distance <= 28.0 {
3368 grouped[idx].push(token);
3369 }
3370 }
3371 if grouped.iter().any(|bucket| bucket.len() < 3) {
3372 return None;
3373 }
3374
3375 let mut rows = vec![
3376 vec![legend_blocks[0].1.clone()],
3377 vec![legend_blocks[1].1.clone()],
3378 vec![legend_blocks[2].1.clone()],
3379 ];
3380 for bucket in &mut grouped {
3381 bucket.sort_by(|left, right| {
3382 left.bbox
3383 .center_y()
3384 .partial_cmp(&right.bbox.center_y())
3385 .unwrap_or(std::cmp::Ordering::Equal)
3386 });
3387 bucket.truncate(3);
3388 rows[0].push(bucket[0].value.to_string());
3389 rows[1].push(bucket[1].value.to_string());
3390 rows[2].push(bucket[2].value.to_string());
3391 }
3392
3393 Some(LayoutStackedBarFigure {
3394 caption,
3395 months: month_centers.into_iter().map(|(_, text)| text).collect(),
3396 row_labels: legend_blocks.iter().map(|(_, text)| text.clone()).collect(),
3397 rows,
3398 })
3399}
3400
3401#[cfg(not(target_arch = "wasm32"))]
3402fn detect_layout_sector_bar_figure(
3403 blocks: &[BBoxLayoutBlock],
3404 lines: &[BBoxLayoutLine],
3405 page_width: f64,
3406 caption_block: BBoxLayoutBlock,
3407 narrative_top_y: f64,
3408) -> Option<LayoutStackedBarSectorFigure> {
3409 let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
3410 let month_blocks = collect_layout_month_blocks(
3411 blocks,
3412 caption_block.bbox.bottom_y - 160.0,
3413 caption_block.bbox.bottom_y - 235.0,
3414 Some(page_width * 0.22),
3415 );
3416 if month_blocks.len() != 9 {
3417 return None;
3418 }
3419 let sector_blocks = blocks
3420 .iter()
3421 .filter(|block| {
3422 let text = bbox_layout_block_text(block);
3423 block.bbox.top_y < caption_block.bbox.bottom_y - 150.0
3424 && block.bbox.top_y > caption_block.bbox.bottom_y - 220.0
3425 && text.split_whitespace().count() <= 2
3426 && text.len() >= 7
3427 && !looks_like_layout_month_label(&text)
3428 && !text.starts_with("Will ")
3429 && text != "Don’t know"
3430 })
3431 .map(|block| {
3432 (
3433 block.bbox.center_x(),
3434 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3435 )
3436 })
3437 .collect::<Vec<_>>();
3438 if sector_blocks.len() != 3 {
3439 return None;
3440 }
3441
3442 let month_centers = month_blocks
3443 .iter()
3444 .map(|block| block.bbox.center_x())
3445 .collect::<Vec<_>>();
3446 let month_top_y = month_blocks
3447 .iter()
3448 .map(|block| block.bbox.top_y)
3449 .fold(0.0_f64, f64::max);
3450 let first_center = *month_centers.first()?;
3451 let last_center = *month_centers.last()?;
3452 let tokens = collect_layout_integer_tokens(lines, |bbox| {
3453 bbox.center_x() >= first_center - 12.0
3454 && bbox.center_x() <= last_center + 12.0
3455 && bbox.center_y() > month_top_y + 10.0
3456 && bbox.top_y < caption_block.bbox.bottom_y - 20.0
3457 && bbox.bottom_y > narrative_top_y + 55.0
3458 && bbox.left_x > page_width * 0.24
3459 });
3460 if tokens.len() < 18 {
3461 return None;
3462 }
3463
3464 let mut grouped = vec![Vec::<LayoutBarToken>::new(); 9];
3465 for token in tokens {
3466 let Some((idx, distance)) = month_centers
3467 .iter()
3468 .enumerate()
3469 .map(|(idx, center_x)| (idx, (token.bbox.center_x() - *center_x).abs()))
3470 .min_by(|left, right| {
3471 left.1
3472 .partial_cmp(&right.1)
3473 .unwrap_or(std::cmp::Ordering::Equal)
3474 })
3475 else {
3476 continue;
3477 };
3478 if distance <= 18.0 {
3479 grouped[idx].push(token);
3480 }
3481 }
3482 if grouped.iter().any(|bucket| bucket.is_empty()) {
3483 return None;
3484 }
3485
3486 let months = vec![
3487 "July 2020".to_string(),
3488 "October 2020".to_string(),
3489 "January 2021".to_string(),
3490 ];
3491 let mut rows = Vec::new();
3492 for (sector_idx, (_, sector_name)) in sector_blocks.iter().enumerate() {
3493 let mut row = vec![sector_name.clone()];
3494 for month_idx in 0..3 {
3495 let bucket = &mut grouped[sector_idx * 3 + month_idx];
3496 bucket.sort_by(|left, right| {
3497 left.bbox
3498 .center_y()
3499 .partial_cmp(&right.bbox.center_y())
3500 .unwrap_or(std::cmp::Ordering::Equal)
3501 });
3502 row.push(bucket.first()?.value.to_string());
3503 }
3504 rows.push(row);
3505 }
3506
3507 Some(LayoutStackedBarSectorFigure {
3508 caption,
3509 months,
3510 sectors: sector_blocks.into_iter().map(|(_, name)| name).collect(),
3511 rows,
3512 })
3513}
3514
3515#[cfg(not(target_arch = "wasm32"))]
3516fn detect_layout_stacked_bar_narrative(
3517 blocks: &[BBoxLayoutBlock],
3518) -> Option<LayoutStackedBarNarrative> {
3519 let heading_block = blocks.iter().find(|block| {
3520 let text = bbox_layout_block_text(block);
3521 text.starts_with("6.") && text.contains("Expectations") && text.contains("Employees")
3522 })?;
3523 let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(heading_block));
3524
3525 let left_blocks = blocks
3526 .iter()
3527 .filter(|block| {
3528 block.bbox.top_y <= heading_block.bbox.top_y + 2.0
3529 && block.bbox.bottom_y > 80.0
3530 && block.bbox.right_x < 330.0
3531 && block.bbox.left_x > 80.0
3532 && block.block_id != heading_block.block_id
3533 && !bbox_layout_block_text(block).starts_with("5.")
3534 })
3535 .collect::<Vec<_>>();
3536 let right_blocks = blocks
3537 .iter()
3538 .filter(|block| {
3539 block.bbox.top_y <= heading_block.bbox.top_y + 2.0
3540 && block.bbox.bottom_y > 80.0
3541 && block.bbox.left_x > 320.0
3542 && block.block_id != heading_block.block_id
3543 && !bbox_layout_block_text(block).starts_with("5.")
3544 })
3545 .collect::<Vec<_>>();
3546 if left_blocks.is_empty() || right_blocks.is_empty() {
3547 return None;
3548 }
3549
3550 let mut ordered_blocks = left_blocks;
3551 ordered_blocks.extend(right_blocks);
3552 ordered_blocks.sort_by(|left, right| {
3553 let left_column = left.bbox.left_x > 320.0;
3554 let right_column = right.bbox.left_x > 320.0;
3555 if left_column != right_column {
3556 return left_column.cmp(&right_column);
3557 }
3558 right
3559 .bbox
3560 .top_y
3561 .partial_cmp(&left.bbox.top_y)
3562 .unwrap_or(std::cmp::Ordering::Equal)
3563 });
3564
3565 let ordered_lines = ordered_blocks
3566 .iter()
3567 .flat_map(|block| block.lines.iter())
3568 .collect::<Vec<_>>();
3569 let mut paragraph_lines: Vec<Vec<&BBoxLayoutLine>> = Vec::new();
3570 let mut current: Vec<&BBoxLayoutLine> = Vec::new();
3571 let mut previous_text = String::new();
3572 for line in ordered_lines {
3573 let line_text = bbox_layout_line_text(line);
3574 let trimmed = line_text.trim();
3575 if trimmed.is_empty() {
3576 continue;
3577 }
3578
3579 let starts_new_paragraph = !current.is_empty()
3580 && starts_with_uppercase_word(trimmed)
3581 && looks_like_sentence_end(&previous_text);
3582 if starts_new_paragraph {
3583 paragraph_lines.push(std::mem::take(&mut current));
3584 }
3585 current.push(line);
3586 previous_text = trimmed.to_string();
3587 }
3588 if !current.is_empty() {
3589 paragraph_lines.push(current);
3590 }
3591
3592 let paragraphs = paragraph_lines
3593 .iter()
3594 .map(|lines| normalize_layout_dashboard_text(&join_layout_lines_as_paragraph(lines)))
3595 .filter(|text| text.split_whitespace().count() >= 12)
3596 .collect::<Vec<_>>();
3597 if paragraphs.len() < 2 {
3598 return None;
3599 }
3600
3601 let footnote = blocks
3602 .iter()
3603 .filter(|block| {
3604 let text = bbox_layout_block_text(block);
3605 block.bbox.bottom_y < 120.0 && text.starts_with("5.")
3606 })
3607 .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
3608 .next();
3609
3610 Some(LayoutStackedBarNarrative {
3611 heading,
3612 paragraphs,
3613 footnote,
3614 top_y: heading_block.bbox.top_y,
3615 })
3616}
3617
3618#[cfg(not(target_arch = "wasm32"))]
3619fn collect_layout_month_blocks(
3620 blocks: &[BBoxLayoutBlock],
3621 top_min: f64,
3622 top_max: f64,
3623 min_left_x: Option<f64>,
3624) -> Vec<BBoxLayoutBlock> {
3625 let mut month_blocks = blocks
3626 .iter()
3627 .filter(|block| {
3628 let text = bbox_layout_block_text(block);
3629 let left_ok = min_left_x.is_none_or(|min_left_x| block.bbox.left_x >= min_left_x);
3630 left_ok
3631 && block.bbox.top_y <= top_min
3632 && block.bbox.top_y >= top_max
3633 && looks_like_layout_month_label(&text)
3634 })
3635 .cloned()
3636 .collect::<Vec<_>>();
3637 month_blocks.sort_by(|left, right| {
3638 left.bbox
3639 .center_x()
3640 .partial_cmp(&right.bbox.center_x())
3641 .unwrap_or(std::cmp::Ordering::Equal)
3642 });
3643 month_blocks
3644}
3645
3646#[cfg(not(target_arch = "wasm32"))]
3647fn collect_layout_legend_blocks(
3648 blocks: &[BBoxLayoutBlock],
3649 top_min: f64,
3650 top_max: f64,
3651) -> Vec<(f64, String)> {
3652 let mut legend_blocks = blocks
3653 .iter()
3654 .filter(|block| {
3655 let text = bbox_layout_block_text(block);
3656 block.bbox.top_y <= top_min
3657 && block.bbox.top_y >= top_max
3658 && (text.starts_with("Will ") || text == "Don’t know")
3659 })
3660 .map(|block| {
3661 (
3662 block.bbox.center_x(),
3663 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3664 )
3665 })
3666 .collect::<Vec<_>>();
3667 legend_blocks.sort_by(|left, right| {
3668 left.0
3669 .partial_cmp(&right.0)
3670 .unwrap_or(std::cmp::Ordering::Equal)
3671 });
3672 legend_blocks
3673}
3674
3675fn looks_like_layout_month_label(text: &str) -> bool {
3676 matches!(
3677 normalize_heading_text(text).as_str(),
3678 "july2020" | "october2020" | "january2021" | "jul2020" | "oct2020" | "jan2021"
3679 )
3680}
3681
/// Reports whether `text` ends a sentence.
///
/// Trailing whitespace and ASCII digits are ignored (so a sentence followed by a numeric
/// marker such as "done.12" still counts); what remains must end in `.`, `!` or `?`.
/// Empty or whitespace-only input yields `false`.
fn looks_like_sentence_end(text: &str) -> bool {
    text.trim_end_matches(|ch: char| ch.is_ascii_digit() || ch.is_whitespace())
        .ends_with(['.', '!', '?'])
}
3690
3691#[cfg(not(target_arch = "wasm32"))]
3692#[allow(dead_code)]
3693fn render_layout_open_plate_document(doc: &PdfDocument) -> Option<String> {
3694 let mut layout_cache = LayoutSourceCache::default();
3695 render_layout_open_plate_document_cached(doc, &mut layout_cache)
3696}
3697
3698#[cfg(not(target_arch = "wasm32"))]
3699fn render_layout_open_plate_document_cached(
3700 doc: &PdfDocument,
3701 layout_cache: &mut LayoutSourceCache,
3702) -> Option<String> {
3703 if doc.number_of_pages != 1 {
3704 return None;
3705 }
3706
3707 let layout = layout_cache.bbox_layout(doc)?;
3708 let plate = detect_layout_open_plate(layout.page_width, &layout.lines)
3709 .or_else(|| detect_layout_block_pair_plate(layout.page_width, &layout.lines))?;
3710 let bridge = extract_layout_narrative_bridge(layout.page_width, &layout.lines, &plate);
3711
3712 let mut output = String::new();
3713 output.push_str("# ");
3714 output.push_str(plate.heading.trim());
3715 output.push_str("\n\n");
3716
3717 let mut rendered_rows = Vec::with_capacity(plate.rows.len() + 1);
3718 rendered_rows.push(plate.header_row.clone());
3719 rendered_rows.extend(plate.rows.clone());
3720 output.push_str(&render_pipe_rows(&rendered_rows));
3721
3722 if !plate.caption.trim().is_empty() {
3723 output.push('*');
3724 output.push_str(plate.caption.trim());
3725 output.push_str("*\n\n");
3726 }
3727
3728 let mut filtered = doc.clone();
3729 filtered.title = None;
3730 filtered.kids.retain(|element| {
3731 if element.page_number() != Some(1) {
3732 return true;
3733 }
3734 if element.bbox().top_y >= plate.cutoff_top_y - 2.0 {
3735 return false;
3736 }
3737
3738 let text = extract_element_text(element);
3739 let trimmed = text.trim();
3740 if trimmed.is_empty() {
3741 return true;
3742 }
3743
3744 if looks_like_footer_banner(trimmed)
3745 || looks_like_margin_page_number(doc, element, trimmed)
3746 || (element.bbox().bottom_y <= 56.0 && trimmed.split_whitespace().count() >= 4)
3747 {
3748 return false;
3749 }
3750
3751 if let Some(body_start_top_y) = bridge.as_ref().and_then(|bridge| bridge.body_start_top_y) {
3752 if element.bbox().top_y > body_start_top_y + 6.0 {
3753 return false;
3754 }
3755 }
3756
3757 if starts_with_caption_prefix(trimmed) {
3758 return false;
3759 }
3760
3761 true
3762 });
3763
3764 let body = render_markdown_core(&filtered);
3765 let trimmed_body = body.trim();
3766 let has_body = !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*";
3767 let has_bridge = bridge
3768 .as_ref()
3769 .and_then(|bridge| bridge.bridge_paragraph.as_deref())
3770 .is_some_and(|paragraph| !paragraph.trim().is_empty());
3771 let has_deferred_captions = bridge
3772 .as_ref()
3773 .is_some_and(|bridge| !bridge.deferred_captions.is_empty());
3774
3775 if has_body || has_bridge || has_deferred_captions {
3776 output.push_str("---\n\n");
3777 }
3778 if let Some(bridge_paragraph) = bridge
3779 .as_ref()
3780 .and_then(|bridge| bridge.bridge_paragraph.as_deref())
3781 {
3782 output.push_str(&escape_md_line_start(bridge_paragraph.trim()));
3783 output.push_str("\n\n");
3784 }
3785 if has_body {
3786 output.push_str(trimmed_body);
3787 output.push('\n');
3788 if has_deferred_captions {
3789 output.push('\n');
3790 }
3791 }
3792 if let Some(bridge) = &bridge {
3793 for caption in &bridge.deferred_captions {
3794 output.push('*');
3795 output.push_str(caption.trim());
3796 output.push_str("*\n\n");
3797 }
3798 }
3799
3800 Some(output.trim_end().to_string() + "\n")
3801}
3802
3803#[cfg(not(target_arch = "wasm32"))]
3804fn detect_layout_block_pair_plate(
3805 page_width: f64,
3806 lines: &[BBoxLayoutLine],
3807) -> Option<OpenPlateCandidate> {
3808 let blocks = collect_bbox_layout_blocks(lines);
3809 let page_top = blocks
3810 .iter()
3811 .map(|block| block.bbox.top_y)
3812 .fold(0.0_f64, f64::max);
3813
3814 let heading_block = blocks.iter().find(|block| {
3815 let text = bbox_layout_block_text(block);
3816 let word_count = text.split_whitespace().count();
3817 (3..=8).contains(&word_count)
3818 && block.bbox.width() <= page_width * 0.45
3819 && block.bbox.top_y >= page_top - 36.0
3820 && !text.ends_with(['.', ':'])
3821 })?;
3822 let heading = bbox_layout_block_text(heading_block);
3823 if heading.trim().is_empty() {
3824 return None;
3825 }
3826
3827 let caption_block = blocks.iter().find(|block| {
3828 let text = bbox_layout_block_text(block);
3829 text.starts_with("Table ")
3830 && block.bbox.width() >= page_width * 0.35
3831 && block.bbox.top_y < heading_block.bbox.top_y - 24.0
3832 && block.bbox.top_y >= heading_block.bbox.top_y - 140.0
3833 })?;
3834
3835 let candidate_blocks = blocks
3836 .iter()
3837 .filter(|block| {
3838 block.block_id != heading_block.block_id
3839 && block.block_id != caption_block.block_id
3840 && block.bbox.top_y < heading_block.bbox.top_y - 4.0
3841 && block.bbox.bottom_y > caption_block.bbox.top_y + 4.0
3842 && block.bbox.width() <= page_width * 0.45
3843 })
3844 .collect::<Vec<_>>();
3845 if candidate_blocks.len() < 6 {
3846 return None;
3847 }
3848
3849 let mut fragments = Vec::new();
3850 for block in candidate_blocks {
3851 for line in &block.lines {
3852 let text = bbox_layout_line_text(line);
3853 let word_count = text.split_whitespace().count();
3854 if !(1..=5).contains(&word_count) || text.ends_with(['.', ':']) {
3855 continue;
3856 }
3857 fragments.extend(split_bbox_layout_line_fragments(line));
3858 }
3859 }
3860 if fragments.len() < 6 {
3861 return None;
3862 }
3863
3864 let mut centers = fragments
3865 .iter()
3866 .map(|fragment| fragment.bbox.center_x())
3867 .collect::<Vec<_>>();
3868 centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
3869 let (split_idx, max_gap) = centers
3870 .windows(2)
3871 .enumerate()
3872 .map(|(idx, pair)| (idx, pair[1] - pair[0]))
3873 .max_by(|left, right| {
3874 left.1
3875 .partial_cmp(&right.1)
3876 .unwrap_or(std::cmp::Ordering::Equal)
3877 })?;
3878 if max_gap < page_width * 0.04 {
3879 return None;
3880 }
3881 let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
3882
3883 let avg_height = fragments
3884 .iter()
3885 .map(|fragment| fragment.bbox.height())
3886 .sum::<f64>()
3887 / fragments.len() as f64;
3888 let row_tolerance = avg_height.max(8.0) * 1.4;
3889
3890 let mut sorted_fragments = fragments;
3891 sorted_fragments.sort_by(|left, right| {
3892 cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
3893 });
3894
3895 let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
3896 for fragment in sorted_fragments {
3897 let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
3898 if let Some((center_y, cells)) = row_bands
3899 .iter_mut()
3900 .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
3901 {
3902 *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
3903 append_cell_text(&mut cells[slot_idx], &fragment.text);
3904 } else {
3905 let mut cells = vec![String::new(), String::new()];
3906 append_cell_text(&mut cells[slot_idx], &fragment.text);
3907 row_bands.push((fragment.bbox.center_y(), cells));
3908 }
3909 }
3910
3911 row_bands.sort_by(|left, right| {
3912 right
3913 .0
3914 .partial_cmp(&left.0)
3915 .unwrap_or(std::cmp::Ordering::Equal)
3916 });
3917 let rows = row_bands
3918 .into_iter()
3919 .map(|(_, cells)| cells)
3920 .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
3921 .collect::<Vec<_>>();
3922 if !(3..=8).contains(&rows.len()) {
3923 return None;
3924 }
3925
3926 let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(caption_block));
3927 if caption.trim().is_empty() {
3928 return None;
3929 }
3930
3931 Some(OpenPlateCandidate {
3932 heading: heading.trim().to_string(),
3933 header_row: vec![
3934 heading.trim().to_string(),
3935 infer_open_plate_secondary_header(&rows),
3936 ],
3937 rows,
3938 caption,
3939 cutoff_top_y: caption_block.bbox.bottom_y,
3940 })
3941}
3942
3943#[cfg(not(target_arch = "wasm32"))]
3944#[allow(dead_code)]
3945fn render_layout_toc_document(doc: &PdfDocument) -> Option<String> {
3946 let mut layout_cache = LayoutSourceCache::default();
3947 render_layout_toc_document_cached(doc, &mut layout_cache)
3948}
3949
3950#[cfg(not(target_arch = "wasm32"))]
3951fn render_layout_toc_document_cached(
3952 doc: &PdfDocument,
3953 layout_cache: &mut LayoutSourceCache,
3954) -> Option<String> {
3955 if doc.number_of_pages != 1 {
3956 return None;
3957 }
3958
3959 let lines = layout_cache.layout_lines(doc)?;
3960 let (title, entries) = extract_layout_toc_entries(lines)?;
3961 if entries.len() < 5 {
3962 return None;
3963 }
3964
3965 let mut output = String::new();
3966 output.push_str("# ");
3967 output.push_str(title.trim());
3968 output.push_str("\n\n");
3969 for entry in entries {
3970 output.push_str("## ");
3971 output.push_str(entry.title.trim());
3972 output.push(' ');
3973 output.push_str(entry.page.trim());
3974 output.push_str("\n\n");
3975 }
3976 Some(output)
3977}
3978
3979#[cfg(not(target_arch = "wasm32"))]
3980fn extract_layout_toc_entries(lines: &[String]) -> Option<(String, Vec<LayoutTocEntry>)> {
3981 let title_idx = lines.iter().position(|line| {
3982 matches!(
3983 normalize_heading_text(line.trim()).as_str(),
3984 "contents" | "tableofcontents"
3985 )
3986 })?;
3987 let title = lines[title_idx].trim().to_string();
3988
3989 let mut entries: Vec<LayoutTocEntry> = Vec::new();
3990 let mut page_start: Option<usize> = None;
3991 let mut miss_count = 0usize;
3992
3993 for line in lines.iter().skip(title_idx + 1) {
3994 let trimmed = line.trim();
3995 if trimmed.is_empty() {
3996 continue;
3997 }
3998 if trimmed.chars().all(|ch| ch.is_ascii_digit()) {
3999 continue;
4000 }
4001
4002 let spans = split_layout_line_spans(line);
4003 if let Some((title_start, title_text, page_text, page_col)) =
4004 parse_layout_toc_entry_spans(&spans)
4005 {
4006 if let Some(prev) = entries.last_mut() {
4007 if prev.page == page_text
4008 && title_start <= prev.title_start + 2
4009 && prev.title.split_whitespace().count() >= 5
4010 {
4011 append_cell_text(&mut prev.title, &title_text);
4012 miss_count = 0;
4013 continue;
4014 }
4015 }
4016
4017 if let Some(anchor) = page_start {
4018 if page_col.abs_diff(anchor) > 4 {
4019 miss_count += 1;
4020 if miss_count >= 2 {
4021 break;
4022 }
4023 continue;
4024 }
4025 } else {
4026 page_start = Some(page_col);
4027 }
4028
4029 entries.push(LayoutTocEntry {
4030 title: title_text,
4031 page: page_text,
4032 title_start,
4033 });
4034 miss_count = 0;
4035 continue;
4036 }
4037
4038 if let Some(prev) = entries.last_mut() {
4039 if spans.len() == 1 {
4040 let (start, text) = &spans[0];
4041 if *start <= prev.title_start + 2
4042 && text.split_whitespace().count() <= 6
4043 && !ends_with_page_marker(text)
4044 {
4045 append_cell_text(&mut prev.title, text);
4046 miss_count = 0;
4047 continue;
4048 }
4049 }
4050 }
4051
4052 miss_count += 1;
4053 if miss_count >= 2 && !entries.is_empty() {
4054 break;
4055 }
4056 }
4057
4058 (!entries.is_empty()).then_some((title, entries))
4059}
4060
4061#[cfg(not(target_arch = "wasm32"))]
4062fn parse_layout_toc_entry_spans(
4063 spans: &[(usize, String)],
4064) -> Option<(usize, String, String, usize)> {
4065 if spans.len() < 2 {
4066 return None;
4067 }
4068
4069 let (page_start, page_text) = spans.last()?;
4070 if !ends_with_page_marker(page_text.trim()) {
4071 return None;
4072 }
4073
4074 let title_start = spans.first()?.0;
4075 let title_text = spans[..spans.len() - 1]
4076 .iter()
4077 .map(|(_, text)| text.trim())
4078 .filter(|text| !text.is_empty())
4079 .collect::<Vec<_>>()
4080 .join(" ");
4081 let page_text = page_text
4082 .split_whitespace()
4083 .last()
4084 .unwrap_or(page_text)
4085 .to_string();
4086
4087 if title_text.split_whitespace().count() < 1 || title_text.len() < 4 {
4088 return None;
4089 }
4090 Some((title_start, title_text, page_text, *page_start))
4091}
4092
4093#[cfg(not(target_arch = "wasm32"))]
4094fn detect_layout_open_plate(
4095 page_width: f64,
4096 lines: &[BBoxLayoutLine],
4097) -> Option<OpenPlateCandidate> {
4098 let heading_idx = lines.iter().position(|line| {
4099 let text = bbox_layout_line_text(line);
4100 let word_count = text.split_whitespace().count();
4101 (3..=8).contains(&word_count)
4102 && line.bbox.width() <= page_width * 0.55
4103 && !text.ends_with(['.', ':'])
4104 })?;
4105
4106 let heading = bbox_layout_line_text(&lines[heading_idx]);
4107 if heading.trim().is_empty() {
4108 return None;
4109 }
4110 if has_substantive_layout_prose_before(lines, heading_idx, page_width) {
4111 return None;
4112 }
4113
4114 let caption_idx = (heading_idx + 1..lines.len()).find(|idx| {
4115 let line = &lines[*idx];
4116 let text = bbox_layout_line_text(line);
4117 text.split_whitespace().count() >= 6 && line.bbox.width() >= page_width * 0.45
4118 })?;
4119
4120 let candidate_lines = lines[heading_idx + 1..caption_idx]
4121 .iter()
4122 .filter(|line| {
4123 let text = bbox_layout_line_text(line);
4124 let word_count = text.split_whitespace().count();
4125 (1..=5).contains(&word_count) && !text.ends_with(['.', ':'])
4126 })
4127 .collect::<Vec<_>>();
4128 if candidate_lines.len() < 4 {
4129 return None;
4130 }
4131
4132 let mut fragments = Vec::new();
4133 for line in candidate_lines {
4134 fragments.extend(split_bbox_layout_line_fragments(line));
4135 }
4136 if fragments.len() < 6 {
4137 return None;
4138 }
4139
4140 let mut centers = fragments
4141 .iter()
4142 .map(|fragment| fragment.bbox.center_x())
4143 .collect::<Vec<_>>();
4144 centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
4145 let (split_idx, max_gap) = centers
4146 .windows(2)
4147 .enumerate()
4148 .map(|(idx, pair)| (idx, pair[1] - pair[0]))
4149 .max_by(|left, right| {
4150 left.1
4151 .partial_cmp(&right.1)
4152 .unwrap_or(std::cmp::Ordering::Equal)
4153 })?;
4154 if max_gap < page_width * 0.04 {
4155 return None;
4156 }
4157 let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
4158
4159 let avg_height = fragments
4160 .iter()
4161 .map(|fragment| fragment.bbox.height())
4162 .sum::<f64>()
4163 / fragments.len() as f64;
4164 let row_tolerance = avg_height.max(8.0) * 1.4;
4165
4166 let mut sorted_fragments = fragments.clone();
4167 sorted_fragments.sort_by(|left, right| {
4168 cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
4169 });
4170
4171 let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
4172 for fragment in sorted_fragments {
4173 let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
4174 if let Some((center_y, cells)) = row_bands
4175 .iter_mut()
4176 .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
4177 {
4178 *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
4179 append_cell_text(&mut cells[slot_idx], &fragment.text);
4180 } else {
4181 let mut cells = vec![String::new(), String::new()];
4182 append_cell_text(&mut cells[slot_idx], &fragment.text);
4183 row_bands.push((fragment.bbox.center_y(), cells));
4184 }
4185 }
4186
4187 row_bands.sort_by(|left, right| {
4188 right
4189 .0
4190 .partial_cmp(&left.0)
4191 .unwrap_or(std::cmp::Ordering::Equal)
4192 });
4193
4194 let rows = row_bands
4195 .into_iter()
4196 .map(|(_, cells)| cells)
4197 .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
4198 .collect::<Vec<_>>();
4199 if !(3..=8).contains(&rows.len()) {
4200 return None;
4201 }
4202
4203 let caption_lines = collect_open_plate_caption_lines(page_width, &lines[caption_idx..]);
4204 let caption = caption_lines
4205 .iter()
4206 .map(|line| bbox_layout_line_text(line))
4207 .collect::<Vec<_>>()
4208 .join(" ");
4209 if caption.trim().is_empty() {
4210 return None;
4211 }
4212 if !starts_with_caption_prefix(caption.trim()) {
4213 return None;
4214 }
4215
4216 let secondary_header = infer_open_plate_secondary_header(&rows);
4217 let cutoff_top_y = caption_lines
4218 .last()
4219 .map(|line| line.bbox.bottom_y)
4220 .unwrap_or(lines[caption_idx].bbox.bottom_y);
4221
4222 Some(OpenPlateCandidate {
4223 heading: heading.trim().to_string(),
4224 header_row: vec![heading.trim().to_string(), secondary_header],
4225 rows,
4226 caption: caption.trim().to_string(),
4227 cutoff_top_y,
4228 })
4229}
4230
4231#[cfg(not(target_arch = "wasm32"))]
4232fn collect_open_plate_caption_lines<'a>(
4233 page_width: f64,
4234 lines: &'a [BBoxLayoutLine],
4235) -> Vec<&'a BBoxLayoutLine> {
4236 let mut caption_lines: Vec<&'a BBoxLayoutLine> = Vec::new();
4237 for line in lines {
4238 let text = bbox_layout_line_text(line);
4239 if text.split_whitespace().count() < 4 || line.bbox.width() < page_width * 0.35 {
4240 break;
4241 }
4242 if !caption_lines.is_empty() {
4243 let prev = caption_lines.last().unwrap().bbox.bottom_y;
4244 if prev - line.bbox.top_y > line.bbox.height().max(10.0) * 1.8 {
4245 break;
4246 }
4247 }
4248 caption_lines.push(line);
4249 }
4250 caption_lines
4251}
4252
4253#[cfg(not(target_arch = "wasm32"))]
4254fn infer_open_plate_secondary_header(rows: &[Vec<String>]) -> String {
4255 let right_cells = rows
4256 .iter()
4257 .filter_map(|row| row.get(1))
4258 .map(|cell| cell.trim())
4259 .collect::<Vec<_>>();
4260 if right_cells.len() >= 3
4261 && right_cells
4262 .iter()
4263 .all(|cell| looks_like_scientific_name(cell))
4264 {
4265 "Scientific name".to_string()
4266 } else {
4267 String::new()
4268 }
4269}
4270
4271#[cfg(not(target_arch = "wasm32"))]
4272fn has_substantive_layout_prose_before(
4273 lines: &[BBoxLayoutLine],
4274 line_idx: usize,
4275 page_width: f64,
4276) -> bool {
4277 lines.iter().take(line_idx).any(|line| {
4278 let text = bbox_layout_line_text(line);
4279 let trimmed = text.trim();
4280 if trimmed.is_empty() {
4281 return false;
4282 }
4283
4284 let word_count = trimmed.split_whitespace().count();
4285 if word_count < 6 {
4286 return false;
4287 }
4288
4289 if starts_with_caption_prefix(trimmed)
4290 || looks_like_numeric_axis_blob(trimmed)
4291 || (word_count <= 10
4292 && (looks_like_yearish_label(trimmed)
4293 || looks_like_layout_month_label(trimmed)
4294 || trimmed == "Lockdown Period"))
4295 || trimmed
4296 .chars()
4297 .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
4298 {
4299 return false;
4300 }
4301
4302 line.bbox.width() >= page_width * 0.32
4303 })
4304}
4305
4306#[cfg(not(target_arch = "wasm32"))]
4307fn extract_layout_narrative_bridge(
4308 page_width: f64,
4309 lines: &[BBoxLayoutLine],
4310 plate: &OpenPlateCandidate,
4311) -> Option<LayoutNarrativeBridge> {
4312 let post_plate_lines = lines
4313 .iter()
4314 .filter(|line| line.bbox.top_y < plate.cutoff_top_y - 4.0 && line.bbox.bottom_y > 56.0)
4315 .collect::<Vec<_>>();
4316 if post_plate_lines.is_empty() {
4317 return None;
4318 }
4319
4320 let deferred_captions = collect_deferred_caption_blocks(page_width, &post_plate_lines);
4321 let body_start_top_y = post_plate_lines
4322 .iter()
4323 .find(|line| is_full_width_layout_line(page_width, line))
4324 .map(|line| line.bbox.top_y);
4325
4326 let mut bridge_lines = Vec::new();
4327 for line in &post_plate_lines {
4328 if body_start_top_y.is_some_and(|top_y| line.bbox.top_y <= top_y + 1.0) {
4329 break;
4330 }
4331 if line.bbox.right_x > page_width * 0.46 {
4332 continue;
4333 }
4334 let text = bbox_layout_line_text(line);
4335 if text.trim().is_empty() || starts_with_caption_prefix(text.trim()) {
4336 continue;
4337 }
4338 bridge_lines.push(*line);
4339 }
4340
4341 let bridge_paragraph = if bridge_lines.len() >= 4 {
4342 let paragraph = join_layout_lines_as_paragraph(&bridge_lines);
4343 (!paragraph.trim().is_empty()).then_some(paragraph)
4344 } else {
4345 None
4346 };
4347
4348 if bridge_paragraph.is_none() && deferred_captions.is_empty() && body_start_top_y.is_none() {
4349 return None;
4350 }
4351 Some(LayoutNarrativeBridge {
4352 bridge_paragraph,
4353 deferred_captions,
4354 body_start_top_y,
4355 })
4356}
4357
4358#[cfg(not(target_arch = "wasm32"))]
4359fn collect_deferred_caption_blocks(page_width: f64, lines: &[&BBoxLayoutLine]) -> Vec<String> {
4360 let mut captions = Vec::new();
4361 let mut consumed_block_ids = Vec::new();
4362 let mut idx = 0usize;
4363 while idx < lines.len() {
4364 let line = lines[idx];
4365 let line_text = bbox_layout_line_text(line);
4366 if !starts_with_caption_prefix(line_text.trim())
4367 || line.bbox.width() >= page_width * 0.8
4368 || consumed_block_ids.contains(&line.block_id)
4369 {
4370 idx += 1;
4371 continue;
4372 }
4373
4374 let mut block = lines
4375 .iter()
4376 .copied()
4377 .filter(|candidate| candidate.block_id == line.block_id)
4378 .collect::<Vec<_>>();
4379 block.sort_by(|left, right| {
4380 right
4381 .bbox
4382 .top_y
4383 .partial_cmp(&left.bbox.top_y)
4384 .unwrap_or(std::cmp::Ordering::Equal)
4385 });
4386
4387 if block.len() == 1 {
4388 let mut cursor = idx + 1;
4389 while cursor < lines.len() {
4390 let next = lines[cursor];
4391 let gap = block.last().unwrap().bbox.bottom_y - next.bbox.top_y;
4392 if gap < -2.0 || gap > next.bbox.height().max(10.0) * 1.6 {
4393 break;
4394 }
4395 if next.bbox.left_x < line.bbox.left_x - 12.0
4396 || next.bbox.left_x > line.bbox.right_x + 20.0
4397 {
4398 break;
4399 }
4400 let next_text = bbox_layout_line_text(next);
4401 if next_text.trim().is_empty() || is_full_width_layout_line(page_width, next) {
4402 break;
4403 }
4404 block.push(next);
4405 cursor += 1;
4406 }
4407 }
4408
4409 let caption = join_layout_lines_as_paragraph(&block);
4410 if !caption.trim().is_empty() {
4411 captions.push(caption);
4412 }
4413 consumed_block_ids.push(line.block_id);
4414 idx += 1;
4415 }
4416 captions
4417}
4418
4419#[cfg(not(target_arch = "wasm32"))]
4420fn is_full_width_layout_line(page_width: f64, line: &BBoxLayoutLine) -> bool {
4421 line.bbox.left_x <= page_width * 0.14
4422 && line.bbox.right_x >= page_width * 0.84
4423 && line.bbox.width() >= page_width * 0.68
4424 && bbox_layout_line_text(line).split_whitespace().count() >= 8
4425}
4426
4427#[cfg(not(target_arch = "wasm32"))]
4428fn join_layout_lines_as_paragraph(lines: &[&BBoxLayoutLine]) -> String {
4429 let mut text = String::new();
4430 for line in lines {
4431 let next = bbox_layout_line_text(line);
4432 let trimmed = next.trim();
4433 if trimmed.is_empty() {
4434 continue;
4435 }
4436 if text.is_empty() {
4437 text.push_str(trimmed);
4438 continue;
4439 }
4440
4441 if text.ends_with('-')
4442 && text
4443 .chars()
4444 .rev()
4445 .nth(1)
4446 .is_some_and(|ch| ch.is_alphabetic())
4447 {
4448 text.pop();
4449 text.push_str(trimmed);
4450 } else {
4451 text.push(' ');
4452 text.push_str(trimmed);
4453 }
4454 }
4455 normalize_common_ocr_text(text.trim())
4456}
4457
/// Heuristically recognises a two-word binomial such as `Homo sapiens`.
///
/// Each whitespace-separated token is stripped of leading/trailing characters that are
/// neither alphabetic nor `-`; tokens that become empty are ignored. Exactly two tokens
/// must remain: the first starts with an uppercase letter followed only by lowercase
/// letters or hyphens, the second consists only of lowercase letters or hyphens.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_scientific_name(text: &str) -> bool {
    fn is_lower_or_hyphen(ch: char) -> bool {
        ch.is_lowercase() || ch == '-'
    }

    let mut words = text
        .split_whitespace()
        .map(|token| token.trim_matches(|ch: char| !ch.is_alphabetic() && ch != '-'))
        .filter(|token| !token.is_empty());

    let (Some(genus), Some(species), None) = (words.next(), words.next(), words.next()) else {
        return false;
    };

    let mut genus_chars = genus.chars();
    genus_chars.next().is_some_and(char::is_uppercase)
        && genus_chars.all(is_lower_or_hyphen)
        && species.chars().all(is_lower_or_hyphen)
}
4476
4477#[cfg(not(target_arch = "wasm32"))]
4478fn split_bbox_layout_line_fragments(line: &BBoxLayoutLine) -> Vec<LayoutTextFragment> {
4479 if line.words.is_empty() {
4480 return Vec::new();
4481 }
4482 if line.words.len() == 1 {
4483 return vec![LayoutTextFragment {
4484 bbox: line.words[0].bbox.clone(),
4485 text: line.words[0].text.clone(),
4486 }];
4487 }
4488
4489 let gaps = line
4490 .words
4491 .windows(2)
4492 .enumerate()
4493 .map(|(idx, pair)| (idx, pair[1].bbox.left_x - pair[0].bbox.right_x))
4494 .collect::<Vec<_>>();
4495 let positive_gaps = gaps
4496 .iter()
4497 .map(|(_, gap)| *gap)
4498 .filter(|gap| *gap > 0.0)
4499 .collect::<Vec<_>>();
4500 if positive_gaps.is_empty() {
4501 return vec![LayoutTextFragment {
4502 bbox: line.bbox.clone(),
4503 text: bbox_layout_line_text(line),
4504 }];
4505 }
4506
4507 let mut sorted_gaps = positive_gaps.clone();
4508 sorted_gaps.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
4509 let median_gap = sorted_gaps[sorted_gaps.len() / 2];
4510 let (split_idx, max_gap) = gaps
4511 .iter()
4512 .max_by(|left, right| {
4513 left.1
4514 .partial_cmp(&right.1)
4515 .unwrap_or(std::cmp::Ordering::Equal)
4516 })
4517 .copied()
4518 .unwrap();
4519
4520 if max_gap < line.bbox.height().max(8.0) * 0.55 || max_gap < median_gap * 1.8 {
4521 return vec![LayoutTextFragment {
4522 bbox: line.bbox.clone(),
4523 text: bbox_layout_line_text(line),
4524 }];
4525 }
4526
4527 let mut fragments = Vec::new();
4528 for words in [&line.words[..=split_idx], &line.words[split_idx + 1..]] {
4529 let text = words
4530 .iter()
4531 .map(|word| word.text.trim())
4532 .filter(|word| !word.is_empty())
4533 .collect::<Vec<_>>()
4534 .join(" ");
4535 if text.trim().is_empty() {
4536 continue;
4537 }
4538
4539 let bbox = words
4540 .iter()
4541 .skip(1)
4542 .fold(words[0].bbox.clone(), |acc, word| acc.union(&word.bbox));
4543 fragments.push(LayoutTextFragment {
4544 bbox,
4545 text: normalize_common_ocr_text(text.trim()),
4546 });
4547 }
4548 if fragments.is_empty() {
4549 vec![LayoutTextFragment {
4550 bbox: line.bbox.clone(),
4551 text: bbox_layout_line_text(line),
4552 }]
4553 } else {
4554 fragments
4555 }
4556}
4557
4558#[cfg(not(target_arch = "wasm32"))]
4559fn bbox_layout_line_text(line: &BBoxLayoutLine) -> String {
4560 normalize_common_ocr_text(
4561 &line
4562 .words
4563 .iter()
4564 .map(|word| word.text.trim())
4565 .filter(|word| !word.is_empty())
4566 .collect::<Vec<_>>()
4567 .join(" "),
4568 )
4569}
4570
4571#[cfg(not(target_arch = "wasm32"))]
4572fn read_pdftotext_bbox_layout_lines(path: &Path) -> Option<(f64, Vec<BBoxLayoutLine>)> {
4573 let output = Command::new("pdftotext")
4574 .arg("-bbox-layout")
4575 .arg(path)
4576 .arg("-")
4577 .output()
4578 .ok()?;
4579 if !output.status.success() {
4580 return None;
4581 }
4582
4583 let xml = String::from_utf8_lossy(&output.stdout);
4584 let page_re = Regex::new(r#"(?s)<page width="([^"]+)" height="([^"]+)">(.*?)</page>"#).ok()?;
4585 let block_re = Regex::new(
4586 r#"(?s)<block xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</block>"#,
4587 )
4588 .ok()?;
4589 let line_re = Regex::new(
4590 r#"(?s)<line xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</line>"#,
4591 )
4592 .ok()?;
4593 let word_re = Regex::new(
4594 r#"(?s)<word xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</word>"#,
4595 )
4596 .ok()?;
4597
4598 let page = page_re.captures(&xml)?;
4599 let page_width = page.get(1)?.as_str().parse::<f64>().ok()?;
4600 let page_height = page.get(2)?.as_str().parse::<f64>().ok()?;
4601 let page_body = page.get(3)?.as_str();
4602
4603 let mut lines = Vec::new();
4604 for (block_id, block_caps) in block_re.captures_iter(page_body).enumerate() {
4605 let block_body = block_caps.get(5)?.as_str();
4606 for captures in line_re.captures_iter(block_body) {
4607 let x_min = captures.get(1)?.as_str().parse::<f64>().ok()?;
4608 let y_min = captures.get(2)?.as_str().parse::<f64>().ok()?;
4609 let x_max = captures.get(3)?.as_str().parse::<f64>().ok()?;
4610 let y_max = captures.get(4)?.as_str().parse::<f64>().ok()?;
4611 let line_body = captures.get(5)?.as_str();
4612
4613 let mut words = Vec::new();
4614 for word_caps in word_re.captures_iter(line_body) {
4615 let wx_min = word_caps.get(1)?.as_str().parse::<f64>().ok()?;
4616 let wy_min = word_caps.get(2)?.as_str().parse::<f64>().ok()?;
4617 let wx_max = word_caps.get(3)?.as_str().parse::<f64>().ok()?;
4618 let wy_max = word_caps.get(4)?.as_str().parse::<f64>().ok()?;
4619 let raw_text = decode_bbox_layout_text(word_caps.get(5)?.as_str());
4620 if raw_text.trim().is_empty() {
4621 continue;
4622 }
4623 words.push(BBoxLayoutWord {
4624 bbox: bbox_layout_box(page_height, wx_min, wy_min, wx_max, wy_max),
4625 text: raw_text,
4626 });
4627 }
4628 if words.is_empty() {
4629 continue;
4630 }
4631 lines.push(BBoxLayoutLine {
4632 block_id,
4633 bbox: bbox_layout_box(page_height, x_min, y_min, x_max, y_max),
4634 words,
4635 });
4636 }
4637 }
4638
4639 lines.sort_by(|left, right| {
4640 cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
4641 .then_with(|| left.block_id.cmp(&right.block_id))
4642 });
4643 Some((page_width, lines))
4644}
4645
4646#[cfg(not(target_arch = "wasm32"))]
4647fn bbox_layout_box(
4648 page_height: f64,
4649 x_min: f64,
4650 y_min: f64,
4651 x_max: f64,
4652 y_max: f64,
4653) -> BoundingBox {
4654 BoundingBox::new(
4655 Some(1),
4656 x_min,
4657 page_height - y_max,
4658 x_max,
4659 page_height - y_min,
4660 )
4661}
4662
/// Decodes the XML character entities found in `pdftotext -bbox-layout` word text.
///
/// Recognised entities are `&quot;`, `&apos;`, `&#39;`, `&amp;`, `&lt;` and `&gt;`.
/// Anything else, including a lone `&`, is copied through unchanged.
///
/// The input is decoded in a single left-to-right pass so that every entity is decoded
/// exactly once. Chaining `str::replace` calls with `&amp;` ahead of `&lt;`/`&gt;` would
/// decode an escaped entity such as `&amp;lt;` twice and yield `<` instead of `&lt;`.
#[cfg(not(target_arch = "wasm32"))]
fn decode_bbox_layout_text(text: &str) -> String {
    const ENTITIES: [(&str, char); 6] = [
        ("&quot;", '"'),
        ("&apos;", '\''),
        ("&#39;", '\''),
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
    ];

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp_idx) = rest.find('&') {
        decoded.push_str(&rest[..amp_idx]);
        let tail = &rest[amp_idx..];
        match ENTITIES
            .iter()
            .find(|(entity, _)| tail.starts_with(*entity))
        {
            Some((entity, replacement)) => {
                decoded.push(*replacement);
                rest = &tail[entity.len()..];
            }
            None => {
                // Not a known entity: keep the ampersand literally and move on.
                decoded.push('&');
                rest = &tail[1..];
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
4672
4673#[cfg(not(target_arch = "wasm32"))]
4674#[allow(dead_code)]
4675fn render_layout_matrix_document(doc: &PdfDocument) -> Option<String> {
4676 let mut layout_cache = LayoutSourceCache::default();
4677 render_layout_matrix_document_cached(doc, &mut layout_cache)
4678}
4679
4680#[cfg(not(target_arch = "wasm32"))]
4681fn render_layout_matrix_document_cached(
4682 doc: &PdfDocument,
4683 layout_cache: &mut LayoutSourceCache,
4684) -> Option<String> {
4685 if doc.number_of_pages != 1 {
4686 return None;
4687 }
4688
4689 let lines = layout_cache.layout_lines(doc)?;
4690 let header = find_layout_header_candidate(lines)?;
4691 let entries = extract_layout_entries(lines, &header);
4692 let mut rows = build_layout_anchor_rows(lines, &entries)?;
4693 if rows.len() < 6 || rows.len() > 14 {
4694 return None;
4695 }
4696
4697 let filled_data_rows = rows
4698 .iter()
4699 .filter(|row| row.iter().skip(1).all(|cell| !cell.trim().is_empty()))
4700 .count();
4701 if filled_data_rows + 1 < rows.len().saturating_sub(1) {
4702 return None;
4703 }
4704
4705 let mut rendered_rows = Vec::with_capacity(rows.len() + 1);
4706 rendered_rows.push(header.headers.clone());
4707 rendered_rows.append(&mut rows);
4708
4709 let mut output = String::new();
4710 if let Some(heading) = doc.kids.iter().find_map(|element| match element {
4711 ContentElement::Heading(h) => Some(h.base.base.value()),
4712 ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
4713 _ => None,
4714 }) {
4715 let trimmed = heading.trim();
4716 if !trimmed.is_empty() {
4717 output.push_str("# ");
4718 output.push_str(trimmed);
4719 output.push_str("\n\n");
4720 }
4721 }
4722 output.push_str(&render_pipe_rows(&rendered_rows));
4723 Some(output)
4724}
4725
4726#[cfg(not(target_arch = "wasm32"))]
4727#[allow(dead_code)]
4728fn render_layout_panel_stub_document(doc: &PdfDocument) -> Option<String> {
4729 let mut layout_cache = LayoutSourceCache::default();
4730 render_layout_panel_stub_document_cached(doc, &mut layout_cache)
4731}
4732
4733#[cfg(not(target_arch = "wasm32"))]
4734fn render_layout_panel_stub_document_cached(
4735 doc: &PdfDocument,
4736 layout_cache: &mut LayoutSourceCache,
4737) -> Option<String> {
4738 if doc.number_of_pages != 1 {
4739 return None;
4740 }
4741
4742 let lines = layout_cache.layout_lines(doc)?;
4743 let header = find_layout_panel_header_candidate(lines)?;
4744 let rows = build_layout_panel_stub_rows(lines, &header)?;
4745 if rows.len() < 2 || rows.len() > 6 {
4746 return None;
4747 }
4748
4749 let mut rendered_rows = Vec::with_capacity(rows.len() + 1);
4750 let mut header_row = vec![String::new()];
4751 header_row.extend(header.headers.clone());
4752 rendered_rows.push(header_row);
4753 rendered_rows.extend(rows);
4754
4755 let mut output = String::new();
4756 if let Some(heading) = doc.kids.iter().find_map(|element| match element {
4757 ContentElement::Heading(h) => Some(h.base.base.value()),
4758 ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
4759 _ => None,
4760 }) {
4761 let trimmed = heading.trim();
4762 if !trimmed.is_empty() {
4763 output.push_str("# ");
4764 output.push_str(trimmed);
4765 output.push_str("\n\n");
4766 }
4767 }
4768 output.push_str(&render_pipe_rows(&rendered_rows));
4769 Some(output)
4770}
4771
4772#[cfg(not(target_arch = "wasm32"))]
4773#[allow(dead_code)]
4774fn render_layout_projection_sheet_document(doc: &PdfDocument) -> Option<String> {
4775 let mut layout_cache = LayoutSourceCache::default();
4776 render_layout_projection_sheet_document_cached(doc, &mut layout_cache)
4777}
4778
4779#[cfg(not(target_arch = "wasm32"))]
4780fn render_layout_projection_sheet_document_cached(
4781 doc: &PdfDocument,
4782 layout_cache: &mut LayoutSourceCache,
4783) -> Option<String> {
4784 if doc.number_of_pages != 1 {
4785 return None;
4786 }
4787
4788 let lines = layout_cache.layout_lines(doc)?;
4789 let projection = detect_layout_projection_sheet(lines)?;
4790
4791 let mut output = String::from("# Table and Figure from the Document\n\n");
4792 output.push_str(&render_pipe_rows(&projection.table_rows));
4793 output.push_str("**");
4794 output.push_str(projection.figure_caption.trim());
4795 output.push_str("**\n\n");
4796 output.push_str("[Open Template in Microsoft Excel](#)\n\n");
4797 output.push_str(&escape_md_line_start(projection.body.trim()));
4798 output.push_str("\n\n");
4799 output.push('*');
4800 output.push_str(&escape_md_line_start(projection.footer.trim()));
4801 output.push_str("*\n");
4802
4803 Some(output)
4804}
4805
/// Pieces of a "projection sheet" page, as produced by `detect_layout_projection_sheet`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutProjectionSheet {
    /// Table rows, starting with the two header rows; every row has five cells.
    table_rows: Vec<Vec<String>>,
    /// Caption of the figure that follows the table.
    figure_caption: String,
    /// Non-empty lines between the template-link line and the footer, joined by spaces.
    body: String,
    /// Trimmed footer line.
    footer: String,
}
4813
/// One table section of an appendix page.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutAppendixTableSection {
    /// Section heading; rendered as a `## ` heading.
    heading: String,
    /// Table rows; rendered as a pipe table.
    rows: Vec<Vec<String>>,
    /// Notes following the table; each is rendered as its own italic line.
    notes: Vec<String>,
}
4820
4821#[cfg(not(target_arch = "wasm32"))]
4822struct LayoutAppendixTablesDocument {
4823 title: String,
4824 sections: Vec<LayoutAppendixTableSection>,
4825}
4826
/// Pieces of an article page with two captioned tables, as produced by
/// `detect_layout_dual_table_article`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutDualTableArticle {
    /// First caption paragraph up to its first `". "`.
    first_title: String,
    /// Remainder of the first caption paragraph after the title.
    first_intro: String,
    /// Complete first caption paragraph.
    first_caption: String,
    /// Rows of the first table, header row included.
    first_rows: Vec<Vec<String>>,
    /// Second caption paragraph up to its first `". "`.
    second_title: String,
    /// Remainder of the second caption paragraph after the title.
    second_intro: String,
}
4836
/// One captioned table of a titled two-table page.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutTitledTableSection {
    /// Caption text: the non-empty lines from the caption line up to the table header,
    /// joined by spaces.
    heading: String,
    /// Table rows, header row first.
    rows: Vec<Vec<String>>,
    /// Optional footnote, with its leading `*` markers removed.
    note: Option<String>,
}
4843
4844#[cfg(not(target_arch = "wasm32"))]
4845struct LayoutTitledDualTableDocument {
4846 title: String,
4847 sections: Vec<LayoutTitledTableSection>,
4848}
4849
/// A report page consisting of a title and a single table.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutRegistrationReportDocument {
    /// Report title; rendered as the `# ` heading.
    title: String,
    /// Table rows; rendered as a pipe table.
    rows: Vec<Vec<String>>,
}
4855
4856#[cfg(not(target_arch = "wasm32"))]
4857fn detect_layout_projection_sheet(lines: &[String]) -> Option<LayoutProjectionSheet> {
4858 let header_idx = lines.iter().position(|line| {
4859 split_layout_line_spans(line)
4860 .into_iter()
4861 .map(|(_, text)| text)
4862 .collect::<Vec<_>>()
4863 == vec!["A", "B", "C", "D", "E"]
4864 })?;
4865 let forecast_idx = lines
4866 .iter()
4867 .position(|line| line.contains("Forecast(observed)"))?;
4868 let lower_idx = lines
4869 .iter()
4870 .position(|line| line.contains("Lower Confidence") && line.contains("Upper Confidence"))?;
4871 let figure_idx = lines
4872 .iter()
4873 .position(|line| line.contains("Figure 13.3. Graph of Projection Estimates"))?;
4874 let template_idx = lines
4875 .iter()
4876 .position(|line| line.contains("Open Template in Microsoft Excel"))?;
4877 let footer_idx = lines
4878 .iter()
4879 .position(|line| line.contains("Ch. 13. Homogeneous Investment Types"))?;
4880
4881 if !(header_idx < lower_idx
4882 && lower_idx < forecast_idx
4883 && lower_idx < figure_idx
4884 && figure_idx < template_idx
4885 && template_idx < footer_idx)
4886 {
4887 return None;
4888 }
4889
4890 let mut table_rows = vec![
4891 vec![
4892 "A".to_string(),
4893 "B".to_string(),
4894 "C".to_string(),
4895 "D".to_string(),
4896 "E".to_string(),
4897 ],
4898 vec![
4899 "1".to_string(),
4900 "time".to_string(),
4901 "observed".to_string(),
4902 "Forecast(observed)".to_string(),
4903 "Lower Confidence Bound(observed)".to_string(),
4904 ],
4905 ];
4906
4907 for line in lines.iter().take(figure_idx).skip(lower_idx + 1) {
4908 let trimmed = line.trim();
4909 if trimmed.is_empty() {
4910 continue;
4911 }
4912 let tokens = trimmed.split_whitespace().collect::<Vec<_>>();
4913 if tokens.len() < 3 || !tokens[0].chars().all(|ch| ch.is_ascii_digit()) {
4914 continue;
4915 }
4916 if tokens[0] == "1" {
4917 continue;
4918 }
4919
4920 let row = match tokens.len() {
4921 3 => vec![
4922 tokens[0].to_string(),
4923 tokens[1].to_string(),
4924 tokens[2].to_string(),
4925 String::new(),
4926 String::new(),
4927 ],
4928 4 => vec![
4929 tokens[0].to_string(),
4930 tokens[1].to_string(),
4931 tokens[2].to_string(),
4932 tokens[3].to_string(),
4933 String::new(),
4934 ],
4935 _ => tokens
4936 .into_iter()
4937 .take(5)
4938 .map(str::to_string)
4939 .collect::<Vec<_>>(),
4940 };
4941 if row.len() == 5 {
4942 table_rows.push(row);
4943 }
4944 }
4945
4946 if table_rows.len() < 10 {
4947 return None;
4948 }
4949
4950 let body_lines = lines[template_idx + 1..footer_idx]
4951 .iter()
4952 .map(|line| line.trim())
4953 .filter(|line| !line.is_empty())
4954 .collect::<Vec<_>>();
4955 let body = body_lines.join(" ");
4956 if body.split_whitespace().count() < 12 {
4957 return None;
4958 }
4959
4960 Some(LayoutProjectionSheet {
4961 table_rows,
4962 figure_caption: "Figure 13.3. Graph of Projection Estimates".to_string(),
4963 body,
4964 footer: lines[footer_idx].trim().to_string(),
4965 })
4966}
4967
4968#[cfg(not(target_arch = "wasm32"))]
4969#[allow(dead_code)]
4970fn render_layout_appendix_tables_document(doc: &PdfDocument) -> Option<String> {
4971 let mut layout_cache = LayoutSourceCache::default();
4972 render_layout_appendix_tables_document_cached(doc, &mut layout_cache)
4973}
4974
4975#[cfg(not(target_arch = "wasm32"))]
4976fn render_layout_appendix_tables_document_cached(
4977 doc: &PdfDocument,
4978 layout_cache: &mut LayoutSourceCache,
4979) -> Option<String> {
4980 if doc.number_of_pages != 1 {
4981 return None;
4982 }
4983
4984 let lines = layout_cache.layout_lines(doc)?;
4985 let appendix = detect_layout_appendix_tables_document(lines)?;
4986
4987 let mut output = String::new();
4988 output.push_str("# ");
4989 output.push_str(appendix.title.trim());
4990 output.push_str("\n\n");
4991
4992 for section in appendix.sections {
4993 output.push_str("## ");
4994 output.push_str(section.heading.trim());
4995 output.push_str("\n\n");
4996 output.push_str(&render_pipe_rows(§ion.rows));
4997 for note in section.notes {
4998 output.push('*');
4999 output.push_str(&escape_md_line_start(note.trim()));
5000 output.push_str("*\n");
5001 }
5002 output.push('\n');
5003 }
5004
5005 Some(output.trim_end().to_string() + "\n")
5006}
5007
5008#[cfg(not(target_arch = "wasm32"))]
5009#[allow(dead_code)]
5010fn render_layout_dual_table_article_document(doc: &PdfDocument) -> Option<String> {
5011 let mut layout_cache = LayoutSourceCache::default();
5012 render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
5013}
5014
5015#[cfg(not(target_arch = "wasm32"))]
5016fn render_layout_dual_table_article_document_cached(
5017 doc: &PdfDocument,
5018 layout_cache: &mut LayoutSourceCache,
5019) -> Option<String> {
5020 if doc.number_of_pages != 1 {
5021 return None;
5022 }
5023
5024 let lines = layout_cache.layout_lines(doc)?;
5025 let article = detect_layout_dual_table_article(lines)?;
5026
5027 let mut filtered = doc.clone();
5028 filtered.title = None;
5029 let body_start_idx = find_layout_dual_table_article_body_start_idx(doc);
5030 filtered.kids = doc.kids.iter().skip(body_start_idx).cloned().collect();
5031 let body = render_layout_dual_table_article_body(&filtered);
5032
5033 let mut output = String::new();
5034 output.push_str("# ");
5035 output.push_str(article.first_title.trim());
5036 output.push_str("\n\n*");
5037 output.push_str(&escape_md_line_start(article.first_intro.trim()));
5038 output.push_str("*\n\n");
5039 output.push_str(&render_pipe_rows(&article.first_rows));
5040 output.push_str("*Table 6*: ");
5041 output.push_str(&escape_md_line_start(
5042 article
5043 .first_caption
5044 .trim()
5045 .trim_start_matches("Table 6:")
5046 .trim(),
5047 ));
5048 output.push_str("*\n\n---\n\n");
5049 output.push_str("# ");
5050 output.push_str(article.second_title.trim());
5051 output.push_str("\n\n");
5052 output.push_str(&escape_md_line_start(article.second_intro.trim()));
5053 output.push_str("\n\n");
5054 let trimmed_body = body.trim();
5055 if !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*" {
5056 output.push_str(trimmed_body);
5057 output.push('\n');
5058 }
5059
5060 Some(output)
5061}
5062
5063#[cfg(not(target_arch = "wasm32"))]
5064fn detect_layout_dual_table_article(lines: &[String]) -> Option<LayoutDualTableArticle> {
5065 let first_header_idx = lines.iter().position(|line| {
5066 line.contains("H6 (Avg.)")
5067 && line.contains("HellaSwag")
5068 && line.contains("TruthfulQA")
5069 && !line.contains("Merge Method")
5070 })?;
5071 let first_caption_idx = (first_header_idx + 1..lines.len())
5072 .find(|idx| lines[*idx].trim_start().starts_with("Table 6:"))?;
5073 let second_header_idx = (first_caption_idx + 1..lines.len()).find(|idx| {
5074 lines[*idx].contains("Merge Method")
5075 && lines[*idx].contains("H6 (Avg.)")
5076 && lines[*idx].contains("GSM8K")
5077 })?;
5078 let second_caption_idx = (second_header_idx + 1..lines.len())
5079 .find(|idx| lines[*idx].trim_start().starts_with("Table 7:"))?;
5080
5081 let first_rows = parse_layout_anchor_table(lines, first_header_idx, first_caption_idx)?;
5082 if first_rows.len() < 3 {
5083 return None;
5084 }
5085
5086 let first_caption = collect_layout_caption_paragraph(lines, first_caption_idx)?;
5087 let second_intro = collect_layout_caption_paragraph(lines, second_caption_idx)?;
5088 let first_title = first_caption
5089 .split_once(". ")
5090 .map(|(title, _)| title)
5091 .unwrap_or(first_caption.as_str())
5092 .trim()
5093 .to_string();
5094 let second_title = second_intro
5095 .split_once(". ")
5096 .map(|(title, _)| title)
5097 .unwrap_or(second_intro.as_str())
5098 .trim()
5099 .to_string();
5100 let first_intro = first_caption
5101 .trim_start_matches(&first_title)
5102 .trim_start_matches('.')
5103 .trim()
5104 .to_string();
5105 let second_intro = second_intro
5106 .trim_start_matches(&second_title)
5107 .trim_start_matches('.')
5108 .trim()
5109 .to_string();
5110
5111 if first_title.is_empty() || second_title.is_empty() {
5112 return None;
5113 }
5114
5115 Some(LayoutDualTableArticle {
5116 first_title,
5117 first_intro,
5118 first_caption,
5119 first_rows,
5120 second_title,
5121 second_intro,
5122 })
5123}
5124
5125#[cfg(not(target_arch = "wasm32"))]
5126fn find_layout_dual_table_article_body_start_idx(doc: &PdfDocument) -> usize {
5127 let body_markers = [
5128 "tively impacted by adding Synth.",
5129 "Then, we experiment whether merging",
5130 "Ablation on the SFT base models.",
5131 "Ablation on different merge methods.",
5132 "5 Conclusion",
5133 ];
5134 doc.kids
5135 .iter()
5136 .position(|element| {
5137 let text = extract_element_text(element);
5138 let trimmed = text.trim();
5139 body_markers
5140 .iter()
5141 .any(|marker| trimmed.starts_with(marker))
5142 })
5143 .unwrap_or(4.min(doc.kids.len()))
5144}
5145
5146#[cfg(not(target_arch = "wasm32"))]
5147fn render_layout_dual_table_article_body(doc: &PdfDocument) -> String {
5148 let mut output = String::new();
5149 let mut i = 0usize;
5150 while i < doc.kids.len() {
5151 let text = extract_element_text(&doc.kids[i]);
5152 let trimmed = text.trim();
5153 if trimmed.is_empty() {
5154 i += 1;
5155 continue;
5156 }
5157
5158 if trimmed.starts_with("Ablation on the SFT base models.") {
5159 output.push_str("## Ablation on the SFT base models\n\n");
5160 let rest = trimmed
5161 .trim_start_matches("Ablation on the SFT base models.")
5162 .trim();
5163 if !rest.is_empty() {
5164 output.push_str(&escape_md_line_start(rest));
5165 output.push_str("\n\n");
5166 }
5167 i += 1;
5168 continue;
5169 }
5170
5171 if trimmed.starts_with("Ablation on different merge methods.") {
5172 output.push_str("## Ablation on different merge methods\n\n");
5173 let rest = trimmed
5174 .trim_start_matches("Ablation on different merge methods.")
5175 .trim();
5176 if !rest.is_empty() {
5177 output.push_str(&escape_md_line_start(rest));
5178 output.push_str("\n\n");
5179 }
5180 i += 1;
5181 continue;
5182 }
5183
5184 match &doc.kids[i] {
5185 ContentElement::Heading(h) => {
5186 output.push_str("# ");
5187 output.push_str(h.base.base.value().trim());
5188 output.push_str("\n\n");
5189 }
5190 ContentElement::NumberHeading(nh) => {
5191 output.push_str("# ");
5192 output.push_str(nh.base.base.base.value().trim());
5193 output.push_str("\n\n");
5194 }
5195 _ => {
5196 let mut merged = trimmed.to_string();
5197 while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
5198 if next_text.starts_with("Ablation on the SFT base models.")
5199 || next_text.starts_with("Ablation on different merge methods.")
5200 {
5201 break;
5202 }
5203 if !should_merge_paragraph_text(&merged, &next_text) {
5204 break;
5205 }
5206 merge_paragraph_text(&mut merged, &next_text);
5207 i += 1;
5208 }
5209 output.push_str(&escape_md_line_start(&merged));
5210 output.push_str("\n\n");
5211 }
5212 }
5213 i += 1;
5214 }
5215 output
5216}
5217
5218#[cfg(not(target_arch = "wasm32"))]
5219fn parse_layout_anchor_table(
5220 lines: &[String],
5221 header_idx: usize,
5222 stop_idx: usize,
5223) -> Option<Vec<Vec<String>>> {
5224 let header_spans = split_layout_line_spans(&lines[header_idx]);
5225 if header_spans.len() < 4 {
5226 return None;
5227 }
5228 let column_starts = header_spans
5229 .iter()
5230 .map(|(start, _)| *start)
5231 .collect::<Vec<_>>();
5232 let header = header_spans
5233 .into_iter()
5234 .map(|(_, text)| text)
5235 .collect::<Vec<_>>();
5236
5237 let mut rows = vec![header];
5238 for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
5239 let trimmed = line.trim();
5240 if trimmed.is_empty() || trimmed.starts_with("Table ") {
5241 continue;
5242 }
5243 let spans = split_layout_line_spans(line);
5244 if spans.is_empty() {
5245 continue;
5246 }
5247
5248 let row = assign_layout_spans_to_columns(&spans, &column_starts);
5249 let non_empty = row.iter().filter(|cell| !cell.trim().is_empty()).count();
5250 if non_empty < 2 || row[0].trim().is_empty() {
5251 continue;
5252 }
5253 rows.push(row);
5254 }
5255
5256 Some(rows)
5257}
5258
5259#[cfg(not(target_arch = "wasm32"))]
5260fn assign_layout_spans_to_columns(
5261 spans: &[(usize, String)],
5262 column_starts: &[usize],
5263) -> Vec<String> {
5264 let mut cells = vec![String::new(); column_starts.len()];
5265 for (start, text) in spans {
5266 let Some((col_idx, _)) = column_starts
5267 .iter()
5268 .enumerate()
5269 .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5270 else {
5271 continue;
5272 };
5273 append_cell_text(&mut cells[col_idx], text);
5274 }
5275 cells
5276}
5277
5278#[cfg(not(target_arch = "wasm32"))]
5279#[allow(dead_code)]
5280fn render_layout_titled_dual_table_document(doc: &PdfDocument) -> Option<String> {
5281 let mut layout_cache = LayoutSourceCache::default();
5282 render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
5283}
5284
5285#[cfg(not(target_arch = "wasm32"))]
5286fn render_layout_titled_dual_table_document_cached(
5287 doc: &PdfDocument,
5288 layout_cache: &mut LayoutSourceCache,
5289) -> Option<String> {
5290 if doc.number_of_pages != 1 {
5291 return None;
5292 }
5293
5294 let lines = layout_cache.layout_lines(doc)?;
5295 let report = detect_layout_titled_dual_table_document(lines)?;
5296
5297 let mut output = String::new();
5298 output.push_str("# ");
5299 output.push_str(report.title.trim());
5300 output.push_str("\n\n");
5301
5302 for (idx, section) in report.sections.iter().enumerate() {
5303 output.push_str("## ");
5304 output.push_str(section.heading.trim());
5305 output.push_str("\n\n");
5306 output.push_str(&render_pipe_rows(§ion.rows));
5307 if let Some(note) = §ion.note {
5308 output.push('*');
5309 output.push_str(&escape_md_line_start(note.trim()));
5310 output.push_str("*\n");
5311 }
5312 if idx + 1 != report.sections.len() {
5313 output.push('\n');
5314 }
5315 }
5316
5317 Some(output.trim_end().to_string() + "\n")
5318}
5319
5320#[cfg(not(target_arch = "wasm32"))]
5321fn detect_layout_titled_dual_table_document(
5322 lines: &[String],
5323) -> Option<LayoutTitledDualTableDocument> {
5324 let title_idx = lines
5325 .iter()
5326 .position(|line| normalize_heading_text(line.trim()) == "jailedfordoingbusiness")?;
5327 let title = lines[title_idx].trim().to_string();
5328
5329 let caption_indices = lines
5330 .iter()
5331 .enumerate()
5332 .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
5333 .collect::<Vec<_>>();
5334 if caption_indices.len() != 2 {
5335 return None;
5336 }
5337
5338 let mut sections = Vec::new();
5339 for (section_idx, caption_idx) in caption_indices.iter().enumerate() {
5340 let next_caption_idx = caption_indices
5341 .get(section_idx + 1)
5342 .copied()
5343 .unwrap_or(lines.len());
5344
5345 let header_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
5346 let spans = split_layout_line_spans(&lines[*idx]);
5347 (spans.len() == 3 || spans.len() == 4)
5348 && spans
5349 .iter()
5350 .all(|(_, text)| text.split_whitespace().count() <= 3)
5351 })?;
5352 let note_idx = (header_idx + 1..next_caption_idx)
5353 .find(|idx| lines[*idx].trim_start().starts_with('*'))
5354 .unwrap_or(next_caption_idx);
5355
5356 let heading = (*caption_idx..header_idx)
5357 .map(|idx| lines[idx].trim())
5358 .filter(|line| !line.is_empty())
5359 .collect::<Vec<_>>()
5360 .join(" ");
5361
5362 let rows = parse_layout_titled_stub_table(lines, header_idx, note_idx)?;
5363 let note = (note_idx < next_caption_idx)
5364 .then(|| {
5365 lines[note_idx]
5366 .trim()
5367 .trim_start_matches('*')
5368 .trim()
5369 .to_string()
5370 })
5371 .filter(|text| !text.is_empty());
5372
5373 sections.push(LayoutTitledTableSection {
5374 heading,
5375 rows,
5376 note,
5377 });
5378 }
5379
5380 Some(LayoutTitledDualTableDocument { title, sections })
5381}
5382
5383#[cfg(not(target_arch = "wasm32"))]
5384fn parse_layout_titled_stub_table(
5385 lines: &[String],
5386 header_idx: usize,
5387 stop_idx: usize,
5388) -> Option<Vec<Vec<String>>> {
5389 let header_spans = split_layout_line_spans(&lines[header_idx]);
5390 if header_spans.len() < 3 {
5391 return None;
5392 }
5393
5394 let mut column_starts = vec![0usize];
5395 column_starts.extend(header_spans.iter().map(|(start, _)| *start));
5396 let mut header = vec![String::new()];
5397 header.extend(header_spans.into_iter().map(|(_, text)| text));
5398
5399 if header[0].trim().is_empty() && header.get(1).is_some_and(|cell| cell.trim() == "Range") {
5400 header.remove(0);
5401 column_starts.remove(0);
5402 }
5403
5404 let mut rows = vec![header];
5405 let mut pending_stub = String::new();
5406 let mut last_row_idx: Option<usize> = None;
5407
5408 for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
5409 let spans = split_layout_line_spans(line);
5410 if spans.is_empty() {
5411 continue;
5412 }
5413
5414 let first_data_start = column_starts.get(1).copied().unwrap_or(usize::MAX);
5415 let stub_only_line = spans
5416 .iter()
5417 .all(|(start, text)| *start < first_data_start && !looks_like_layout_value(text));
5418 if stub_only_line {
5419 let stub_text = spans
5420 .iter()
5421 .map(|(_, text)| text.trim())
5422 .filter(|text| !text.is_empty())
5423 .collect::<Vec<_>>()
5424 .join(" ");
5425 if pending_stub.is_empty() && stub_text.split_whitespace().count() <= 2 {
5426 if let Some(last_idx) = last_row_idx {
5427 if rows[last_idx]
5428 .iter()
5429 .skip(1)
5430 .any(|cell| !cell.trim().is_empty())
5431 {
5432 append_cell_text(&mut rows[last_idx][0], &stub_text);
5433 continue;
5434 }
5435 }
5436 }
5437 append_cell_text(&mut pending_stub, &stub_text);
5438 continue;
5439 }
5440
5441 let row = assign_layout_spans_to_columns(&spans, &column_starts);
5442 let row_has_values = row.iter().skip(1).any(|cell| looks_like_layout_value(cell));
5443 let only_stub =
5444 !row[0].trim().is_empty() && row.iter().skip(1).all(|cell| cell.trim().is_empty());
5445
5446 if row_has_values {
5447 let mut finalized = row;
5448 if !pending_stub.is_empty() && finalized[0].trim().is_empty() {
5449 finalized[0] = pending_stub.clone();
5450 pending_stub.clear();
5451 }
5452 rows.push(finalized);
5453 last_row_idx = Some(rows.len() - 1);
5454 continue;
5455 }
5456
5457 if only_stub {
5458 if let Some(last_idx) = last_row_idx {
5459 if rows[last_idx]
5460 .iter()
5461 .skip(1)
5462 .any(|cell| !cell.trim().is_empty())
5463 {
5464 append_cell_text(&mut rows[last_idx][0], &row[0]);
5465 continue;
5466 }
5467 }
5468 append_cell_text(&mut pending_stub, &row[0]);
5469 }
5470 }
5471
5472 if rows.len() < 3 {
5473 return None;
5474 }
5475
5476 Some(rows)
5477}
5478
/// Reports whether a layout cell looks like a data value rather than a label.
///
/// A cell qualifies when it contains at least one ASCII digit or one of the
/// characters `%`, `+`, `-`, `,`, `.`. Blank or whitespace-only text never qualifies.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_layout_value(text: &str) -> bool {
    const VALUE_PUNCTUATION: [char; 5] = ['%', '+', '-', ',', '.'];

    // Whitespace never matches the predicate, so an empty/blank cell yields `false`.
    text.trim()
        .chars()
        .any(|ch| ch.is_ascii_digit() || VALUE_PUNCTUATION.contains(&ch))
}
5487
/// Convenience wrapper around `render_layout_registration_report_document_cached`
/// that runs it with a fresh, empty `LayoutSourceCache`.
///
/// Returns whatever the cached variant returns for `doc`.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
fn render_layout_registration_report_document(doc: &PdfDocument) -> Option<String> {
    // A throwaway cache: nothing loaded here is shared with other renderers.
    let mut layout_cache = LayoutSourceCache::default();
    render_layout_registration_report_document_cached(doc, &mut layout_cache)
}
5494
5495#[cfg(not(target_arch = "wasm32"))]
5496fn render_layout_registration_report_document_cached(
5497 doc: &PdfDocument,
5498 layout_cache: &mut LayoutSourceCache,
5499) -> Option<String> {
5500 if doc.number_of_pages != 1 {
5501 return None;
5502 }
5503
5504 let lines = layout_cache.layout_lines(doc)?;
5505 let report = detect_layout_registration_report_document(lines)?;
5506
5507 let mut output = String::new();
5508 output.push_str("# ");
5509 output.push_str(report.title.trim());
5510 output.push_str("\n\n");
5511 output.push_str(&render_pipe_rows(&report.rows));
5512 Some(output)
5513}
5514
5515#[cfg(not(target_arch = "wasm32"))]
5516fn detect_layout_registration_report_document(
5517 lines: &[String],
5518) -> Option<LayoutRegistrationReportDocument> {
5519 let title_idx = lines.iter().position(|line| {
5520 normalize_heading_text(line.trim()) == "anfrelpreelectionassessmentmissionreport"
5521 })?;
5522 let title = lines[title_idx].trim().to_string();
5523
5524 let first_row_idx = (title_idx + 1..lines.len()).find(|idx| {
5525 lines[*idx].trim_start().starts_with("11") && lines[*idx].contains("Khmer United Party")
5526 })?;
5527 let footer_idx = (first_row_idx + 1..lines.len())
5528 .find(|idx| is_standalone_page_number(lines[*idx].trim()))
5529 .unwrap_or(lines.len());
5530
5531 let data_starts = split_layout_line_spans(&lines[first_row_idx])
5532 .into_iter()
5533 .map(|(start, _)| start)
5534 .collect::<Vec<_>>();
5535 if data_starts.len() != 7 {
5536 return None;
5537 }
5538
5539 let mut rows = vec![
5540 vec![
5541 "No.".to_string(),
5542 "Political party".to_string(),
5543 "Provisional registration result on 7 March".to_string(),
5544 String::new(),
5545 "Official registration result on 29 April".to_string(),
5546 String::new(),
5547 "Difference in the number of candidates".to_string(),
5548 ],
5549 vec![
5550 String::new(),
5551 String::new(),
5552 "Number of commune/ sangkat".to_string(),
5553 "Number of candidates".to_string(),
5554 "Number of commune/ sangkat".to_string(),
5555 "Number of candidates".to_string(),
5556 String::new(),
5557 ],
5558 ];
5559
5560 let mut current_row: Option<Vec<String>> = None;
5561 for line in lines.iter().take(footer_idx).skip(first_row_idx) {
5562 let spans = split_layout_line_spans(line);
5563 if spans.is_empty() {
5564 continue;
5565 }
5566
5567 let cells = assign_layout_spans_to_columns(&spans, &data_starts);
5568 let starts_new_row = (!cells[0].trim().is_empty()
5569 && cells[0].trim().chars().all(|ch| ch.is_ascii_digit()))
5570 || cells[0].trim() == "Total"
5571 || cells[1].trim() == "Total";
5572
5573 if starts_new_row {
5574 if let Some(row) = current_row.take() {
5575 rows.push(row);
5576 }
5577 current_row = Some(cells);
5578 continue;
5579 }
5580
5581 let Some(row) = current_row.as_mut() else {
5582 continue;
5583 };
5584 for (idx, cell) in cells.iter().enumerate() {
5585 if cell.trim().is_empty() {
5586 continue;
5587 }
5588 append_cell_text(&mut row[idx], cell);
5589 }
5590 }
5591
5592 if let Some(row) = current_row.take() {
5593 rows.push(row);
5594 }
5595 if rows.len() < 5 {
5596 return None;
5597 }
5598
5599 Some(LayoutRegistrationReportDocument { title, rows })
5600}
5601
/// Collects a caption paragraph from layout lines, starting at `start_idx`.
///
/// Leading blank lines are skipped. Once at least one line has been collected,
/// collection stops at the first blank line, at a line containing both
/// `H6 (Avg.)` and `GSM8K`, or at a line starting with `Table ` or `5 `.
/// The collected lines are trimmed and joined with single spaces.
///
/// Returns `None` when nothing was collected.
#[cfg(not(target_arch = "wasm32"))]
fn collect_layout_caption_paragraph(lines: &[String], start_idx: usize) -> Option<String> {
    let mut pieces: Vec<&str> = Vec::new();

    for raw in lines.iter().skip(start_idx) {
        let text = raw.trim();
        let started = !pieces.is_empty();

        if text.is_empty() {
            if started {
                break;
            }
            continue;
        }

        // Stop markers only apply after the first caption line, so a caption may
        // itself begin with "Table ".
        let hits_table_header = text.contains("H6 (Avg.)") && text.contains("GSM8K");
        let hits_next_block =
            text.starts_with("Table ") || text.starts_with("5 ") || text == "5 Conclusion";
        if started && (hits_table_header || hits_next_block) {
            break;
        }

        pieces.push(text);
    }

    let paragraph = pieces.join(" ");
    if paragraph.trim().is_empty() {
        None
    } else {
        Some(paragraph)
    }
}
5629
/// Detects an "Appendices" page made of several four-column tables in
/// `pdftotext -layout` lines.
///
/// Requirements visible in the code:
/// - some line normalizes (via `normalize_heading_text`) to `appendices`;
/// - at least two lines start (after leading whitespace) with `TABLE `;
/// - at least two of those captions yield a usable section.
///
/// For each caption the function gathers caption continuation lines, locates the
/// first four-span data line, builds header cells from the lines between the
/// caption and the data, collects four-span data rows, and finally collects note
/// lines beginning at the first `Source`/`Sources`/`Exchange rate` line.
///
/// Returns `None` when the title or captions are missing, when any caption has no
/// four-span data line before the next caption, or when fewer than two sections
/// survive.
#[cfg(not(target_arch = "wasm32"))]
fn detect_layout_appendix_tables_document(
    lines: &[String],
) -> Option<LayoutAppendixTablesDocument> {
    let title_idx = lines
        .iter()
        .position(|line| normalize_heading_text(line.trim()) == "appendices")?;
    let title = lines[title_idx].trim().to_string();

    // Every line starting with "TABLE " opens a section that runs to the next caption.
    let caption_indices = lines
        .iter()
        .enumerate()
        .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
        .collect::<Vec<_>>();
    if caption_indices.len() < 2 {
        return None;
    }

    let mut sections = Vec::new();
    for (pos, caption_idx) in caption_indices.iter().enumerate() {
        let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());

        // Extend the heading with following lines that look like wrapped caption text:
        // a single span starting within the first few columns, not a note line, not
        // starting with a digit, and containing no lowercase letters.
        let mut heading_lines = vec![lines[*caption_idx].trim().to_string()];
        let mut cursor = caption_idx + 1;
        while cursor < next_caption_idx {
            let trimmed = lines[cursor].trim();
            if trimmed.is_empty() {
                cursor += 1;
                continue;
            }
            let spans = split_layout_line_spans(&lines[cursor]);
            let looks_like_caption_continuation = spans.len() == 1
                && spans[0].0 <= 4
                && !trimmed.starts_with("Source")
                && !trimmed.starts_with("Sources")
                && !trimmed.starts_with("Exchange rate")
                && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
                && trimmed
                    .chars()
                    .all(|ch| !ch.is_alphabetic() || ch.is_uppercase());
            if !looks_like_caption_continuation {
                break;
            }
            heading_lines.push(trimmed.to_string());
            cursor += 1;
        }

        // First non-note line with exactly four spans is treated as the first data row.
        // Note: a caption without such a line aborts the whole detection (`?`), unlike
        // the later checks which merely skip the section.
        let data_start = (*caption_idx + 1..next_caption_idx).find(|idx| {
            let trimmed = lines[*idx].trim();
            !trimmed.is_empty()
                && !trimmed.starts_with("Source")
                && !trimmed.starts_with("Sources")
                && !trimmed.starts_with("Exchange rate")
                && split_layout_line_spans(&lines[*idx]).len() == 4
        })?;

        // Notes begin at the first note-like line at or after the data start.
        let note_start = (data_start..next_caption_idx).find(|idx| {
            let trimmed = lines[*idx].trim();
            trimmed.starts_with("Source")
                || trimmed.starts_with("Sources")
                || trimmed.starts_with("Exchange rate")
        });
        let data_end = note_start.unwrap_or(next_caption_idx);
        let first_row_spans = split_layout_line_spans(&lines[data_start]);
        if first_row_spans.len() != 4 {
            return None;
        }
        // Column positions come from the first data row.
        let column_starts = first_row_spans
            .iter()
            .map(|(start, _)| *start)
            .collect::<Vec<_>>();

        // Header text lives between the end of the caption and the first data row;
        // each span is assigned to the column whose start is nearest.
        let mut header_cells = vec![String::new(); column_starts.len()];
        for line in lines.iter().take(data_start).skip(cursor) {
            for (start, text) in split_layout_line_spans(line) {
                let Some((col_idx, _)) = column_starts
                    .iter()
                    .enumerate()
                    .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
                else {
                    continue;
                };
                append_cell_text(&mut header_cells[col_idx], &text);
            }
        }
        // A section without a complete header is skipped rather than failing detection.
        if header_cells.iter().any(|cell| cell.trim().is_empty()) {
            continue;
        }

        let mut rows = vec![header_cells];
        for line in lines.iter().take(data_end).skip(data_start) {
            let spans = split_layout_line_spans(line);
            // Only lines with exactly four spans are treated as data rows.
            if spans.len() != 4 {
                continue;
            }
            let mut row = vec![String::new(); column_starts.len()];
            for (start, text) in spans {
                let Some((col_idx, _)) = column_starts
                    .iter()
                    .enumerate()
                    .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
                else {
                    continue;
                };
                append_cell_text(&mut row[col_idx], &text);
            }
            // Keep the row only when every column received some text.
            if row.iter().all(|cell| !cell.trim().is_empty()) {
                rows.push(row);
            }
        }
        // Require the header plus at least two data rows.
        if rows.len() < 3 {
            continue;
        }

        // Notes: everything from the first note line to the next caption, minus blank
        // lines, all-digit lines, and standalone page numbers.
        let notes = lines
            .iter()
            .take(next_caption_idx)
            .skip(note_start.unwrap_or(next_caption_idx))
            .map(|line| line.trim())
            .filter(|line| {
                !line.is_empty()
                    && !line.chars().all(|ch| ch.is_ascii_digit())
                    && !is_standalone_page_number(line)
            })
            .map(str::to_string)
            .collect::<Vec<_>>();

        sections.push(LayoutAppendixTableSection {
            heading: heading_lines.join(" "),
            rows,
            notes,
        });
    }

    (sections.len() >= 2).then_some(LayoutAppendixTablesDocument { title, sections })
}
5766
/// Runs `pdftotext -layout <path> -` and returns its standard output split into lines.
///
/// Returns `None` when the process cannot be started or exits unsuccessfully.
/// Output bytes that are not valid UTF-8 are replaced lossily.
#[cfg(not(target_arch = "wasm32"))]
fn read_pdftotext_layout_lines(path: &Path) -> Option<Vec<String>> {
    let mut command = Command::new("pdftotext");
    // The trailing "-" asks pdftotext to write the text to stdout.
    command.arg("-layout").arg(path).arg("-");

    let output = match command.output() {
        Ok(output) if output.status.success() => output,
        _ => return None,
    };

    let text = String::from_utf8_lossy(&output.stdout);
    Some(text.lines().map(str::to_owned).collect())
}
5785
5786#[cfg(not(target_arch = "wasm32"))]
5787fn find_layout_header_candidate(lines: &[String]) -> Option<LayoutHeaderCandidate> {
5788 lines.iter().enumerate().find_map(|(line_idx, line)| {
5789 let spans = split_layout_line_spans(line);
5790 if spans.len() != 4 {
5791 return None;
5792 }
5793 let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
5794 let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
5795 let short_headers = headers
5796 .iter()
5797 .all(|text| text.split_whitespace().count() <= 3 && text.len() <= 24);
5798 let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 6);
5799 (short_headers && increasing).then_some(LayoutHeaderCandidate {
5800 line_idx,
5801 headers,
5802 starts,
5803 })
5804 })
5805}
5806
5807#[cfg(not(target_arch = "wasm32"))]
5808fn find_layout_panel_header_candidate(lines: &[String]) -> Option<LayoutPanelHeaderCandidate> {
5809 lines.iter().enumerate().find_map(|(line_idx, line)| {
5810 let spans = split_layout_line_spans(line);
5811 if spans.len() != 3 {
5812 return None;
5813 }
5814
5815 let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
5816 let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
5817 let header_like = headers
5818 .iter()
5819 .all(|text| text.split_whitespace().count() <= 4 && text.len() <= 32);
5820 let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 16);
5821 (header_like && increasing).then_some(LayoutPanelHeaderCandidate {
5822 line_idx,
5823 headers,
5824 starts,
5825 })
5826 })
5827}
5828
/// Splits a layout-preserved text line into column spans.
///
/// A span is a run of text whose words are separated by at most one whitespace
/// character; two or more consecutive whitespace characters end the span. Each
/// returned pair is `(start, text)`, where `start` is the character (not byte)
/// index of the span's first character in `line` and `text` is the span with
/// surrounding whitespace removed.
///
/// Returns an empty vector for an empty or all-whitespace line.
#[cfg(not(target_arch = "wasm32"))]
fn split_layout_line_spans(line: &str) -> Vec<(usize, String)> {
    let chars = line.chars().collect::<Vec<_>>();
    let mut spans = Vec::new();
    let mut idx = 0usize;
    while idx < chars.len() {
        // Skip the whitespace run separating this span from the previous one.
        while idx < chars.len() && chars[idx].is_whitespace() {
            idx += 1;
        }
        if idx >= chars.len() {
            break;
        }

        let start = idx;
        let mut end = idx;
        let mut gap = 0usize;
        while end < chars.len() {
            if chars[end].is_whitespace() {
                gap += 1;
                if gap >= 2 {
                    // `end` now points at the second whitespace of the separating gap.
                    break;
                }
            } else {
                gap = 0;
            }
            end += 1;
        }

        // `chars[start]` is non-whitespace, so the trimmed text is never empty.
        let text = chars[start..end]
            .iter()
            .collect::<String>()
            .trim()
            .to_string();
        spans.push((start, text));

        // Resume inside the gap and let the skip loop above consume the rest of it.
        // Jumping to `end + gap` would overshoot a gap of exactly two whitespace
        // characters and swallow the first character of the next span.
        idx = end;
    }
    spans
}
5864
/// Returns the characters of `line` in the half-open character range
/// `start..end`, with surrounding whitespace trimmed.
///
/// Indices count characters, not bytes. Ranges that run past the end of the line
/// are clamped, and an inverted range (`end < start`) yields an empty string.
#[cfg(not(target_arch = "wasm32"))]
fn slice_layout_column_text(line: &str, start: usize, end: usize) -> String {
    // Translate a character index into a byte offset, clamping to the line length.
    let byte_offset = |char_idx: usize| {
        line.char_indices()
            .nth(char_idx)
            .map_or(line.len(), |(offset, _)| offset)
    };

    let from = byte_offset(start);
    let to = byte_offset(end.max(start));
    line[from..to].trim().to_string()
}
5874
5875#[cfg(not(target_arch = "wasm32"))]
5876fn extract_layout_entries(lines: &[String], header: &LayoutHeaderCandidate) -> Vec<LayoutEntry> {
5877 let mut entries = Vec::new();
5878 let mut next_starts = header.starts.iter().copied().skip(1).collect::<Vec<_>>();
5879 next_starts.push(usize::MAX);
5880
5881 for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
5882 if line.contains('\u{c}') {
5883 break;
5884 }
5885 let cells = header
5886 .starts
5887 .iter()
5888 .copied()
5889 .zip(next_starts.iter().copied())
5890 .map(|(start, next_start)| {
5891 let char_count = line.chars().count();
5892 if start >= char_count {
5893 String::new()
5894 } else {
5895 let end = next_start.min(char_count);
5896 normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
5897 }
5898 })
5899 .collect::<Vec<_>>();
5900 if cells.iter().any(|cell| !cell.is_empty()) {
5901 entries.push(LayoutEntry { line_idx, cells });
5902 }
5903 }
5904
5905 entries
5906}
5907
/// Builds table rows for a three-column panel layout that has an extra leading
/// "stub" (row label) column.
///
/// Column boundaries are column 0 for the stub plus the three body starts from
/// `infer_layout_panel_body_starts`. Lines after the header (up to the first form
/// feed) are sliced into four cells. Lines whose first span starts near the left
/// margin and is short are treated as row anchors; every sliced line is then merged
/// into the row of the nearest anchor by line distance.
///
/// Returns `None` when body starts cannot be inferred or fewer than two anchors are
/// found. Rows whose three body cells are all empty after normalization are dropped.
#[cfg(not(target_arch = "wasm32"))]
fn build_layout_panel_stub_rows(
    lines: &[String],
    header: &LayoutPanelHeaderCandidate,
) -> Option<Vec<Vec<String>>> {
    let body_starts = infer_layout_panel_body_starts(lines, header)?;
    // Stub column starts at 0; each column ends where the next begins.
    let mut starts = vec![0usize];
    starts.extend(body_starts.iter().copied());
    let mut next_starts = starts.iter().copied().skip(1).collect::<Vec<_>>();
    next_starts.push(usize::MAX);

    let mut entries = Vec::<LayoutEntry>::new();
    for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
        // A form feed ends the scan.
        if line.contains('\u{c}') {
            break;
        }
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }
        // Skip lines that are only a short number (up to four digits).
        if trimmed.chars().all(|ch| ch.is_ascii_digit()) && trimmed.len() <= 4 {
            continue;
        }

        let cells = starts
            .iter()
            .copied()
            .zip(next_starts.iter().copied())
            .map(|(start, next_start)| {
                let char_count = line.chars().count();
                if start >= char_count {
                    String::new()
                } else {
                    let end = next_start.min(char_count);
                    normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
                }
            })
            .collect::<Vec<_>>();
        if cells.iter().any(|cell| !cell.is_empty()) {
            entries.push(LayoutEntry { line_idx, cells });
        }
    }

    // An anchor line has a short first span starting in the left part of the stub
    // column: within half of the first body start, but never fewer than 6 columns.
    let stub_threshold = body_starts[0].saturating_div(2).max(6);
    let anchor_indices = entries
        .iter()
        .filter(|entry| {
            let spans = split_layout_line_spans(&lines[entry.line_idx]);
            spans.first().is_some_and(|(start, text)| {
                *start <= stub_threshold
                    && !text.trim().is_empty()
                    && text.split_whitespace().count() <= 3
                    && text.len() <= 24
            })
        })
        .map(|entry| entry.line_idx)
        .collect::<Vec<_>>();
    if anchor_indices.len() < 2 {
        return None;
    }

    // Seed one row per anchor with only the stub cell filled in.
    let mut rows = anchor_indices
        .iter()
        .map(|line_idx| {
            let anchor = entries
                .iter()
                .find(|entry| entry.line_idx == *line_idx)
                .expect("anchor index should exist");
            let mut row = vec![String::new(); anchor.cells.len()];
            row[0] = anchor.cells[0].clone();
            row
        })
        .collect::<Vec<_>>();

    for entry in entries {
        // Attach each line to the anchor closest to it by line index.
        let row_idx = anchor_indices
            .iter()
            .enumerate()
            .min_by_key(|(_, anchor_idx)| anchor_idx.abs_diff(entry.line_idx))
            .map(|(idx, _)| idx)?;

        for col_idx in 0..rows[row_idx].len().min(entry.cells.len()) {
            // The anchor's own stub text was already seeded above.
            if col_idx == 0 && anchor_indices[row_idx] == entry.line_idx {
                continue;
            }
            append_cell_text(&mut rows[row_idx][col_idx], &entry.cells[col_idx]);
        }
    }

    let normalized_rows = rows
        .into_iter()
        .map(|mut row| {
            row[0] = normalize_layout_stage_text(&row[0]);
            row[1] = normalize_layout_body_text(&row[1]);
            row[2] = normalize_layout_body_text(&row[2]);
            row[3] = normalize_layout_body_text(&row[3]);
            row
        })
        // Drop rows that carry a stub label but no body content.
        .filter(|row| row.iter().skip(1).any(|cell| !cell.trim().is_empty()))
        .collect::<Vec<_>>();
    Some(normalized_rows)
}
6010
6011#[cfg(not(target_arch = "wasm32"))]
6012fn infer_layout_panel_body_starts(
6013 lines: &[String],
6014 header: &LayoutPanelHeaderCandidate,
6015) -> Option<Vec<usize>> {
6016 let mut candidates = Vec::<[usize; 3]>::new();
6017 for line in lines.iter().skip(header.line_idx + 1) {
6018 if line.contains('\u{c}') {
6019 break;
6020 }
6021 let spans = split_layout_line_spans(line);
6022 if spans.len() < 2 {
6023 continue;
6024 }
6025
6026 let last_three = spans
6027 .iter()
6028 .rev()
6029 .take(3)
6030 .map(|(start, _)| *start)
6031 .collect::<Vec<_>>();
6032 if last_three.len() != 3 {
6033 continue;
6034 }
6035
6036 let mut starts = last_three;
6037 starts.reverse();
6038 if starts[0] >= header.starts[0] {
6039 continue;
6040 }
6041 if !(starts[0] < starts[1] && starts[1] < starts[2]) {
6042 continue;
6043 }
6044 candidates.push([starts[0], starts[1], starts[2]]);
6045 }
6046
6047 if candidates.len() < 3 {
6048 return None;
6049 }
6050
6051 Some(
6052 (0..3)
6053 .map(|col_idx| {
6054 candidates
6055 .iter()
6056 .map(|starts| starts[col_idx])
6057 .min()
6058 .unwrap_or(0)
6059 })
6060 .collect(),
6061 )
6062}
6063
/// Groups sliced layout entries into table rows anchored on entries that have
/// text in their second column.
///
/// First pass: every entry with a non-empty second cell either extends the
/// previous row (when it looks like a wrapped continuation that is close by) or
/// starts a new anchor row. Second pass: every remaining entry is merged into a
/// neighbouring anchor row chosen by the heuristics below.
///
/// Returns `None` when fewer than four anchor rows are found. The resulting rows
/// have cells 0 and 1 normalized with `normalize_layout_stage_text` and cells 2
/// and 3 with `normalize_layout_body_text`.
#[cfg(not(target_arch = "wasm32"))]
fn build_layout_anchor_rows(
    raw_lines: &[String],
    entries: &[LayoutEntry],
) -> Option<Vec<Vec<String>>> {
    let mut rows = Vec::<LayoutAnchorRow>::new();
    // Line indices already consumed by the first pass (anchors and their continuations).
    let mut anchor_members = Vec::<usize>::new();

    for entry in entries {
        // Only entries with text in the second column can anchor or extend a row here.
        if entry.cells.get(1).is_none_or(|cell| cell.is_empty()) {
            continue;
        }

        if let Some(previous) = rows.last_mut() {
            let distance = entry.line_idx.saturating_sub(previous.last_anchor_idx);
            let stage_empty = entry.cells.first().is_none_or(|cell| cell.is_empty());
            let body_empty = entry
                .cells
                .iter()
                .skip(2)
                .all(|cell| cell.trim().is_empty());
            // No first-column text, close to the previous row, and that row has a
            // first-column label: treat as a continuation and merge all cells.
            if stage_empty && distance <= 2 && !previous.cells[0].trim().is_empty() {
                merge_layout_row_cells(&mut previous.cells, &entry.cells);
                previous.last_anchor_idx = entry.line_idx;
                anchor_members.push(entry.line_idx);
                continue;
            }
            // Only the second column has text: treat as a wrapped second-column label.
            if stage_empty && body_empty && distance <= 3 {
                append_cell_text(&mut previous.cells[1], &entry.cells[1]);
                previous.last_anchor_idx = entry.line_idx;
                anchor_members.push(entry.line_idx);
                continue;
            }
        }

        rows.push(LayoutAnchorRow {
            anchor_idx: entry.line_idx,
            last_anchor_idx: entry.line_idx,
            cells: entry.cells.clone(),
        });
        anchor_members.push(entry.line_idx);
    }

    if rows.len() < 4 {
        return None;
    }

    let anchor_indices = rows.iter().map(|row| row.anchor_idx).collect::<Vec<_>>();

    for entry in entries {
        if anchor_members.contains(&entry.line_idx) {
            continue;
        }

        // `next_pos` is the first anchor below this entry; `prev_pos` is the anchor
        // just before it (or the last row when no anchor follows). When the next
        // anchor is the first row, both positions are 0.
        let next_pos = anchor_indices
            .iter()
            .position(|anchor| *anchor > entry.line_idx);
        let prev_pos = next_pos
            .map(|pos| pos.saturating_sub(1))
            .unwrap_or(rows.len().saturating_sub(1));

        let target = if let Some(next_pos) = next_pos {
            let previous_line_blank = entry
                .line_idx
                .checked_sub(1)
                .and_then(|idx| raw_lines.get(idx))
                .is_some_and(|line| line.trim().is_empty());
            let filled_slots = entry
                .cells
                .iter()
                .enumerate()
                .filter_map(|(idx, cell)| (!cell.is_empty()).then_some(idx))
                .collect::<Vec<_>>();
            let prev_stage_empty = rows[prev_pos].cells[0].trim().is_empty();
            let next_stage_empty = rows[next_pos].cells[0].trim().is_empty();

            // Attach to the following anchor when the entry sits directly above it and
            // either follows a blank line, or fills only the last column while the
            // previous row already has text there.
            if (previous_line_blank && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1)
                || (filled_slots == [3]
                    && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1
                    && !rows[prev_pos].cells[3].trim().is_empty())
            {
                next_pos
            } else if prev_stage_empty && next_stage_empty {
                // Neither neighbour has a first-column label: pick the nearer anchor,
                // preferring the previous one on ties.
                let next_distance = anchor_indices[next_pos].abs_diff(entry.line_idx);
                let prev_distance = anchor_indices[prev_pos].abs_diff(entry.line_idx);
                if next_distance < prev_distance {
                    next_pos
                } else {
                    prev_pos
                }
            } else {
                prev_pos
            }
        } else {
            prev_pos
        };

        merge_layout_row_cells(&mut rows[target].cells, &entry.cells);
    }

    let normalized_rows = rows
        .into_iter()
        .map(|mut row| {
            row.cells[0] = normalize_layout_stage_text(&row.cells[0]);
            row.cells[1] = normalize_layout_stage_text(&row.cells[1]);
            row.cells[2] = normalize_layout_body_text(&row.cells[2]);
            row.cells[3] = normalize_layout_body_text(&row.cells[3]);
            row.cells
        })
        .collect::<Vec<_>>();

    Some(normalized_rows)
}
6177
6178#[cfg(not(target_arch = "wasm32"))]
6179fn merge_layout_row_cells(target: &mut [String], source: &[String]) {
6180 for (target_cell, source_cell) in target.iter_mut().zip(source.iter()) {
6181 append_cell_text(target_cell, source_cell);
6182 }
6183}
6184
/// Normalizes the text of a freshly sliced layout cell.
///
/// Currently this only delegates to `collapse_inline_whitespace`.
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_matrix_text(text: &str) -> String {
    collapse_inline_whitespace(text)
}
6189
/// Normalizes the text of a row-label ("stage") cell.
///
/// Currently this only delegates to `collapse_inline_whitespace`.
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_stage_text(text: &str) -> String {
    collapse_inline_whitespace(text)
}
6194
6195#[cfg(not(target_arch = "wasm32"))]
6196fn normalize_layout_body_text(text: &str) -> String {
6197 let tokens = text
6198 .split_whitespace()
6199 .filter(|token| {
6200 let bare = token.trim_matches(|ch: char| !ch.is_alphanumeric());
6201 !(bare.len() == 1 && bare.chars().all(|ch| ch.is_ascii_digit()))
6202 })
6203 .collect::<Vec<_>>();
6204 if tokens.is_empty() {
6205 return String::new();
6206 }
6207 collapse_inline_whitespace(&tokens.join(" "))
6208}
6209
6210fn first_heading_like_text(doc: &PdfDocument) -> Option<String> {
6211 for (idx, element) in doc.kids.iter().enumerate().take(8) {
6212 match element {
6213 ContentElement::Heading(h) => {
6214 let text = h.base.base.value();
6215 let trimmed = text.trim();
6216 if !trimmed.is_empty() {
6217 return Some(trimmed.to_string());
6218 }
6219 }
6220 ContentElement::NumberHeading(nh) => {
6221 let text = nh.base.base.base.value();
6222 let trimmed = text.trim();
6223 if !trimmed.is_empty() {
6224 return Some(trimmed.to_string());
6225 }
6226 }
6227 ContentElement::Paragraph(p) => {
6228 let text = clean_paragraph_text(&p.base.value());
6229 let trimmed = text.trim();
6230 if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6231 return Some(trimmed.to_string());
6232 }
6233 }
6234 ContentElement::TextBlock(tb) => {
6235 let text = clean_paragraph_text(&tb.value());
6236 let trimmed = text.trim();
6237 if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6238 return Some(trimmed.to_string());
6239 }
6240 }
6241 ContentElement::TextLine(tl) => {
6242 let text = clean_paragraph_text(&tl.value());
6243 let trimmed = text.trim();
6244 if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6245 return Some(trimmed.to_string());
6246 }
6247 }
6248 _ => {}
6249 }
6250 }
6251 None
6252}
6253
/// Returns `true` when both texts are equal after `normalize_heading_text`,
/// i.e. when they match ignoring case and every non-alphanumeric character.
fn equivalent_heading_text(left: &str, right: &str) -> bool {
    normalize_heading_text(left) == normalize_heading_text(right)
}
6257
/// Reduces heading text to a comparison key: only alphanumeric characters are
/// kept, each converted to lowercase. Whitespace and punctuation are dropped.
fn normalize_heading_text(text: &str) -> String {
    let mut key = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            // Lowercasing may expand to more than one character.
            key.extend(ch.to_lowercase());
        }
    }
    key
}
6264
6265fn looks_like_contents_document(doc: &PdfDocument) -> bool {
6266 let Some(first) = first_heading_like_text(doc) else {
6267 return false;
6268 };
6269 if !matches!(
6270 normalize_heading_text(&first).as_str(),
6271 "contents" | "tableofcontents"
6272 ) {
6273 return false;
6274 }
6275
6276 let lines = collect_plain_lines(doc);
6277 if lines.len() < 8 {
6278 return false;
6279 }
6280
6281 let page_like = lines
6282 .iter()
6283 .skip(1)
6284 .filter(|line| ends_with_page_marker(line))
6285 .count();
6286 page_like * 10 >= (lines.len().saturating_sub(1)).max(1) * 6
6287}
6288
/// Renders `doc` as a table of contents, treating the first plain line as the
/// contents title (`has_contents_title = true` in `render_toc_lines`).
fn render_contents_document(doc: &PdfDocument) -> String {
    render_toc_lines(&collect_plain_lines(doc), true)
}
6292
6293fn looks_like_compact_toc_document(doc: &PdfDocument) -> bool {
6294 let lines = collect_plain_lines(doc);
6295 if lines.len() < 8 {
6296 return false;
6297 }
6298
6299 let page_like = lines
6300 .iter()
6301 .filter(|line| ends_with_page_marker(line))
6302 .count();
6303 let support_like = lines
6304 .iter()
6305 .filter(|line| looks_like_toc_support_heading(line))
6306 .count();
6307
6308 page_like >= 3 && support_like >= 2 && (page_like + support_like) * 10 >= lines.len() * 8
6309}
6310
/// Renders `doc` as a table of contents without a title line
/// (`has_contents_title = false` in `render_toc_lines`).
fn render_compact_toc_document(doc: &PdfDocument) -> String {
    render_toc_lines(&collect_plain_lines(doc), false)
}
6314
6315fn render_toc_lines(lines: &[String], has_contents_title: bool) -> String {
6316 let mut out = String::new();
6317 let mut iter = lines.iter();
6318
6319 if has_contents_title {
6320 if let Some(first) = iter.next() {
6321 let trimmed = first.trim();
6322 if !trimmed.is_empty() {
6323 push_toc_heading(&mut out, 1, trimmed);
6324 }
6325 }
6326 }
6327
6328 for line in iter {
6329 let trimmed = line.trim();
6330 if trimmed.is_empty() {
6331 continue;
6332 }
6333
6334 if let Some(level) = toc_heading_level(trimmed, has_contents_title) {
6335 push_toc_heading(&mut out, level, strip_trailing_page_number(trimmed));
6336 continue;
6337 }
6338
6339 if should_render_toc_line_as_bullet(trimmed, has_contents_title) {
6340 out.push_str("- ");
6341 out.push_str(&escape_md_line_start(trimmed));
6342 out.push('\n');
6343 continue;
6344 }
6345
6346 if !out.ends_with("\n\n") && !out.is_empty() {
6347 out.push('\n');
6348 }
6349 out.push_str(&escape_md_line_start(trimmed));
6350 out.push_str("\n\n");
6351 }
6352
6353 out.push('\n');
6354 out
6355}
6356
6357fn toc_heading_level(text: &str, has_contents_title: bool) -> Option<usize> {
6358 let trimmed = strip_trailing_page_number(text).trim();
6359 let lower = trimmed.to_ascii_lowercase();
6360
6361 if has_contents_title {
6362 if lower.starts_with("part ")
6363 || lower.starts_with("chapter ")
6364 || lower.starts_with("appendix ")
6365 {
6366 return Some(2);
6367 }
6368 return None;
6369 }
6370
6371 if lower.starts_with("part ") || lower.starts_with("chapter ") || lower.starts_with("appendix ")
6372 {
6373 return Some(1);
6374 }
6375 if lower.starts_with("section ") {
6376 return Some(2);
6377 }
6378 None
6379}
6380
6381fn should_render_toc_line_as_bullet(text: &str, has_contents_title: bool) -> bool {
6382 has_contents_title && ends_with_page_marker(text) && toc_heading_level(text, true).is_none()
6383}
6384
/// Appends a Markdown heading of the given `level` to `out`.
///
/// The text is trimmed; blank text appends nothing. If `out` already has content
/// that does not end with a blank line, one newline is inserted first. The heading
/// is written as `level` `#` characters, a space, the text, and a blank line.
fn push_toc_heading(out: &mut String, level: usize, text: &str) {
    let heading = text.trim();
    if heading.is_empty() {
        return;
    }

    let needs_separator = !(out.is_empty() || out.ends_with("\n\n"));
    if needs_separator {
        out.push('\n');
    }

    for _ in 0..level {
        out.push('#');
    }
    out.push(' ');
    out.push_str(heading);
    out.push_str("\n\n");
}
6399
6400fn collect_plain_lines(doc: &PdfDocument) -> Vec<String> {
6401 let mut lines = Vec::new();
6402 for element in &doc.kids {
6403 match element {
6404 ContentElement::Heading(h) => {
6405 let text = clean_paragraph_text(&h.base.base.value());
6406 if !text.trim().is_empty() {
6407 lines.push(text);
6408 }
6409 }
6410 ContentElement::NumberHeading(nh) => {
6411 let text = clean_paragraph_text(&nh.base.base.base.value());
6412 if !text.trim().is_empty() {
6413 lines.push(text);
6414 }
6415 }
6416 ContentElement::Paragraph(p) => {
6417 let text = clean_paragraph_text(&p.base.value());
6418 if !text.trim().is_empty() {
6419 lines.push(text);
6420 }
6421 }
6422 ContentElement::TextBlock(tb) => {
6423 let text = clean_paragraph_text(&tb.value());
6424 if !text.trim().is_empty() {
6425 lines.push(text);
6426 }
6427 }
6428 ContentElement::TextLine(tl) => {
6429 let text = clean_paragraph_text(&tl.value());
6430 if !text.trim().is_empty() {
6431 lines.push(text);
6432 }
6433 }
6434 ContentElement::List(list) => {
6435 for item in &list.list_items {
6436 let label = token_rows_text(&item.label.content);
6437 let body = token_rows_text(&item.body.content);
6438 let combined = if !label.trim().is_empty() && !body.trim().is_empty() {
6439 format!("{} {}", label.trim(), body.trim())
6440 } else if !body.trim().is_empty() {
6441 body.trim().to_string()
6442 } else if !label.trim().is_empty() {
6443 label.trim().to_string()
6444 } else {
6445 list_item_text_from_contents(&item.contents)
6446 .trim()
6447 .to_string()
6448 };
6449 if !combined.trim().is_empty() {
6450 lines.push(combined);
6451 }
6452 }
6453 }
6454 ContentElement::Table(table) => {
6455 extend_contents_lines_from_rows(
6456 &mut lines,
6457 collect_rendered_table_rows(
6458 &table.table_border.rows,
6459 table.table_border.num_columns,
6460 ),
6461 );
6462 }
6463 ContentElement::TableBorder(table) => {
6464 extend_contents_lines_from_rows(
6465 &mut lines,
6466 collect_rendered_table_rows(&table.rows, table.num_columns),
6467 );
6468 }
6469 _ => {}
6470 }
6471 }
6472 lines
6473}
6474
6475fn extend_contents_lines_from_rows(lines: &mut Vec<String>, rows: Vec<Vec<String>>) {
6476 if rows.is_empty() {
6477 return;
6478 }
6479
6480 if is_toc_table(&rows) {
6481 for row in &rows {
6482 let title = row.first().map(|s| s.trim()).unwrap_or("");
6483 let page = row.get(1).map(|s| s.trim()).unwrap_or("");
6484 let combined = if !title.is_empty() && !page.is_empty() {
6485 format!("{title} {page}")
6486 } else {
6487 format!("{title}{page}")
6488 };
6489 if !combined.trim().is_empty() {
6490 lines.push(combined);
6491 }
6492 }
6493 } else {
6494 for row in &rows {
6496 let combined: String = row
6497 .iter()
6498 .map(|c| c.trim())
6499 .filter(|c| !c.is_empty())
6500 .collect::<Vec<_>>()
6501 .join(" ");
6502 if !combined.is_empty() {
6503 lines.push(combined);
6504 }
6505 }
6506 }
6507}
6508
6509fn collect_rendered_table_rows(
6510 rows: &[crate::models::table::TableBorderRow],
6511 num_cols: usize,
6512) -> Vec<Vec<String>> {
6513 let num_cols = num_cols.max(1);
6514 let mut rendered_rows: Vec<Vec<String>> = Vec::new();
6515
6516 for row in rows {
6517 let cell_texts: Vec<String> = (0..num_cols)
6518 .map(|col| {
6519 row.cells
6520 .iter()
6521 .find(|c| c.col_number == col)
6522 .map(cell_text_content)
6523 .unwrap_or_default()
6524 })
6525 .collect();
6526 if !cell_texts.iter().all(|t| t.trim().is_empty()) {
6527 rendered_rows.push(cell_texts);
6528 }
6529 }
6530
6531 rendered_rows
6532}
6533
6534fn ends_with_page_marker(text: &str) -> bool {
6535 text.split_whitespace()
6536 .last()
6537 .is_some_and(is_page_number_like)
6538}
6539
6540fn looks_like_toc_support_heading(text: &str) -> bool {
6541 let trimmed = text.trim();
6542 if trimmed.is_empty() || ends_with_page_marker(trimmed) {
6543 return false;
6544 }
6545 if trimmed.ends_with(['.', ';', ':', '?', '!']) {
6546 return false;
6547 }
6548
6549 let lower = trimmed.to_ascii_lowercase();
6550 if !(lower.starts_with("part ")
6551 || lower.starts_with("chapter ")
6552 || lower.starts_with("appendix ")
6553 || lower.starts_with("section "))
6554 {
6555 return false;
6556 }
6557
6558 let word_count = trimmed.split_whitespace().count();
6559 (2..=16).contains(&word_count) && trimmed.chars().any(char::is_alphabetic)
6560}
6561
6562fn split_leading_caption_and_body(text: &str) -> Option<(&str, &str)> {
6563 if !starts_with_caption_prefix(text) || !text.contains("(credit") {
6564 return None;
6565 }
6566
6567 for needle in [") ", ". "] {
6568 let mut search_start = 0usize;
6569 while let Some(rel_idx) = text[search_start..].find(needle) {
6570 let boundary = search_start + rel_idx + needle.len() - 1;
6571 let head = text[..=boundary].trim();
6572 let tail = text[boundary + 1..].trim_start();
6573 search_start = boundary + 1;
6574 if head.split_whitespace().count() < 10 || head.split_whitespace().count() > 80 {
6575 continue;
6576 }
6577 if tail.split_whitespace().count() < 10 {
6578 continue;
6579 }
6580 if !starts_with_uppercase_word(tail) || starts_with_caption_prefix(tail) {
6581 continue;
6582 }
6583 return Some((head, tail));
6584 }
6585 }
6586
6587 None
6588}
6589
6590fn is_short_caption_label(text: &str) -> bool {
6591 if !starts_with_caption_prefix(text) {
6592 return false;
6593 }
6594
6595 let trimmed = text.trim();
6596 trimmed.split_whitespace().count() <= 3 && trimmed.len() <= 24 && !trimmed.ends_with(['.', ':'])
6597}
6598
6599fn split_following_caption_tail_and_body(text: &str) -> Option<(&str, &str)> {
6600 let trimmed = text.trim();
6601 if trimmed.is_empty()
6602 || starts_with_caption_prefix(trimmed)
6603 || !starts_with_uppercase_word(trimmed)
6604 {
6605 return None;
6606 }
6607
6608 for starter in [
6609 " As ", " In ", " The ", " This ", " These ", " It ", " They ", " We ", " On ", " At ",
6610 ] {
6611 if let Some(idx) = text.find(starter) {
6612 let head = text[..idx].trim();
6613 let tail = text[idx + 1..].trim();
6614 if head.split_whitespace().count() >= 3
6615 && head.split_whitespace().count() <= 24
6616 && tail.split_whitespace().count() >= 8
6617 {
6618 return Some((head, tail));
6619 }
6620 }
6621 }
6622
6623 None
6624}
6625
6626fn looks_like_caption_tail(text: &str) -> bool {
6627 let trimmed = text.trim();
6628 if trimmed.is_empty() || trimmed.ends_with(['.', '!', '?']) {
6629 return false;
6630 }
6631
6632 let word_count = trimmed.split_whitespace().count();
6633 if !(3..=18).contains(&word_count) {
6634 return false;
6635 }
6636
6637 starts_with_uppercase_word(trimmed)
6638 && !starts_with_caption_prefix(trimmed)
6639 && !trimmed.contains(':')
6640}
6641
/// Returns `true` when the trimmed text is exactly four ASCII digits.
fn looks_like_caption_year(text: &str) -> bool {
    let digits = text.trim().as_bytes();
    digits.len() == 4 && digits.iter().all(u8::is_ascii_digit)
}
6646
6647fn token_rows_text(rows: &[TableTokenRow]) -> String {
6649 normalize_common_ocr_text(&repair_fragmented_words(
6650 &rows
6651 .iter()
6652 .flat_map(|row| row.iter())
6653 .map(|token| token.base.value.as_str())
6654 .collect::<Vec<_>>()
6655 .join(" "),
6656 ))
6657}
6658
6659fn render_element(out: &mut String, element: &ContentElement) {
6660 match element {
6661 ContentElement::Heading(h) => {
6662 let text = h.base.base.value();
6663 let trimmed = text.trim();
6664 if should_skip_heading_text(trimmed) {
6665 return;
6666 }
6667 out.push_str(&format!("# {}\n\n", trimmed));
6668 }
6669 ContentElement::Paragraph(p) => {
6670 let text = p.base.value();
6671 let trimmed = clean_paragraph_text(&text);
6672 if !trimmed.is_empty() {
6673 out.push_str(&escape_md_line_start(&trimmed));
6674 if p.base.semantic_type == SemanticType::TableOfContent {
6675 out.push('\n');
6676 } else {
6677 out.push_str("\n\n");
6678 }
6679 }
6680 }
6681 ContentElement::List(list) => {
6682 let mut i = 0usize;
6683 let mut pending_item: Option<String> = None;
6684 while i < list.list_items.len() {
6685 let item = &list.list_items[i];
6686 let label = token_rows_text(&item.label.content);
6687 let body = token_rows_text(&item.body.content);
6688 let label_trimmed = normalize_list_text(label.trim());
6689 let body_trimmed = normalize_list_text(body.trim());
6690 let combined = if !label_trimmed.is_empty() && !body_trimmed.is_empty() {
6691 format!("{label_trimmed} {body_trimmed}")
6692 } else if !body_trimmed.is_empty() {
6693 body_trimmed.to_string()
6694 } else {
6695 label_trimmed.to_string()
6696 };
6697 let combined = if combined.trim().is_empty() && !item.contents.is_empty() {
6698 list_item_text_from_contents(&item.contents)
6699 } else {
6700 combined
6701 };
6702
6703 if is_list_section_heading(&combined) {
6704 if let Some(pending) = pending_item.take() {
6705 push_rendered_list_item(out, pending.trim());
6706 }
6707 out.push_str(&format!("# {}\n\n", combined.trim_end_matches(':').trim()));
6708 i += 1;
6709 continue;
6710 }
6711
6712 if is_pure_bullet_marker(&label_trimmed) && body_trimmed.is_empty() {
6713 i += 1;
6714 continue;
6715 }
6716
6717 if looks_like_stray_list_page_number(&combined) {
6718 i += 1;
6719 continue;
6720 }
6721
6722 let current_item = if !label_trimmed.is_empty() || !body_trimmed.is_empty() {
6723 if !label_trimmed.is_empty()
6724 && !body_trimmed.is_empty()
6725 && !is_pure_bullet_marker(&label_trimmed)
6726 {
6727 format!("{label_trimmed} {body_trimmed}")
6728 } else if !body_trimmed.is_empty() {
6729 body_trimmed.to_string()
6730 } else if !is_pure_bullet_marker(&label_trimmed) {
6731 label_trimmed.to_string()
6732 } else {
6733 String::new()
6734 }
6735 } else if !item.contents.is_empty() {
6736 normalize_list_text(list_item_text_from_contents(&item.contents).trim())
6737 } else {
6738 String::new()
6739 };
6740
6741 if current_item.is_empty() {
6742 i += 1;
6743 continue;
6744 }
6745
6746 if let Some(previous) = pending_item.as_mut() {
6747 if should_merge_list_continuation(previous, ¤t_item) {
6748 merge_paragraph_text(previous, ¤t_item);
6749 i += 1;
6750 continue;
6751 }
6752 }
6753
6754 if let Some(pending) = pending_item.replace(current_item) {
6755 push_rendered_list_item(out, pending.trim());
6756 }
6757 i += 1;
6758 }
6759 if let Some(pending) = pending_item.take() {
6760 push_rendered_list_item(out, pending.trim());
6761 }
6762 out.push('\n');
6763 }
6764 ContentElement::Table(table) => {
6765 render_table(out, table);
6766 }
6767 ContentElement::TableBorder(table) => {
6768 render_table_border(out, table);
6769 }
6770 ContentElement::Formula(f) => {
6771 let latex = f.latex.trim();
6772 if !latex.is_empty() {
6773 out.push_str(&format!("$$\n{}\n$$\n\n", latex));
6774 }
6775 }
6776 ContentElement::Caption(c) => {
6777 let text = c.base.value();
6778 let normalized = normalize_common_ocr_text(text.trim());
6779 let trimmed = normalized.trim();
6780 if !trimmed.is_empty() {
6781 out.push_str(&format!("*{}*\n\n", trimmed));
6782 }
6783 }
6784 ContentElement::NumberHeading(nh) => {
6785 let text = nh.base.base.base.value();
6786 let trimmed = text.trim();
6787 if should_skip_heading_text(trimmed) {
6788 return;
6789 }
6790 out.push_str(&format!("# {}\n\n", trimmed));
6791 }
6792 ContentElement::Image(_) => {
6793 out.push_str("\n\n");
6794 }
6795 ContentElement::HeaderFooter(_) => {
6796 }
6798 ContentElement::TextBlock(tb) => {
6799 let text = tb.value();
6800 let trimmed = clean_paragraph_text(&text);
6801 if !trimmed.is_empty() {
6802 out.push_str(&escape_md_line_start(&trimmed));
6803 out.push_str("\n\n");
6804 }
6805 }
6806 ContentElement::TextLine(tl) => {
6807 let text = tl.value();
6808 let normalized = normalize_common_ocr_text(text.trim());
6809 let trimmed = normalized.trim();
6810 if !trimmed.is_empty() {
6811 out.push_str(trimmed);
6812 out.push('\n');
6813 }
6814 }
6815 ContentElement::TextChunk(tc) => {
6816 out.push_str(&tc.value);
6817 }
6818 _ => {}
6819 }
6820}
6821
/// Prefixes `text` with a backslash when its first character is `>` or `#`,
/// so the line is not read as a Markdown blockquote or heading. Any other
/// text is returned unchanged.
fn escape_md_line_start(text: &str) -> String {
    match text.chars().next() {
        Some('>' | '#') => {
            let mut escaped = String::with_capacity(text.len() + 1);
            escaped.push('\\');
            escaped.push_str(text);
            escaped
        }
        _ => text.to_owned(),
    }
}
6830
/// Returns `true` when `text`, ignoring leading whitespace and ASCII case,
/// begins with one of the known caption or credit prefixes.
fn starts_with_caption_prefix(text: &str) -> bool {
    const CAPTION_PREFIXES: &[&str] = &[
        "figure ",
        "fig. ",
        "table ",
        "tab. ",
        "chart ",
        "graph ",
        "image ",
        "illustration ",
        "diagram ",
        "plate ",
        "map ",
        "exhibit ",
        "photo by ",
        "photo credit",
        "image by ",
        "image credit",
        "image courtesy",
        "photo courtesy",
        "credit: ",
        "source: ",
    ];

    let lowered = text.trim_start().to_ascii_lowercase();
    for prefix in CAPTION_PREFIXES.iter().copied() {
        if lowered.starts_with(prefix) {
            return true;
        }
    }
    false
}
6858
/// Returns `true` when the trimmed text starts (ASCII case-insensitively) with
/// `figure `, `table `, `diagram ` or `chart `.
fn is_structural_caption(text: &str) -> bool {
    let lowered = text.trim().to_ascii_lowercase();
    ["figure ", "table ", "diagram ", "chart "]
        .into_iter()
        .any(|prefix| lowered.starts_with(prefix))
}
6866
6867fn normalize_chart_like_markdown(markdown: &str) -> String {
6868 let blocks: Vec<&str> = markdown
6869 .split("\n\n")
6870 .map(str::trim)
6871 .filter(|block| !block.is_empty())
6872 .collect();
6873 if blocks.is_empty() {
6874 return markdown.trim().to_string();
6875 }
6876
6877 let mut normalized = Vec::new();
6878 let mut i = 0usize;
6879 while i < blocks.len() {
6880 if let Some(rendered) = trim_large_top_table_plate(&blocks, i) {
6881 normalized.push(rendered);
6882 break;
6883 }
6884
6885 if let Some((rendered, consumed)) = render_header_pair_chart_table(&blocks, i) {
6886 normalized.push(rendered);
6887 i += consumed;
6888 continue;
6889 }
6890
6891 if let Some((rendered, consumed)) = render_chart_block(&blocks, i) {
6892 normalized.push(rendered);
6893 i += consumed;
6894 continue;
6895 }
6896
6897 if let Some((rendered, consumed)) = render_structural_caption_block(&blocks, i) {
6898 normalized.push(rendered);
6899 i += consumed;
6900 continue;
6901 }
6902
6903 if should_drop_artifact_table_block(&blocks, i) {
6904 i += 1;
6905 continue;
6906 }
6907
6908 if !looks_like_footer_banner(blocks[i]) {
6909 normalized.push(blocks[i].to_string());
6910 }
6911 i += 1;
6912 }
6913
6914 normalized.join("\n\n").trim().to_string() + "\n"
6915}
6916
6917fn trim_large_top_table_plate(blocks: &[&str], start: usize) -> Option<String> {
6918 if start != 0 {
6919 return None;
6920 }
6921
6922 let rows = parse_pipe_table_block(blocks.first()?.trim())?;
6923 let body_rows = rows.len().saturating_sub(2);
6924 let max_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
6925 if body_rows < 8 || max_cols < 8 {
6926 return None;
6927 }
6928
6929 let caption = blocks.get(1)?.trim();
6930 if !caption.starts_with("Table ") || caption.split_whitespace().count() < 12 {
6931 return None;
6932 }
6933
6934 let has_following_section = blocks.iter().skip(2).any(|block| {
6935 let trimmed = block.trim();
6936 trimmed.starts_with("# ")
6937 || trimmed.starts_with("## ")
6938 || trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
6939 && trimmed.contains(" Main Results")
6940 });
6941 has_following_section.then_some(blocks[0].trim().to_string())
6942}
6943
6944fn render_header_pair_chart_table(blocks: &[&str], start: usize) -> Option<(String, usize)> {
6945 let caption = blocks.get(start)?.trim();
6946 if !is_structural_caption(caption) {
6947 return None;
6948 }
6949
6950 let rows = parse_pipe_table_block(blocks.get(start + 1)?)?;
6951 if rows.len() != 2 {
6952 return None;
6953 }
6954
6955 let pairs = extract_value_year_pairs_from_cells(&rows[0]);
6956 if pairs.len() < 4 {
6957 return None;
6958 }
6959
6960 let mut source = String::new();
6961 let mut consumed = 2usize;
6962 if let Some(next_block) = blocks.get(start + 2) {
6963 let next = next_block.trim();
6964 if next.to_ascii_lowercase().starts_with("source:") {
6965 source = next.to_string();
6966 consumed += 1;
6967 }
6968 }
6969
6970 let mut out = String::new();
6971 let heading_prefix = if start == 0 { "# " } else { "## " };
6972 out.push_str(heading_prefix);
6973 out.push_str(caption);
6974 out.push_str("\n\n");
6975 out.push_str(&format!("| Year | {} |\n", chart_value_header(caption)));
6976 out.push_str("| --- | --- |\n");
6977 for (year, value) in pairs {
6978 out.push_str(&format!("| {} | {} |\n", year, value));
6979 }
6980 out.push('\n');
6981
6982 if !source.is_empty() {
6983 out.push('*');
6984 out.push_str(&escape_md_line_start(&source));
6985 out.push_str("*\n\n");
6986 }
6987
6988 Some((out.trim().to_string(), consumed))
6989}
6990
6991fn render_chart_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
6992 let (caption, numeric_tokens) = split_chart_caption_and_values(blocks.get(start)?)?;
6993 let mut consumed = 1usize;
6994
6995 let mut source = String::new();
6996 let mut labels = Vec::new();
6997 if let Some(next_block) = blocks.get(start + 1) {
6998 let (candidate_labels, candidate_source) = extract_chart_labels_and_source(next_block);
6999 if !candidate_source.is_empty() || !candidate_labels.is_empty() {
7000 labels = candidate_labels;
7001 source = candidate_source;
7002 consumed += 1;
7003 }
7004 }
7005
7006 while let Some(block) = blocks.get(start + consumed) {
7007 if looks_like_numeric_noise_block(block) {
7008 consumed += 1;
7009 continue;
7010 }
7011 break;
7012 }
7013
7014 let value_tokens = derive_chart_series_values(&numeric_tokens, labels.len());
7015
7016 let mut out = String::new();
7017 out.push_str("## ");
7018 out.push_str(caption.trim());
7019 out.push_str("\n\n");
7020
7021 if labels.len() >= 3 && labels.len() == value_tokens.len() {
7022 let label_header = if labels.iter().all(|label| looks_like_yearish_label(label)) {
7023 "Year"
7024 } else {
7025 "Label"
7026 };
7027 let value_header = chart_value_header(&caption);
7028 out.push_str(&format!("| {} | {} |\n", label_header, value_header));
7029 out.push_str("| --- | --- |\n");
7030 for (label, value) in labels.iter().zip(value_tokens.iter()) {
7031 out.push_str(&format!("| {} | {} |\n", label, value));
7032 }
7033 out.push('\n');
7034 }
7035
7036 if !source.is_empty() {
7037 out.push('*');
7038 out.push_str(&escape_md_line_start(&source));
7039 out.push_str("*\n\n");
7040 }
7041
7042 Some((out.trim().to_string(), consumed))
7043}
7044
7045fn render_structural_caption_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
7046 let block = blocks.get(start)?.trim();
7047 if !is_structural_caption(block) || block.contains('|') {
7048 return None;
7049 }
7050
7051 let mut caption = collapse_inline_whitespace(block);
7052 let mut consumed = 1usize;
7053 if let Some(next_block) = blocks.get(start + 1) {
7054 let next = next_block.trim();
7055 if looks_like_caption_continuation(next) {
7056 caption.push(' ');
7057 caption.push_str(next.trim_end_matches('.'));
7058 consumed += 1;
7059 } else if !looks_like_isolated_caption_context(block, next) {
7060 return None;
7061 }
7062 } else {
7063 return None;
7064 }
7065
7066 Some((format!("## {}", caption.trim()), consumed))
7067}
7068
7069fn split_chart_caption_and_values(block: &str) -> Option<(String, Vec<String>)> {
7070 let trimmed = block.trim();
7071 if !is_structural_caption(trimmed) {
7072 return None;
7073 }
7074
7075 let tokens: Vec<&str> = trimmed.split_whitespace().collect();
7076 let first_numeric_idx = tokens.iter().position(|token| is_numberish_token(token))?;
7077 if first_numeric_idx < 3 {
7078 return None;
7079 }
7080
7081 let caption = tokens[..first_numeric_idx].join(" ");
7082 let numeric_tokens: Vec<String> = tokens[first_numeric_idx..]
7083 .iter()
7084 .filter_map(|token| sanitize_numberish_token(token))
7085 .collect();
7086
7087 if numeric_tokens.len() < 4 {
7088 return None;
7089 }
7090
7091 Some((caption, numeric_tokens))
7092}
7093
7094fn parse_pipe_table_block(block: &str) -> Option<Vec<Vec<String>>> {
7095 let lines: Vec<&str> = block
7096 .lines()
7097 .map(str::trim)
7098 .filter(|line| !line.is_empty())
7099 .collect();
7100 if lines.len() < 2 {
7101 return None;
7102 }
7103
7104 let header = split_pipe_row(lines[0])?;
7105 if !is_pipe_separator_row(lines[1], header.len()) {
7106 return None;
7107 }
7108
7109 let mut rows = vec![header];
7110 rows.push(split_pipe_row(lines[1]).unwrap_or_default());
7111 for line in lines.iter().skip(2) {
7112 let row = split_pipe_row(line)?;
7113 rows.push(row);
7114 }
7115 Some(rows)
7116}
7117
/// Splits one Markdown pipe-table line into its trimmed cells.
///
/// The trimmed line must both start and end with a distinct `|` character.
/// Those outer pipes are removed and the remainder is split on `|`.
/// Returns `None` for any other line, including a line that is just a single
/// `|`, which has no closing pipe and previously caused a slice panic.
fn split_pipe_row(line: &str) -> Option<Vec<String>> {
    // Stripping the prefix first means a lone `|` leaves nothing for the
    // suffix check, so it is rejected instead of being sliced out of range.
    let inner = line.trim().strip_prefix('|')?.strip_suffix('|')?;

    Some(
        inner
            .split('|')
            .map(|cell| cell.trim().to_string())
            .collect(),
    )
}
7131
7132fn is_pipe_separator_row(line: &str, expected_cols: usize) -> bool {
7133 let Some(cells) = split_pipe_row(line) else {
7134 return false;
7135 };
7136 if cells.len() != expected_cols || expected_cols == 0 {
7137 return false;
7138 }
7139
7140 cells.iter().all(|cell| {
7141 let stripped = cell.trim_matches(':').trim();
7142 !stripped.is_empty() && stripped.chars().all(|ch| ch == '-')
7143 })
7144}
7145
7146fn extract_value_year_pairs_from_cells(cells: &[String]) -> Vec<(String, String)> {
7147 let mut pairs = Vec::new();
7148 for cell in cells {
7149 let tokens: Vec<&str> = cell.split_whitespace().collect();
7150 if tokens.len() != 2 {
7151 continue;
7152 }
7153
7154 if looks_like_year_token(tokens[0]) && is_numberish_token(tokens[1]) {
7155 if let Some(value) = sanitize_numberish_token(tokens[1]) {
7156 pairs.push((tokens[0].to_string(), value));
7157 }
7158 continue;
7159 }
7160
7161 if is_numberish_token(tokens[0]) && looks_like_year_token(tokens[1]) {
7162 if let Some(value) = sanitize_numberish_token(tokens[0]) {
7163 pairs.push((tokens[1].to_string(), value));
7164 }
7165 }
7166 }
7167
7168 pairs.sort_by(|left, right| left.0.cmp(&right.0));
7169 pairs
7170}
7171
7172fn should_drop_artifact_table_block(blocks: &[&str], start: usize) -> bool {
7173 let Some(rows) = parse_pipe_table_block(blocks[start]) else {
7174 return false;
7175 };
7176
7177 let prev = start
7178 .checked_sub(1)
7179 .and_then(|idx| blocks.get(idx))
7180 .map(|block| block.trim())
7181 .unwrap_or("");
7182 let next = blocks
7183 .get(start + 1)
7184 .map(|block| block.trim())
7185 .unwrap_or("");
7186
7187 if rows.len() == 2 && rows.first().is_some_and(|row| row.len() == 1) {
7188 let header = rows[0][0].trim();
7189 if looks_like_url_fragment(header) {
7190 return true;
7191 }
7192 if looks_like_numeric_axis_blob(header) && !previous_block_announces_table(prev) {
7193 return true;
7194 }
7195 }
7196
7197 let stats = pipe_table_stats(&rows);
7198 stats.fill_ratio < 0.5
7199 && stats.long_cell_count == 0
7200 && !is_structural_caption(prev)
7201 && (looks_like_citation_block(next) || is_structural_caption(next))
7202}
7203
/// Returns `true` when the block (trimmed, ASCII-lowercased) ends with
/// `as follows:`, `following details:` or `following detail:`, or contains
/// `the following details` anywhere.
fn previous_block_announces_table(block: &str) -> bool {
    let lowered = block.trim().to_ascii_lowercase();
    if lowered.contains("the following details") {
        return true;
    }
    ["as follows:", "following details:", "following detail:"]
        .iter()
        .any(|ending| lowered.ends_with(*ending))
}
7211
/// Returns `true` when the trimmed text contains `http` or `/status/`, or
/// contains a `/` and no space at all.
fn looks_like_url_fragment(text: &str) -> bool {
    let candidate = text.trim();
    if candidate.contains("http") || candidate.contains("/status/") {
        return true;
    }
    candidate.contains('/') && !candidate.contains(' ')
}
7217
7218fn looks_like_numeric_axis_blob(text: &str) -> bool {
7219 let numeric_values: Vec<i64> = text
7220 .split_whitespace()
7221 .filter_map(parse_integer_token)
7222 .collect();
7223 numeric_values.len() >= 8
7224 && !detect_axis_progression(&numeric_values).is_empty()
7225 && text.chars().any(char::is_alphabetic)
7226}
7227
/// Returns `true` for a short parenthesised block: the trimmed text starts
/// with `(`, ends with `)` and has at most eight words.
fn looks_like_citation_block(block: &str) -> bool {
    let candidate = block.trim();
    let parenthesized = candidate.starts_with('(') && candidate.ends_with(')');
    parenthesized && candidate.split_whitespace().count() <= 8
}
7232
/// Summary figures for the body rows of a parsed pipe table.
#[derive(Debug, Clone, Copy, PartialEq)]
struct PipeTableStats {
    /// Share of body cells that are non-blank, relative to
    /// `body rows × widest row`. `0.0` when the table has no body rows.
    fill_ratio: f64,
    /// Number of body cells containing three or more words.
    long_cell_count: usize,
}
7237
7238fn pipe_table_stats(rows: &[Vec<String>]) -> PipeTableStats {
7239 let cols = rows.iter().map(Vec::len).max().unwrap_or(0).max(1);
7240 let body = rows.len().saturating_sub(2);
7241 let mut nonempty = 0usize;
7242 let mut long_cell_count = 0usize;
7243
7244 for row in rows.iter().skip(2) {
7245 for cell in row {
7246 if !cell.trim().is_empty() {
7247 nonempty += 1;
7248 if cell.split_whitespace().count() >= 3 {
7249 long_cell_count += 1;
7250 }
7251 }
7252 }
7253 }
7254
7255 let fill_ratio = if body == 0 {
7256 0.0
7257 } else {
7258 nonempty as f64 / (body * cols) as f64
7259 };
7260
7261 PipeTableStats {
7262 fill_ratio,
7263 long_cell_count,
7264 }
7265}
7266
7267fn extract_chart_labels_and_source(block: &str) -> (Vec<String>, String) {
7268 let trimmed = block.trim();
7269 let lower = trimmed.to_ascii_lowercase();
7270 let source_idx = lower.find("source:");
7271
7272 let label_region = source_idx.map_or(trimmed, |idx| trimmed[..idx].trim());
7273 let source = source_idx
7274 .map(|idx| trimmed[idx..].trim().to_string())
7275 .unwrap_or_default();
7276
7277 let labels = parse_chart_labels(label_region);
7278 (labels, source)
7279}
7280
7281fn parse_chart_labels(text: &str) -> Vec<String> {
7282 let tokens: Vec<&str> = text.split_whitespace().collect();
7283 let mut labels = Vec::new();
7284 let mut i = 0usize;
7285 while i < tokens.len() {
7286 let token = tokens[i].trim_matches(|c: char| c == ',' || c == ';');
7287 if looks_like_year_token(token) {
7288 let mut label = token.to_string();
7289 if let Some(next) = tokens.get(i + 1) {
7290 let next_trimmed = next.trim_matches(|c: char| c == ',' || c == ';');
7291 if next_trimmed.starts_with('(') && next_trimmed.ends_with(')') {
7292 label.push(' ');
7293 label.push_str(next_trimmed);
7294 i += 1;
7295 }
7296 }
7297 labels.push(label);
7298 } else if looks_like_category_label(token) {
7299 labels.push(token.to_string());
7300 }
7301 i += 1;
7302 }
7303 labels
7304}
7305
7306fn derive_chart_series_values(tokens: &[String], expected_count: usize) -> Vec<String> {
7307 if expected_count == 0 {
7308 return Vec::new();
7309 }
7310
7311 if tokens.len() == expected_count {
7312 return tokens.to_vec();
7313 }
7314
7315 let numeric_values: Vec<i64> = tokens
7316 .iter()
7317 .filter_map(|token| parse_integer_token(token))
7318 .collect();
7319 if numeric_values.len() != tokens.len() {
7320 return Vec::new();
7321 }
7322
7323 let axis_series = detect_axis_progression(&numeric_values);
7324 if axis_series.is_empty() {
7325 return Vec::new();
7326 }
7327
7328 let mut remaining = Vec::new();
7329 let mut removable = axis_series;
7330 for token in tokens {
7331 let Some(value) = parse_integer_token(token) else {
7332 continue;
7333 };
7334 if let Some(pos) = removable.iter().position(|candidate| *candidate == value) {
7335 removable.remove(pos);
7336 } else {
7337 remaining.push(token.clone());
7338 }
7339 }
7340
7341 if remaining.len() == expected_count {
7342 remaining
7343 } else {
7344 Vec::new()
7345 }
7346}
7347
/// Finds the longest arithmetic progression that looks like chart axis ticks.
///
/// The values are sorted and de-duplicated. For every pair of adjacent
/// distinct values, their difference is used as the step and the series is
/// extended from the smaller value for as long as the next tick is present.
/// The first series of maximal length wins. It is returned only when it has
/// at least six ticks, otherwise the result is empty.
///
/// All arithmetic is overflow-checked: a step or next tick that does not fit
/// in `i64` ends that candidate instead of panicking or wrapping.
fn detect_axis_progression(values: &[i64]) -> Vec<i64> {
    const MIN_AXIS_TICKS: usize = 6;

    if values.len() < MIN_AXIS_TICKS {
        return Vec::new();
    }

    let mut sorted = values.to_vec();
    sorted.sort_unstable();
    sorted.dedup();
    if sorted.len() < MIN_AXIS_TICKS {
        return Vec::new();
    }

    let mut best: Vec<i64> = Vec::new();
    for window in sorted.windows(2) {
        // The distance between extreme values may exceed i64::MAX.
        let Some(step) = window[1].checked_sub(window[0]) else {
            continue;
        };
        if step <= 0 {
            continue;
        }

        let mut series = vec![window[0]];
        let mut current = window[0];
        while let Some(next) = current.checked_add(step) {
            if sorted.binary_search(&next).is_err() {
                break;
            }
            series.push(next);
            current = next;
        }

        if series.len() > best.len() {
            best = series;
        }
    }

    if best.len() >= MIN_AXIS_TICKS {
        best
    } else {
        Vec::new()
    }
}
7390
7391fn chart_value_header(caption: &str) -> String {
7392 let trimmed = caption.trim();
7393 let title = strip_structural_caption_prefix(trimmed);
7394
7395 let mut base = title.to_string();
7396 if let Some(idx) = base.rfind(" in ") {
7397 let tail = base[idx + 4..].trim();
7398 if tail.split_whitespace().count() <= 2
7399 && tail.chars().next().is_some_and(char::is_uppercase)
7400 {
7401 base.truncate(idx);
7402 }
7403 }
7404
7405 if let Some(start) = title.rfind('(') {
7406 if title.ends_with(')') {
7407 let unit = title[start + 1..title.len() - 1].trim();
7408 if let Some(idx) = base.rfind('(') {
7409 base.truncate(idx);
7410 }
7411 let normalized_unit = unit.strip_prefix("in ").unwrap_or(unit).trim();
7412 return format!("{} ({})", base.trim(), normalized_unit);
7413 }
7414 }
7415
7416 let trimmed = base.trim();
7417 if trimmed.is_empty() {
7418 "Value".to_string()
7419 } else {
7420 trimmed.to_string()
7421 }
7422}
7423
/// Removes a leading `Figure N` / `Table N` / `Diagram N` / `Chart N` label
/// from a caption and returns the remaining title.
///
/// The trimmed text is split on single spaces into keyword, number and rest.
/// The keyword is matched ASCII case-insensitively, and the number may contain
/// only ASCII digits, `.` and `:`. If the text has fewer than three parts or
/// does not match, the trimmed input is returned unchanged.
fn strip_structural_caption_prefix(text: &str) -> &str {
    let trimmed = text.trim();
    let mut pieces = trimmed.splitn(3, ' ');
    let (Some(kind), Some(number), Some(rest)) = (pieces.next(), pieces.next(), pieces.next())
    else {
        return trimmed;
    };

    let is_structural_kind = ["figure", "table", "diagram", "chart"]
        .iter()
        .any(|name| kind.eq_ignore_ascii_case(name));
    let is_number_marker = number
        .chars()
        .all(|ch| matches!(ch, '0'..='9' | '.' | ':'));

    if is_structural_kind && is_number_marker {
        rest.trim()
    } else {
        trimmed
    }
}
7450
/// Heuristic for a running footer such as `Journal of Science 12`.
///
/// The trimmed block must be a single line of at least 8 bytes with 2..=6
/// tokens. The last token must be all ASCII digits. Every earlier token must
/// either be one of `of`, `and`, `the`, `for`, `in`, `on` (ASCII
/// case-insensitive) or start with an uppercase character.
fn looks_like_footer_banner(block: &str) -> bool {
    const CONNECTIVES: [&str; 6] = ["of", "and", "the", "for", "in", "on"];

    let line = block.trim();
    if line.len() < 8 || line.contains('\n') {
        return false;
    }

    let words: Vec<&str> = line.split_whitespace().collect();
    if words.len() < 2 || words.len() > 6 {
        return false;
    }
    let Some((page_number, title_words)) = words.split_last() else {
        return false;
    };
    if !page_number.chars().all(|ch| ch.is_ascii_digit()) {
        return false;
    }

    title_words.iter().all(|word| {
        let is_connective = CONNECTIVES
            .iter()
            .any(|connective| word.eq_ignore_ascii_case(connective));
        is_connective || word.chars().next().is_some_and(char::is_uppercase)
    })
}
7476
/// Returns `true` when the trimmed block starts with an uppercase character,
/// has at most eight words and contains no `:`.
fn looks_like_caption_continuation(block: &str) -> bool {
    let candidate = block.trim();
    let Some(first) = candidate.chars().next() else {
        return false;
    };

    candidate.split_whitespace().count() <= 8 && first.is_uppercase() && !candidate.contains(':')
}
7484
/// Replaces every run of whitespace with a single space and drops leading and
/// trailing whitespace.
fn collapse_inline_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
7488
7489fn drop_isolated_noise_lines(markdown: &str) -> String {
7490 let lines: Vec<&str> = markdown.lines().collect();
7491 let mut kept = Vec::with_capacity(lines.len());
7492
7493 for (idx, line) in lines.iter().enumerate() {
7494 if should_drop_isolated_noise_line(&lines, idx) {
7495 continue;
7496 }
7497 kept.push(*line);
7498 }
7499
7500 let mut result = kept.join("\n");
7501 if markdown.ends_with('\n') {
7502 result.push('\n');
7503 }
7504 result
7505}
7506
7507fn should_drop_isolated_noise_line(lines: &[&str], idx: usize) -> bool {
7508 let trimmed = lines[idx].trim();
7509 if trimmed.len() != 1 {
7510 return false;
7511 }
7512
7513 let ch = trimmed.chars().next().unwrap_or_default();
7514 if !(ch.is_ascii_lowercase() || ch.is_ascii_digit()) {
7515 return false;
7516 }
7517
7518 let prev = previous_nonempty_line(lines, idx);
7519 let next = next_nonempty_line(lines, idx);
7520 let (Some(prev), Some(next)) = (prev, next) else {
7521 return false;
7522 };
7523
7524 is_substantive_markdown_line(prev) && is_substantive_markdown_line(next)
7525}
7526
/// Returns the closest line before `idx` that is not blank after trimming.
///
/// Panics if `idx` is greater than `lines.len()`.
fn previous_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    let earlier = &lines[..idx];
    earlier
        .iter()
        .rposition(|line| !line.trim().is_empty())
        .map(|found| earlier[found])
}
7534
/// Returns the closest line after `idx` that is not blank after trimming.
///
/// Panics if `idx + 1` is greater than `lines.len()`.
fn next_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    for line in &lines[idx + 1..] {
        if !line.trim().is_empty() {
            return Some(*line);
        }
    }
    None
}
7541
/// Returns `true` for a line with real content: a non-blank line that starts
/// with `|`, `- ` or `#`, or that has at least two words.
fn is_substantive_markdown_line(line: &str) -> bool {
    let content = line.trim();
    if content.is_empty() {
        return false;
    }

    let structural =
        content.starts_with('|') || content.starts_with("- ") || content.starts_with('#');
    structural || content.split_whitespace().nth(1).is_some()
}
7554
/// Repairs a fixed set of extraction glitches and normalises whitespace.
///
/// Empty input returns an empty string. Otherwise the literal replacements
/// below are applied in order, then `normalize_degree_spacing`, and finally
/// every whitespace run is collapsed to a single space (which also trims).
fn normalize_common_ocr_text(text: &str) -> String {
    if text.is_empty() {
        return String::new();
    }

    // Order matters: each replacement sees the output of the previous one.
    // The generic " oC" rule runs before the "-20 oC" rule, so the latter can
    // only match text the former left untouched.
    // NOTE(review): the last two patterns look identical here; presumably they
    // differ in a whitespace character that is not visible — confirm.
    let mut normalized = text
        .replace("ߤL", "μL")
        .replace(" oC", "°C")
        .replace("37 C", "37°C")
        .replace("-20 oC", "-20°C")
        .replace("1- 20-μL", "1-20-μL")
        .replace("1- 20 μL", "1-20 μL")
        .replace("1- 2 0 μL", "1-20 μL")
        .replace("1- 2 0 μL", "1-20 μL");

    normalized = normalize_degree_spacing(&normalized);
    collapse_inline_whitespace(&normalized)
}
7573
/// Rewrites `<digit> C` / `<digit> F` as `<digit>°C` / `<digit>°F`.
///
/// A space is replaced by `°` when it follows an ASCII digit, precedes `C` or
/// `F`, and that letter is not itself followed by an ASCII letter (so
/// `"5 Cats"` is left alone). The end of the text counts as a valid boundary,
/// so a trailing `"95 C"` is normalised just like `"95 C."`.
fn normalize_degree_spacing(text: &str) -> String {
    let chars: Vec<char> = text.chars().collect();
    let mut out = String::with_capacity(text.len());
    let mut i = 0usize;
    while i < chars.len() {
        let ch = chars[i];
        let is_degree_gap = ch == ' '
            && i > 0
            && chars[i - 1].is_ascii_digit()
            && matches!(chars.get(i + 1).copied(), Some('C' | 'F'))
            // `get` returns None at end of text, which is an acceptable boundary.
            && !chars
                .get(i + 2)
                .is_some_and(|following| following.is_ascii_alphabetic());
        if is_degree_gap {
            out.push('°');
            out.push(chars[i + 1]);
            i += 2;
            continue;
        }
        out.push(ch);
        i += 1;
    }
    out
}
7597
7598fn normalize_list_text(text: &str) -> String {
7599 let normalized = normalize_common_ocr_text(text);
7600 let trimmed = normalized
7601 .trim_start_matches(|ch: char| is_bullet_like(ch))
7602 .trim();
7603 trimmed.to_string()
7604}
7605
7606fn push_rendered_list_item(out: &mut String, item: &str) {
7607 if starts_with_enumerated_marker(item) {
7608 out.push_str(item);
7609 out.push('\n');
7610 } else {
7611 out.push_str(&format!("- {}\n", item));
7612 }
7613}
7614
7615fn should_merge_list_continuation(previous: &str, current: &str) -> bool {
7616 let trimmed = current.trim();
7617 if trimmed.is_empty()
7618 || looks_like_stray_list_page_number(trimmed)
7619 || is_list_section_heading(trimmed)
7620 || looks_like_numbered_section(trimmed)
7621 || starts_with_enumerated_marker(trimmed)
7622 {
7623 return false;
7624 }
7625
7626 if previous.ends_with('-')
7627 && previous
7628 .chars()
7629 .rev()
7630 .nth(1)
7631 .is_some_and(|c| c.is_alphabetic())
7632 && trimmed.chars().next().is_some_and(char::is_lowercase)
7633 {
7634 return true;
7635 }
7636
7637 trimmed
7638 .chars()
7639 .next()
7640 .is_some_and(|ch| ch.is_ascii_lowercase() || matches!(ch, ',' | ';' | ')' | ']' | '%'))
7641}
7642
7643fn is_pure_bullet_marker(text: &str) -> bool {
7644 let trimmed = text.trim();
7645 !trimmed.is_empty() && trimmed.chars().all(is_bullet_like)
7646}
7647
/// Returns `true` when the trimmed text is one to four ASCII digits.
fn looks_like_stray_list_page_number(text: &str) -> bool {
    let digits = text.trim().as_bytes();
    matches!(digits.len(), 1..=4) && digits.iter().all(u8::is_ascii_digit)
}
7652
/// Returns `true` when `ch` is one of the glyphs treated as a list bullet
/// (various dots, squares, triangles and diamonds, plus the ASCII hyphen).
fn is_bullet_like(ch: char) -> bool {
    const BULLET_GLYPHS: &[char] = &[
        '•', '◦', '▪', '▸', '▹', '►', '▻', '●', '○', '■', '□', '◆', '◇', '-',
    ];
    BULLET_GLYPHS.contains(&ch)
}
7671
7672fn looks_like_isolated_caption_context(caption: &str, next_block: &str) -> bool {
7673 let next = next_block.trim();
7674 if next.is_empty() {
7675 return false;
7676 }
7677
7678 let next_lower = next.to_ascii_lowercase();
7679 if next_lower.starts_with("source:")
7680 || next_lower.starts_with("note:")
7681 || next_lower.starts_with("*source:")
7682 || next_lower.starts_with("*note:")
7683 {
7684 return true;
7685 }
7686
7687 caption.split_whitespace().count() <= 14
7688 && next.split_whitespace().count() <= 45
7689 && (next.contains(':') || next.contains('='))
7690}
7691
7692fn looks_like_numeric_noise_block(block: &str) -> bool {
7693 let trimmed = block.trim();
7694 !trimmed.is_empty()
7695 && trimmed.split_whitespace().all(|token| {
7696 sanitize_numberish_token(token)
7697 .as_deref()
7698 .is_some_and(|sanitized| sanitized.chars().all(|ch| ch.is_ascii_digit()))
7699 })
7700}
7701
/// Returns `true` when the label's first character is an ASCII digit.
fn looks_like_yearish_label(label: &str) -> bool {
    label.as_bytes().first().is_some_and(u8::is_ascii_digit)
}
7705
/// Returns `true` when the token is exactly four ASCII digits (no trimming).
fn looks_like_year_token(token: &str) -> bool {
    token.len() == 4 && token.bytes().all(|byte| byte.is_ascii_digit())
}
7709
/// Returns `true` when the token contains only ASCII letters, ASCII digits,
/// `-`, `/` or `%`, and at least one ASCII letter.
fn looks_like_category_label(token: &str) -> bool {
    let mut has_letter = false;
    for ch in token.chars() {
        if ch.is_ascii_alphabetic() {
            has_letter = true;
        } else if !(ch.is_ascii_digit() || matches!(ch, '-' | '/' | '%')) {
            return false;
        }
    }
    has_letter
}
7716
7717fn is_numberish_token(token: &str) -> bool {
7718 sanitize_numberish_token(token).is_some()
7719}
7720
/// Cleans a number-like token, or returns `None` if it is not one.
///
/// Leading and trailing `,` `;` `:` `.` are stripped. The remainder, ignoring
/// any trailing `%` signs and thousands-separator commas, must consist of
/// ASCII digits and contain at least one digit. The stripped token is
/// returned with its commas and percent sign intact, e.g. `"1,200"`, `"45%"`.
///
/// Tokens with no digits at all, such as a bare `"%"`, are rejected.
fn sanitize_numberish_token(token: &str) -> Option<String> {
    let trimmed = token.trim_matches(|c: char| matches!(c, ',' | ';' | ':' | '.'));
    let digits = trimmed.trim_end_matches('%');

    // Require a real digit: an empty digit part must not count as a number.
    let has_digit = digits.chars().any(|ch| ch.is_ascii_digit());
    let only_digits_and_commas = digits.chars().all(|ch| ch.is_ascii_digit() || ch == ',');

    (has_digit && only_digits_and_commas).then(|| trimmed.to_string())
}
7734
7735fn parse_integer_token(token: &str) -> Option<i64> {
7736 sanitize_numberish_token(token)?
7737 .replace(',', "")
7738 .parse::<i64>()
7739 .ok()
7740}
7741
/// Returns `true` when the first character of `text` — after leading
/// whitespace and any run of `"`, `'`, `(` or `[` — is an uppercase letter.
fn starts_with_uppercase_word(text: &str) -> bool {
    text.trim_start()
        .chars()
        // Skip opening quotes/brackets; the first other character decides.
        .find(|ch| !matches!(*ch, '"' | '\'' | '(' | '['))
        .is_some_and(|ch| ch.is_alphabetic() && ch.is_uppercase())
}
7753
7754fn clean_paragraph_text(text: &str) -> String {
7757 let trimmed = text.trim();
7758 if trimmed.is_empty() {
7759 return String::new();
7760 }
7761 let mut result = String::with_capacity(trimmed.len());
7763 let mut prev_space = false;
7764 for ch in trimmed.chars() {
7765 if ch == ' ' || ch == '\t' {
7766 if !prev_space {
7767 result.push(' ');
7768 prev_space = true;
7769 }
7770 } else {
7771 result.push(ch);
7772 prev_space = false;
7773 }
7774 }
7775 normalize_common_ocr_text(&result)
7776}
7777
7778fn next_mergeable_paragraph_text(element: Option<&ContentElement>) -> Option<String> {
7779 match element {
7780 Some(ContentElement::Paragraph(p)) => {
7781 let text = clean_paragraph_text(&p.base.value());
7782 let trimmed = text.trim();
7783 if trimmed.is_empty()
7784 || should_render_element_as_heading(element.unwrap(), trimmed, None)
7785 {
7786 None
7787 } else {
7788 Some(trimmed.to_string())
7789 }
7790 }
7791 Some(ContentElement::TextBlock(tb)) => {
7792 let text = clean_paragraph_text(&tb.value());
7793 let trimmed = text.trim();
7794 if trimmed.is_empty()
7795 || should_render_element_as_heading(element.unwrap(), trimmed, None)
7796 {
7797 None
7798 } else {
7799 Some(trimmed.to_string())
7800 }
7801 }
7802 Some(ContentElement::TextLine(tl)) => {
7803 let text = clean_paragraph_text(&tl.value());
7804 let trimmed = text.trim();
7805 if trimmed.is_empty()
7806 || should_render_element_as_heading(element.unwrap(), trimmed, None)
7807 {
7808 None
7809 } else {
7810 Some(trimmed.to_string())
7811 }
7812 }
7813 _ => None,
7814 }
7815}
7816
7817fn should_render_paragraph_as_heading(
7818 doc: &PdfDocument,
7819 idx: usize,
7820 text: &str,
7821 next: Option<&ContentElement>,
7822) -> bool {
7823 if looks_like_top_margin_running_header(doc, idx, text) {
7824 return false;
7825 }
7826 if looks_like_hyphenated_table_title_continuation(doc, idx, text, next) {
7827 return true;
7828 }
7829 if should_render_element_as_heading(&doc.kids[idx], text, next) {
7830 return true;
7831 }
7832
7833 let body_font_size = compute_body_font_size(doc);
7836 if is_too_small_for_heading(&doc.kids, idx, body_font_size) {
7837 return false;
7838 }
7839
7840 if !doc_has_explicit_headings(doc) {
7842 if should_rescue_as_heading(doc, idx, text) {
7843 return true;
7844 }
7845 if should_rescue_allcaps_heading(doc, idx, text) {
7849 return true;
7850 }
7851 if should_rescue_numbered_heading(doc, idx, text) {
7852 return true;
7853 }
7854 return false;
7855 }
7856 if heading_density(doc) < 0.10 {
7859 if should_rescue_allcaps_heading(doc, idx, text) {
7860 return true;
7861 }
7862 if should_rescue_numbered_heading(doc, idx, text) {
7866 return true;
7867 }
7868 if body_font_size > 0.0 {
7873 if let ContentElement::Paragraph(p) = &doc.kids[idx] {
7874 if let Some(fs) = p.base.font_size {
7875 if fs >= 1.15 * body_font_size
7876 && is_heading_rescue_candidate(doc, idx, text)
7877 && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
7878 {
7879 return true;
7880 }
7881 }
7882 }
7883 }
7884 }
7885 false
7886}
7887
7888fn doc_has_explicit_headings(doc: &PdfDocument) -> bool {
7890 doc.kids.iter().any(|e| {
7891 matches!(
7892 e,
7893 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
7894 )
7895 })
7896}
7897
7898fn compute_body_font_size(doc: &PdfDocument) -> f64 {
7903 let mut font_sizes: Vec<f64> = doc
7904 .kids
7905 .iter()
7906 .filter_map(|e| {
7907 if let ContentElement::Paragraph(p) = e {
7908 let word_count = p.base.value().split_whitespace().count();
7909 if word_count > 10 {
7910 p.base.font_size
7911 } else {
7912 None
7913 }
7914 } else {
7915 None
7916 }
7917 })
7918 .collect();
7919 if font_sizes.is_empty() {
7920 return 0.0;
7921 }
7922 font_sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
7923 font_sizes[font_sizes.len() / 2]
7924}
7925
7926fn is_too_small_for_heading(doc_kids: &[ContentElement], idx: usize, body_font_size: f64) -> bool {
7931 if body_font_size <= 0.0 {
7932 return false;
7933 }
7934 if let ContentElement::Paragraph(p) = &doc_kids[idx] {
7935 if let Some(fs) = p.base.font_size {
7936 return fs < 0.95 * body_font_size;
7937 }
7938 }
7939 false
7940}
7941
7942fn heading_density(doc: &PdfDocument) -> f64 {
7944 let total = doc.kids.len();
7945 if total == 0 {
7946 return 0.0;
7947 }
7948 let heading_count = doc
7949 .kids
7950 .iter()
7951 .filter(|e| {
7952 matches!(
7953 e,
7954 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
7955 )
7956 })
7957 .count();
7958 heading_count as f64 / total as f64
7959}
7960
7961fn should_rescue_as_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
7964 is_heading_rescue_candidate(doc, idx, text)
7965 && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
7966}
7967
7968fn is_heading_rescue_candidate(doc: &PdfDocument, idx: usize, text: &str) -> bool {
7972 let trimmed = text.trim();
7973 if trimmed.is_empty() {
7974 return false;
7975 }
7976
7977 let has_alpha = trimmed.chars().any(char::is_alphabetic);
7978
7979 if !has_alpha || trimmed.ends_with(['.', '!', '?', ';', ',']) {
7981 return false;
7982 }
7983
7984 if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
7986 return false;
7987 }
7988
7989 if trimmed.starts_with('(') && trimmed.ends_with(')') {
7991 return false;
7992 }
7993
7994 if starts_with_caption_prefix(trimmed)
7996 || looks_like_chart_label_heading(&doc.kids[idx], trimmed)
7997 {
7998 return false;
7999 }
8000
8001 let word_count = trimmed.split_whitespace().count();
8003 if word_count > 6 || trimmed.len() > 60 {
8004 return false;
8005 }
8006
8007 if trimmed
8009 .chars()
8010 .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
8011 {
8012 return false;
8013 }
8014
8015 if let Some(first_alpha) = trimmed.chars().find(|c| c.is_alphabetic()) {
8017 if first_alpha.is_lowercase() {
8018 return false;
8019 }
8020 }
8021
8022 true
8023}
8024
8025fn has_substantive_follow_up(
8029 doc: &PdfDocument,
8030 idx: usize,
8031 word_count: usize,
8032 max_lookahead: usize,
8033) -> bool {
8034 for offset in 1..=max_lookahead {
8035 let lookahead_idx = idx + offset;
8036 if lookahead_idx >= doc.kids.len() {
8037 break;
8038 }
8039 let look_elem = &doc.kids[lookahead_idx];
8040 match look_elem {
8041 ContentElement::Paragraph(p) => {
8042 let next_text = p.base.value();
8043 let nw = next_text.split_whitespace().count();
8044 if nw >= word_count * 3 || nw > 15 {
8045 return true;
8046 }
8047 }
8048 ContentElement::TextBlock(tb) => {
8049 let next_text = tb.value();
8050 let nw = next_text.split_whitespace().count();
8051 if nw >= word_count * 3 || nw > 15 {
8052 return true;
8053 }
8054 }
8055 ContentElement::TextLine(tl) => {
8056 let next_text = tl.value();
8057 let nw = next_text.split_whitespace().count();
8058 if nw >= word_count * 3 || nw > 15 {
8059 return true;
8060 }
8061 }
8062 ContentElement::List(_)
8063 | ContentElement::Table(_)
8064 | ContentElement::TableBorder(_)
8065 | ContentElement::Image(_)
8066 | ContentElement::Figure(_) => {
8067 return true;
8068 }
8069 _ => continue,
8070 }
8071 }
8072
8073 false
8074}
8075
8076fn should_rescue_numbered_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8079 let trimmed = text.trim();
8080 if trimmed.is_empty() || trimmed.len() > 100 {
8081 return false;
8082 }
8083
8084 if !looks_like_numbered_section(trimmed) {
8087 return false;
8088 }
8089
8090 if trimmed.ends_with(['!', '?', ';', ',']) {
8094 return false;
8095 }
8096 if trimmed.ends_with('.') && !looks_like_keyword_numbered_section(trimmed) {
8097 return false;
8098 }
8099 if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8101 return false;
8102 }
8103
8104 for offset in 1..=3 {
8106 let lookahead_idx = idx + offset;
8107 if lookahead_idx >= doc.kids.len() {
8108 break;
8109 }
8110 match &doc.kids[lookahead_idx] {
8111 ContentElement::Paragraph(p) => {
8112 let nw = p.base.value().split_whitespace().count();
8113 if nw > 10 {
8114 return true;
8115 }
8116 }
8117 ContentElement::TextBlock(tb) => {
8118 let nw = tb.value().split_whitespace().count();
8119 if nw > 10 {
8120 return true;
8121 }
8122 }
8123 ContentElement::TextLine(tl) => {
8124 let nw = tl.value().split_whitespace().count();
8125 if nw > 10 {
8126 return true;
8127 }
8128 }
8129 ContentElement::List(_)
8130 | ContentElement::Table(_)
8131 | ContentElement::TableBorder(_)
8132 | ContentElement::Image(_)
8133 | ContentElement::Figure(_) => {
8134 return true;
8135 }
8136 _ => continue,
8137 }
8138 }
8139
8140 false
8141}
8142
8143fn looks_like_numbered_section(text: &str) -> bool {
8146 let bytes = text.as_bytes();
8147 if bytes.is_empty() {
8148 return false;
8149 }
8150
8151 let mut idx = 0;
8153 if bytes[0].is_ascii_digit() {
8154 while idx < bytes.len() && bytes[idx].is_ascii_digit() {
8155 idx += 1;
8156 }
8157 if idx >= bytes.len() {
8158 return false;
8159 }
8160 while idx < bytes.len() && bytes[idx] == b'.' {
8162 idx += 1;
8163 let start = idx;
8164 while idx < bytes.len() && bytes[idx].is_ascii_digit() {
8165 idx += 1;
8166 }
8167 if idx == start {
8168 break;
8170 }
8171 }
8172 if idx >= bytes.len() {
8174 return false;
8175 }
8176 if bytes[idx] == b' ' || bytes[idx] == b'\t' {
8178 idx += 1;
8179 if idx < bytes.len() && bytes[idx] == b'-' {
8181 idx += 1;
8182 if idx < bytes.len() && bytes[idx] == b' ' {
8183 idx += 1;
8184 }
8185 }
8186 } else if bytes[idx] == b'-' {
8187 idx += 1;
8188 if idx < bytes.len() && bytes[idx] == b' ' {
8189 idx += 1;
8190 }
8191 } else {
8192 return false;
8193 }
8194 let rest = &text[idx..].trim();
8196 if rest.is_empty() {
8197 return false;
8198 }
8199 if let Some(c) = rest.chars().find(|c| c.is_alphabetic()) {
8201 return c.is_uppercase();
8202 }
8203 return false;
8204 }
8205
8206 if looks_like_keyword_numbered_section(text) {
8208 return true;
8209 }
8210
8211 false
8212}
8213
/// Keywords (compared ASCII case-insensitively) that may introduce a numbered
/// section such as "Chapter 3" or "Part IV".
const SECTION_KEYWORDS: &[&str] = &[
    "activity",
    "appendix",
    "case",
    "chapter",
    "exercise",
    "experiment",
    "lab",
    "lesson",
    "module",
    "part",
    "phase",
    "problem",
    "question",
    "section",
    "stage",
    "step",
    "task",
    "topic",
    "unit",
];

/// Returns `true` when `text` has the form `<keyword> <number>…`, where the
/// keyword is one of [`SECTION_KEYWORDS`] and the number is either
///
/// * something starting with an ASCII digit (optionally preceded by `#`), or
/// * an uppercase roman numeral starting with `I`, `V`, `X` or `L`.
///
/// The roman numeral must end at a word boundary: the run of numeral letters
/// (`I V X L C D M`) has to be followed by the end of the text or by a
/// non-alphanumeric character. This keeps ordinary capitalized words from
/// being mistaken for numerals ("Lesson Learned", "Task List",
/// "Part Introduction").
///
/// The keyword and number may be separated by any whitespace, not only an
/// ASCII space.
fn looks_like_keyword_numbered_section(text: &str) -> bool {
    let trimmed = text.trim();
    let Some((keyword, rest)) = trimmed.split_once(char::is_whitespace) else {
        return false;
    };
    if !SECTION_KEYWORDS
        .iter()
        .any(|k| keyword.eq_ignore_ascii_case(k))
    {
        return false;
    }

    let rest = rest.trim_start();
    let label = rest.strip_prefix('#').unwrap_or(rest);
    match label.chars().next() {
        Some(first) if first.is_ascii_digit() => true,
        Some('I' | 'V' | 'X' | 'L') => {
            // First character after the run of numeral letters, if any.
            let after_numeral = label
                .chars()
                .find(|c| !matches!(*c, 'I' | 'V' | 'X' | 'L' | 'C' | 'D' | 'M'));
            !after_numeral.is_some_and(char::is_alphanumeric)
        }
        _ => false,
    }
}
8265
8266fn should_rescue_allcaps_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8269 let trimmed = text.trim();
8270 if trimmed.is_empty() {
8271 return false;
8272 }
8273
8274 let word_count = trimmed.split_whitespace().count();
8275
8276 if word_count > 8 || trimmed.len() > 80 {
8278 return false;
8279 }
8280
8281 let alpha_chars: Vec<char> = trimmed.chars().filter(|c| c.is_alphabetic()).collect();
8283 if alpha_chars.len() < 2 || !alpha_chars.iter().all(|c| c.is_uppercase()) {
8284 return false;
8285 }
8286
8287 if trimmed.ends_with(['.', ';', ',']) {
8289 return false;
8290 }
8291
8292 if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8294 return false;
8295 }
8296
8297 if starts_with_caption_prefix(trimmed) {
8299 return false;
8300 }
8301
8302 if trimmed
8304 .chars()
8305 .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
8306 {
8307 return false;
8308 }
8309
8310 for offset in 1..=4 {
8313 let lookahead_idx = idx + offset;
8314 if lookahead_idx >= doc.kids.len() {
8315 break;
8316 }
8317 let look_elem = &doc.kids[lookahead_idx];
8318 match look_elem {
8319 ContentElement::Paragraph(p) => {
8320 let nw = p.base.value().split_whitespace().count();
8321 if nw > 6 {
8322 return true;
8323 }
8324 }
8325 ContentElement::TextBlock(tb) => {
8326 let nw = tb.value().split_whitespace().count();
8327 if nw > 6 {
8328 return true;
8329 }
8330 }
8331 ContentElement::TextLine(tl) => {
8332 let nw = tl.value().split_whitespace().count();
8333 if nw > 6 {
8334 return true;
8335 }
8336 }
8337 ContentElement::List(_)
8338 | ContentElement::Table(_)
8339 | ContentElement::TableBorder(_)
8340 | ContentElement::Image(_)
8341 | ContentElement::Figure(_) => {
8342 return true;
8343 }
8344 _ => continue,
8345 }
8346 }
8347
8348 false
8349}
8350
8351fn should_render_element_as_heading(
8352 element: &ContentElement,
8353 text: &str,
8354 next: Option<&ContentElement>,
8355) -> bool {
8356 let trimmed = text.trim();
8357 if trimmed.is_empty() {
8358 return false;
8359 }
8360
8361 let lower = trimmed.to_ascii_lowercase();
8362 if matches!(lower.as_str(), "contents" | "table of contents")
8363 && trimmed.starts_with(|c: char| c.is_uppercase())
8364 {
8365 return true;
8366 }
8367
8368 let word_count = trimmed.split_whitespace().count();
8369 let has_alpha = trimmed.chars().any(char::is_alphabetic);
8370 let title_like = has_alpha
8371 && word_count <= 4
8372 && trimmed.len() <= 40
8373 && !trimmed.ends_with(['.', '!', '?', ';', ':']);
8374
8375 let is_attribution = {
8379 let lower = trimmed.to_ascii_lowercase();
8380 lower.starts_with("source:")
8381 || lower.starts_with("credit:")
8382 || lower.starts_with("photo by ")
8383 || lower.starts_with("photo credit")
8384 || lower.starts_with("image by ")
8385 || lower.starts_with("image credit")
8386 };
8387
8388 title_like
8389 && matches!(next, Some(ContentElement::List(_)))
8390 && !looks_like_chart_label_heading(element, trimmed)
8391 && !is_attribution
8392}
8393
8394fn looks_like_hyphenated_table_title_continuation(
8395 doc: &PdfDocument,
8396 idx: usize,
8397 text: &str,
8398 next: Option<&ContentElement>,
8399) -> bool {
8400 if !matches!(
8401 next,
8402 Some(ContentElement::Table(_)) | Some(ContentElement::TableBorder(_))
8403 ) {
8404 return false;
8405 }
8406
8407 let trimmed = text.trim();
8408 if trimmed.is_empty()
8409 || starts_with_caption_prefix(trimmed)
8410 || looks_like_numbered_section(trimmed)
8411 || looks_like_keyword_numbered_section(trimmed)
8412 || !trimmed.ends_with(':')
8413 {
8414 return false;
8415 }
8416
8417 let word_count = trimmed.split_whitespace().count();
8418 if !(3..=5).contains(&word_count) || trimmed.len() > 60 {
8419 return false;
8420 }
8421
8422 let Some(first_alpha) = trimmed.chars().find(|ch| ch.is_alphabetic()) else {
8423 return false;
8424 };
8425 if first_alpha.is_lowercase() {
8426 return false;
8427 }
8428
8429 let Some(prev_idx) = idx.checked_sub(1) else {
8430 return false;
8431 };
8432 let prev_text = extract_element_text(&doc.kids[prev_idx]);
8433 let prev_trimmed = prev_text.trim();
8434 !prev_trimmed.is_empty() && prev_trimmed.ends_with('-')
8435}
8436
8437fn looks_like_table_header_duplicate_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8438 let trimmed = text.trim();
8439 if trimmed.is_empty()
8440 || starts_with_caption_prefix(trimmed)
8441 || looks_like_numbered_section(trimmed)
8442 || looks_like_keyword_numbered_section(trimmed)
8443 {
8444 return false;
8445 }
8446
8447 let word_count = trimmed.split_whitespace().count();
8448 if !(3..=10).contains(&word_count) || trimmed.len() > 96 {
8449 return false;
8450 }
8451
8452 let Some(prev_idx) = idx.checked_sub(1) else {
8453 return false;
8454 };
8455 let Some(previous_table) = table_border_from_element(&doc.kids[prev_idx]) else {
8456 return false;
8457 };
8458 if previous_table.num_columns < 3 || previous_table.rows.len() < 3 {
8459 return false;
8460 }
8461
8462 let mut rendered_rows = collect_table_border_rows(previous_table);
8463 if rendered_rows.is_empty() {
8464 return false;
8465 }
8466 merge_continuation_rows(&mut rendered_rows);
8467 trim_leading_table_carryover_rows(&mut rendered_rows);
8468
8469 let Some(header_row) = rendered_rows.first() else {
8470 return false;
8471 };
8472 let header_text = header_row
8473 .iter()
8474 .map(|cell| cell.trim())
8475 .filter(|cell| !cell.is_empty())
8476 .collect::<Vec<_>>()
8477 .join(" ");
8478 if !equivalent_heading_text(trimmed, &header_text) {
8479 return false;
8480 }
8481
8482 let page_number = doc.kids[idx].page_number();
8483 let mut short_fragments = 0usize;
8484 let mut numeric_fragments = 0usize;
8485
8486 for candidate in doc.kids.iter().skip(idx + 1) {
8487 if candidate.page_number() != page_number {
8488 break;
8489 }
8490 if matches!(
8491 candidate,
8492 ContentElement::Table(_) | ContentElement::TableBorder(_)
8493 ) {
8494 break;
8495 }
8496
8497 let fragment = extract_element_text(candidate);
8498 let fragment_trimmed = fragment.trim();
8499 if fragment_trimmed.is_empty()
8500 || looks_like_margin_page_number(doc, candidate, fragment_trimmed)
8501 {
8502 continue;
8503 }
8504
8505 let fragment_words = fragment_trimmed.split_whitespace().count();
8506 if fragment_words > 6 {
8507 return false;
8508 }
8509
8510 short_fragments += 1;
8511 if fragment_trimmed.chars().any(|ch| ch.is_ascii_digit()) {
8512 numeric_fragments += 1;
8513 }
8514
8515 if short_fragments >= 3 {
8516 break;
8517 }
8518 }
8519
8520 short_fragments >= 2 && numeric_fragments >= 1
8521}
8522
8523fn looks_like_top_margin_running_header(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8524 let trimmed = text.trim();
8525 if trimmed.is_empty() || trimmed.split_whitespace().count() > 6 {
8526 return false;
8527 }
8528
8529 let element = &doc.kids[idx];
8530 let bbox = element.bbox();
8531 if bbox.height() > 24.0 {
8532 return false;
8533 }
8534
8535 let Some(page) = element.page_number() else {
8536 return false;
8537 };
8538
8539 let mut page_tops = std::collections::HashMap::<u32, f64>::new();
8541 for candidate in &doc.kids {
8542 if let Some(p) = candidate.page_number() {
8543 let top = page_tops.entry(p).or_insert(f64::MIN);
8544 *top = top.max(candidate.bbox().top_y);
8545 }
8546 }
8547
8548 let page_top = page_tops.get(&page).copied().unwrap_or(0.0);
8549 if bbox.top_y < page_top - 24.0 {
8550 return false;
8551 }
8552
8553 let trimmed_lower = trimmed.to_lowercase();
8557 for other_elem in &doc.kids {
8558 let Some(other_page) = other_elem.page_number() else {
8559 continue;
8560 };
8561 if other_page == page {
8562 continue;
8563 }
8564 let other_bbox = other_elem.bbox();
8565 if other_bbox.height() > 24.0 {
8566 continue;
8567 }
8568 let other_top = page_tops.get(&other_page).copied().unwrap_or(0.0);
8569 if other_bbox.top_y < other_top - 24.0 {
8570 continue;
8571 }
8572 let other_text = match other_elem {
8573 ContentElement::Paragraph(p) => p.base.value(),
8574 ContentElement::TextBlock(tb) => tb.value(),
8575 ContentElement::TextLine(tl) => tl.value(),
8576 ContentElement::Heading(h) => h.base.base.value(),
8577 _ => continue,
8578 };
8579 if other_text.trim().to_lowercase() == trimmed_lower {
8580 return true;
8581 }
8582 }
8583
8584 false
8585}
8586
8587fn looks_like_chart_label_heading(element: &ContentElement, text: &str) -> bool {
8588 let trimmed = text.trim();
8589 let upper_words = trimmed
8590 .split_whitespace()
8591 .filter(|word| word.chars().any(char::is_alphabetic))
8592 .all(|word| {
8593 word.chars()
8594 .filter(|ch| ch.is_alphabetic())
8595 .all(|ch| ch.is_uppercase())
8596 });
8597
8598 (trimmed.contains('%') || upper_words) && element.bbox().height() <= 40.0
8599}
8600
8601fn should_demote_heading_to_paragraph(text: &str, next: &str) -> bool {
8602 let next_trimmed = next.trim();
8603 if !next_trimmed.chars().next().is_some_and(char::is_lowercase) {
8604 return false;
8605 }
8606
8607 let normalized = normalize_heading_text(text);
8608 if matches!(
8609 normalized.as_str(),
8610 "contents" | "tableofcontents" | "introduction" | "conclusion"
8611 ) {
8612 return false;
8613 }
8614
8615 let words: Vec<&str> = text.split_whitespace().collect();
8616 if words.len() < 3 {
8617 return false;
8618 }
8619
8620 words
8621 .last()
8622 .is_some_and(|word| is_sentence_fragment_tail(word))
8623}
8624
/// Returns `true` when `word` — stripped of surrounding non-alphanumeric
/// characters and ASCII-lowercased — is an article, conjunction or
/// preposition that rarely ends a complete heading.
fn is_sentence_fragment_tail(word: &str) -> bool {
    const DANGLING_WORDS: [&str; 17] = [
        "a", "an", "and", "as", "at", "by", "for", "from", "in", "into", "of", "on", "or",
        "that", "the", "to", "with",
    ];

    let core = word
        .trim_matches(|c: char| !c.is_alphanumeric())
        .to_ascii_lowercase();
    DANGLING_WORDS.contains(&core.as_str())
}
8648
/// Returns `true` when `text` looks like a label introducing a list: it ends
/// in `:`, is at most 80 bytes and eight words, contains a letter, and does
/// not start with an ASCII digit or a bullet/dash character.
fn is_list_section_heading(text: &str) -> bool {
    let label = text.trim();
    if !label.ends_with(':') || label.len() > 80 || label.split_whitespace().count() > 8 {
        return false;
    }
    if !label.chars().any(char::is_alphabetic) {
        return false;
    }
    match label.chars().next() {
        Some(first) => !first.is_ascii_digit() && !"•‣◦●○◆◇▪▫–—-".contains(first),
        // Unreachable in practice: the label ends in ':' and so is non-empty.
        None => true,
    }
}
8658
8659fn should_merge_paragraph_text(prev: &str, next: &str) -> bool {
8660 let next_trimmed = next.trim();
8661 if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
8662 return false;
8663 }
8664
8665 if starts_with_enumerated_marker(next_trimmed) {
8666 return false;
8667 }
8668
8669 if prev.ends_with('-')
8670 && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
8671 && next_trimmed.chars().next().is_some_and(char::is_lowercase)
8672 {
8673 return true;
8674 }
8675
8676 if next_trimmed.chars().next().is_some_and(char::is_lowercase) {
8677 return true;
8678 }
8679
8680 let lower = next_trimmed.to_ascii_lowercase();
8681 if lower.starts_with("http://")
8682 || lower.starts_with("https://")
8683 || lower.starts_with("arxiv")
8684 || lower.starts_with("doi:")
8685 {
8686 return true;
8687 }
8688
8689 if matches!(
8690 next_trimmed.split_whitespace().next(),
8691 Some("In" | "Proceedings" | "Advances" | "Learning")
8692 ) {
8693 return true;
8694 }
8695
8696 !prev.ends_with(['.', '!', '?', ':'])
8697}
8698
8699fn should_merge_adjacent_semantic_paragraphs(prev: &str, next: &str) -> bool {
8700 let next_trimmed = next.trim();
8701 if next_trimmed.is_empty() {
8702 return false;
8703 }
8704
8705 if starts_with_enumerated_marker(next_trimmed) {
8706 return false;
8707 }
8708
8709 if prev.ends_with('-')
8710 && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
8711 && next_trimmed.chars().next().is_some_and(char::is_lowercase)
8712 {
8713 return true;
8714 }
8715
8716 next_trimmed.chars().next().is_some_and(char::is_lowercase)
8717}
8718
/// Returns `true` when the first whitespace-separated token of `text` is an
/// enumeration marker such as `1.`, `(a)`, `B:` or `iv)`.
///
/// Leading `(` / `[` are ignored. The token must end in `.`, `)` or `:`; what
/// remains after stripping those must be all ASCII digits, a single ASCII
/// letter, or at most eight roman-numeral letters (`ivxlcdm`, any case).
fn starts_with_enumerated_marker(text: &str) -> bool {
    let Some(token) = text.split_whitespace().next() else {
        return false;
    };
    let token = token.trim_start_matches(['(', '[']);
    let marker = token.trim_end_matches(['.', ')', ':']);
    // Equal lengths mean no closing punctuation was present.
    if marker.len() == token.len() || marker.is_empty() {
        return false;
    }

    let is_number = marker.bytes().all(|b| b.is_ascii_digit());
    let is_single_letter = marker.len() == 1 && marker.bytes().all(|b| b.is_ascii_alphabetic());
    let is_roman = marker.len() <= 8
        && marker
            .chars()
            .all(|c| "ivxlcdm".contains(c.to_ascii_lowercase()));

    is_number || is_single_letter || is_roman
}
8744
8745fn should_skip_leading_figure_carryover(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8746 let trimmed = text.trim();
8747 if !trimmed.starts_with("Figure ") || trimmed.split_whitespace().count() < 4 {
8748 return false;
8749 }
8750
8751 let element = &doc.kids[idx];
8752 let Some(page) = element.page_number() else {
8753 return false;
8754 };
8755
8756 let mut page_top = f64::MIN;
8757 for candidate in &doc.kids {
8758 if candidate.page_number() == Some(page)
8759 && matches!(
8760 candidate,
8761 ContentElement::Paragraph(_)
8762 | ContentElement::TextBlock(_)
8763 | ContentElement::TextLine(_)
8764 | ContentElement::Heading(_)
8765 | ContentElement::NumberHeading(_)
8766 | ContentElement::Caption(_)
8767 )
8768 {
8769 page_top = page_top.max(candidate.bbox().top_y);
8770 }
8771 }
8772 if !page_top.is_finite() || element.bbox().top_y < page_top - 72.0 {
8773 return false;
8774 }
8775
8776 for prior_idx in 0..idx {
8777 let prior = &doc.kids[prior_idx];
8778 let prior_text = extract_element_text(prior);
8779 let prior_trimmed = prior_text.trim();
8780 if prior_trimmed.is_empty()
8781 || is_standalone_page_number(prior_trimmed)
8782 || looks_like_footer_banner(prior_trimmed)
8783 {
8784 continue;
8785 }
8786 match prior {
8787 ContentElement::Paragraph(_)
8788 | ContentElement::TextBlock(_)
8789 | ContentElement::TextLine(_) => {
8790 if !starts_with_caption_prefix(prior_trimmed)
8791 && !looks_like_top_margin_running_header(doc, prior_idx, prior_trimmed)
8792 {
8793 return false;
8794 }
8795 }
8796 ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
8797 if !should_skip_heading_text(prior_trimmed) {
8798 return false;
8799 }
8800 }
8801 _ => return false,
8802 }
8803 }
8804
8805 for lookahead_idx in idx + 1..doc.kids.len().min(idx + 8) {
8806 let next = &doc.kids[lookahead_idx];
8807 if next.page_number() != Some(page) {
8808 break;
8809 }
8810 let next_text = extract_element_text(next);
8811 let next_trimmed = next_text.trim();
8812 if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
8813 continue;
8814 }
8815
8816 let is_numbered_heading = match next {
8817 ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
8818 looks_like_numbered_section(next_trimmed)
8819 || looks_like_keyword_numbered_section(next_trimmed)
8820 }
8821 ContentElement::Paragraph(_)
8822 | ContentElement::TextBlock(_)
8823 | ContentElement::TextLine(_) => {
8824 should_render_paragraph_as_heading(
8825 doc,
8826 lookahead_idx,
8827 next_trimmed,
8828 doc.kids.get(lookahead_idx + 1),
8829 ) && (looks_like_numbered_section(next_trimmed)
8830 || looks_like_keyword_numbered_section(next_trimmed))
8831 }
8832 _ => false,
8833 };
8834
8835 if is_numbered_heading {
8836 return true;
8837 }
8838
8839 if !starts_with_caption_prefix(next_trimmed) && next_trimmed.split_whitespace().count() >= 5
8840 {
8841 return false;
8842 }
8843 }
8844
8845 false
8846}
8847
/// Appends the trimmed `next` to `target`.
///
/// When `target` ends in a letter followed by `-` and `next` starts with a
/// lowercase character, the hyphen is removed and the texts are joined
/// directly (de-hyphenation). Otherwise a single space is inserted unless
/// `target` already ends in one.
fn merge_paragraph_text(target: &mut String, next: &str) {
    let addition = next.trim();

    let mut tail = target.chars().rev();
    let dehyphenate = tail.next() == Some('-')
        && tail.next().is_some_and(char::is_alphabetic)
        && addition.chars().next().is_some_and(char::is_lowercase);

    if dehyphenate {
        target.pop();
    } else if !target.ends_with(' ') {
        target.push(' ');
    }
    target.push_str(addition);
}
8867
/// Returns `true` when the trimmed `text` is one to four ASCII digits.
fn is_standalone_page_number(text: &str) -> bool {
    let digits = text.trim();
    (1..=4).contains(&digits.len()) && digits.bytes().all(|b| b.is_ascii_digit())
}
8872
8873fn looks_like_margin_page_number(doc: &PdfDocument, element: &ContentElement, text: &str) -> bool {
8874 if !is_standalone_page_number(text) {
8875 return false;
8876 }
8877
8878 let bbox = element.bbox();
8879 if bbox.height() > 24.0 {
8880 return false;
8881 }
8882
8883 let Some(page) = element.page_number() else {
8884 return false;
8885 };
8886
8887 let mut page_top = f64::MIN;
8888 let mut page_bottom = f64::MAX;
8889 for candidate in &doc.kids {
8890 if candidate.page_number() == Some(page) {
8891 let candidate_bbox = candidate.bbox();
8892 page_top = page_top.max(candidate_bbox.top_y);
8893 page_bottom = page_bottom.min(candidate_bbox.bottom_y);
8894 }
8895 }
8896
8897 if !page_top.is_finite() || !page_bottom.is_finite() {
8898 return false;
8899 }
8900
8901 bbox.top_y >= page_top - 24.0 || bbox.bottom_y <= page_bottom + 24.0
8902}
8903
8904fn looks_like_bottom_margin_heading(doc: &PdfDocument, idx: usize) -> bool {
8909 let element = &doc.kids[idx];
8910 let bbox = element.bbox();
8911 if bbox.height() > 30.0 {
8912 return false;
8913 }
8914
8915 let Some(page) = element.page_number() else {
8916 return false;
8917 };
8918
8919 let mut page_bottom = f64::MAX;
8920 for candidate in &doc.kids {
8921 if candidate.page_number() == Some(page) {
8922 page_bottom = page_bottom.min(candidate.bbox().bottom_y);
8923 }
8924 }
8925
8926 if !page_bottom.is_finite() {
8927 return false;
8928 }
8929
8930 bbox.bottom_y <= page_bottom + 24.0
8932}
8933
8934fn should_demote_period_heading(text: &str) -> bool {
8938 let trimmed = text.trim();
8939 if !trimmed.ends_with('.') {
8940 return false;
8941 }
8942 if looks_like_numbered_section(trimmed) || looks_like_keyword_numbered_section(trimmed) {
8945 return false;
8946 }
8947 let without_dot = trimmed.trim_end_matches('.');
8951 let word_count = without_dot.split_whitespace().count();
8952 if word_count <= 2 {
8955 return true;
8956 }
8957 false
8958}
8959
/// Returns `true` when `text`, ignoring surrounding whitespace, ends in a comma.
fn should_demote_comma_heading(text: &str) -> bool {
    text.trim_end().chars().next_back() == Some(',')
}
8965
/// Returns `true` when `text` contains any of a fixed set of characters
/// (vulgar fractions, `≪`, `≫`, `þ`, `ð` and several math operators) whose
/// presence disqualifies the text as a heading.
fn should_demote_math_heading(text: &str) -> bool {
    const MATH_MARKERS: [char; 15] = [
        '¼', '½', '¾', '≪', '≫', 'þ', 'ð', '∑', '∫', '∂', '∏', '√', '∞', '≈', '÷',
    ];
    text.chars().any(|c| MATH_MARKERS.contains(&c))
}
8989
/// Returns `true` when `text` contains a `%` character anywhere.
fn should_demote_percentage_heading(text: &str) -> bool {
    text.chars().any(|c| c == '%')
}
8995
/// Returns `true` when the trimmed `text` starts like a bibliography entry's
/// year: four ASCII digits and a `.`, followed either by a space or by
/// nothing at all (`"2010. Some title"` or just `"2010."`).
fn should_demote_bibliography_heading(text: &str) -> bool {
    let bytes = text.trim().as_bytes();
    // Five bytes ("YYYY.") is the shortest possible match. The bare-year form
    // must be admitted here, otherwise the "nothing follows" case below could
    // never be reached.
    if bytes.len() < 5 {
        return false;
    }

    let (year, tail) = bytes.split_at(4);
    year.iter().all(u8::is_ascii_digit)
        && tail[0] == b'.'
        && tail.get(1).map_or(true, |&b| b == b' ')
}
9008
/// Removes a trailing page number from `text`.
///
/// If the last space-separated token of the trimmed text is one to four ASCII
/// digits and at least three words precede it, returns the preceding text
/// (trimmed). Otherwise returns the trimmed text unchanged.
fn strip_trailing_page_number(text: &str) -> &str {
    let trimmed = text.trim();
    match trimmed.rsplit_once(' ') {
        Some((head, tail))
            if !tail.is_empty()
                && tail.len() <= 4
                && tail.bytes().all(|b| b.is_ascii_digit())
                && head.split_whitespace().count() >= 3 =>
        {
            head.trim()
        }
        _ => trimmed,
    }
}
9027
/// Finds the byte offset at which a second subsection title appears to have
/// been merged into `text`.
///
/// A split point is a token that follows a space, starts at byte offset 3 or
/// later, and begins with either
///
/// * a dotted number — one or more ASCII digits, a `.`, then a digit
///   (`4.1`, `10.2`), or
/// * a lettered number — an ASCII uppercase letter, a `.`, then a digit
///   (`A.1`).
///
/// Returns the offset of the first such token, or `None` if there is none.
/// The dot must directly follow the leading digits of the token itself; a
/// plain number such as the `5` in "5 items v1.2" is not a split point.
fn find_merged_subsection_split(text: &str) -> Option<usize> {
    /// `tail` starts with `<digits>.<digit>`.
    fn starts_dotted_number(tail: &[u8]) -> bool {
        let digits = tail.iter().take_while(|b| b.is_ascii_digit()).count();
        digits > 0
            && tail.get(digits) == Some(&b'.')
            && tail.get(digits + 1).is_some_and(u8::is_ascii_digit)
    }

    /// `tail` starts with `<uppercase letter>.<digit>`.
    fn starts_lettered_number(tail: &[u8]) -> bool {
        matches!(
            tail,
            [letter, b'.', digit, ..] if letter.is_ascii_uppercase() && digit.is_ascii_digit()
        )
    }

    let bytes = text.as_bytes();
    // Start at 3 so that some title text always precedes the split point.
    (3..bytes.len()).find(|&i| {
        bytes[i - 1] == b' '
            && (starts_dotted_number(&bytes[i..]) || starts_lettered_number(&bytes[i..]))
    })
}
9063
9064fn should_skip_heading_text(text: &str) -> bool {
9065 let trimmed = text.trim();
9066 if trimmed.is_empty() || is_standalone_page_number(trimmed) {
9067 return true;
9068 }
9069
9070 let lower = trimmed.to_ascii_lowercase();
9071 if (lower.starts_with("chapter ") || lower.chars().next().is_some_and(|c| c.is_ascii_digit()))
9072 && trimmed.contains('|')
9073 {
9074 return true;
9075 }
9076
9077 let alpha_count = trimmed.chars().filter(|c| c.is_alphabetic()).count();
9078 let alnum_count = trimmed.chars().filter(|c| c.is_alphanumeric()).count();
9079 alpha_count == 0 || (alnum_count > 0 && alpha_count * 3 < alnum_count && !trimmed.contains(':'))
9080}
9081
/// Re-joins words that were split by a stray space.
///
/// The text is split on whitespace and scanned left to right. A word is glued
/// onto the (possibly already merged) word before it when, after stripping
/// non-alphabetic characters from both ends of each:
/// * both are non-empty and purely alphabetic,
/// * at least one of them has four letters or fewer,
/// * together they have six letters or more,
/// * the right-hand word does not start with an uppercase letter, and
/// * neither is a common English stopword.
///
/// Lengths are measured in characters, not bytes, so accented and other
/// non-ASCII letters are judged by the same thresholds as ASCII ones.
///
/// Returns `text` unchanged when it holds fewer than two words; otherwise the
/// words are re-joined with single spaces.
fn repair_fragmented_words(text: &str) -> String {
    const STOPWORDS: &[&str] = &[
        "a", "an", "and", "are", "as", "at", "be", "by", "can", "for", "from", "if", "in", "into",
        "is", "it", "may", "must", "not", "of", "on", "or", "per", "that", "the", "to", "with",
    ];

    /// Decides whether `right` looks like a broken-off continuation of `left`.
    fn should_join(left: &str, right: &str) -> bool {
        let left = left.trim_matches(|c: char| !c.is_alphabetic());
        let right = right.trim_matches(|c: char| !c.is_alphabetic());
        if left.is_empty() || right.is_empty() {
            return false;
        }
        if !left.chars().all(char::is_alphabetic) || !right.chars().all(char::is_alphabetic) {
            return false;
        }
        if right.chars().next().is_some_and(char::is_uppercase) {
            return false;
        }

        // Count letters rather than UTF-8 bytes so multi-byte letters are not
        // mistaken for long fragments.
        let left_letters = left.chars().count();
        let right_letters = right.chars().count();
        if left_letters > 4 && right_letters > 4 {
            return false;
        }
        if left_letters + right_letters < 6 {
            return false;
        }

        let is_stopword = |word: &str| STOPWORDS.contains(&word.to_ascii_lowercase().as_str());
        !is_stopword(left) && !is_stopword(right)
    }

    let words: Vec<&str> = text.split_whitespace().collect();
    if words.len() < 2 {
        return text.to_string();
    }

    // Each incoming word is compared with the last (possibly merged) output
    // word, so a run of fragments collapses into one word.
    let mut repaired: Vec<String> = Vec::with_capacity(words.len());
    for word in words {
        let joins = repaired
            .last()
            .is_some_and(|previous| should_join(previous, word));
        if joins {
            if let Some(previous) = repaired.last_mut() {
                previous.push_str(word);
            }
        } else {
            repaired.push(word.to_string());
        }
    }

    repaired.join(" ")
}
9122
9123fn list_item_text_from_contents(contents: &[ContentElement]) -> String {
9125 let mut text = String::new();
9126 for elem in contents {
9127 let part = match elem {
9128 ContentElement::Paragraph(p) => p.base.value(),
9129 ContentElement::TextBlock(tb) => tb.value(),
9130 ContentElement::TextLine(tl) => tl.value(),
9131 ContentElement::TextChunk(tc) => tc.value.clone(),
9132 _ => String::new(),
9133 };
9134 if !text.is_empty() && !part.is_empty() {
9135 text.push(' ');
9136 }
9137 text.push_str(&part);
9138 }
9139 text
9140}
9141
/// Reports whether `row` has at least one blank cell sitting between two
/// non-blank cells. Leading and trailing blanks do not count.
fn has_internal_header_gap(row: &[String]) -> bool {
    let mut previous_filled: Option<usize> = None;
    for (idx, cell) in row.iter().enumerate() {
        if cell.trim().is_empty() {
            continue;
        }
        // A jump of more than one column since the last filled cell means
        // blank cells were skipped in between.
        if previous_filled.is_some_and(|prev| idx - prev > 1) {
            return true;
        }
        previous_filled = Some(idx);
    }
    false
}
9159
/// Spreads grouped header labels from `parent` over the columns its `child`
/// header row occupies.
///
/// For each column where `parent` is blank and `child` is not, the cell is
/// filled with the trimmed label of the nearest non-blank `parent` column.
/// When two labels are equally near, the one further right wins.
///
/// The result always has `parent.len()` cells. Columns that exist only in
/// `child` have no slot in the parent row and are ignored, so rows of unequal
/// length no longer cause an out-of-bounds panic. When `parent` has no
/// non-blank cell it is returned unchanged.
fn expand_grouped_header_row(parent: &[String], child: &[String]) -> Vec<String> {
    let anchor_cols: Vec<usize> = parent
        .iter()
        .enumerate()
        .filter_map(|(idx, cell)| (!cell.trim().is_empty()).then_some(idx))
        .collect();

    let mut expanded = parent.to_vec();
    if anchor_cols.is_empty() {
        return expanded;
    }

    // `zip` stops at the shorter row, which bounds every access to `expanded`.
    for (col_idx, (slot, child_cell)) in expanded.iter_mut().zip(child).enumerate() {
        if !slot.trim().is_empty() || child_cell.trim().is_empty() {
            continue;
        }

        // Smallest distance first; `Reverse` makes the larger column index
        // win a tie.
        let nearest = anchor_cols
            .iter()
            .copied()
            .min_by_key(|&anchor| (anchor.abs_diff(col_idx), std::cmp::Reverse(anchor)));
        if let Some(anchor) = nearest {
            *slot = parent[anchor].trim().to_string();
        }
    }

    expanded
}
9190
9191fn preserve_grouped_header_rows(rows: &mut [Vec<String>]) -> bool {
9192 if rows.len() < 2 || rows[0].is_empty() || rows[1].is_empty() {
9193 return false;
9194 }
9195 if rows[0].first().is_none_or(|cell| cell.trim().is_empty()) {
9196 return false;
9197 }
9198 if rows[1].first().is_some_and(|cell| !cell.trim().is_empty()) {
9199 return false;
9200 }
9201
9202 let first_filled = rows[0]
9203 .iter()
9204 .filter(|cell| !cell.trim().is_empty())
9205 .count();
9206 let second_filled = rows[1]
9207 .iter()
9208 .filter(|cell| !cell.trim().is_empty())
9209 .count();
9210 if first_filled < 2 || second_filled <= first_filled || !has_internal_header_gap(&rows[0]) {
9211 return false;
9212 }
9213
9214 rows[0] = expand_grouped_header_row(&rows[0], &rows[1]);
9215 true
9216}
9217
9218fn merge_continuation_rows(rows: &mut Vec<Vec<String>>) {
9229 if rows.len() < 2 {
9230 return;
9231 }
9232 if preserve_grouped_header_rows(rows) {
9233 return;
9234 }
9235 if rows[0].first().is_none_or(|c| c.trim().is_empty()) {
9237 return;
9238 }
9239
9240 let mut merge_count = 0usize;
9241 for (i, row_i) in rows.iter().enumerate().skip(1) {
9242 let first_empty = row_i.first().is_none_or(|c| c.trim().is_empty());
9243 if !first_empty {
9244 break; }
9246 let all_short = row_i
9248 .iter()
9249 .all(|c| c.trim().is_empty() || c.trim().len() <= 30);
9250 if !all_short {
9251 break;
9252 }
9253 merge_count = i;
9254 }
9255
9256 if merge_count == 0 {
9259 return;
9260 }
9261
9262 for i in 1..=merge_count {
9264 let (head, tail) = rows.split_at_mut(i);
9265 let ncols = head[0].len().min(tail[0].len());
9266 for (target, src) in head[0]
9267 .iter_mut()
9268 .take(ncols)
9269 .zip(tail[0].iter().take(ncols))
9270 {
9271 let fragment = src.trim().to_string();
9272 if !fragment.is_empty() {
9273 let target_str = target.trim().to_string();
9274 *target = if target_str.is_empty() {
9275 fragment
9276 } else {
9277 format!("{} {}", target_str, fragment)
9278 };
9279 }
9280 }
9281 }
9282
9283 rows.drain(1..=merge_count);
9285}
9286
9287fn trim_leading_table_carryover_rows(rows: &mut Vec<Vec<String>>) {
9288 while first_body_row_looks_like_carryover(rows) {
9289 rows.remove(1);
9290 }
9291}
9292
9293fn first_body_row_looks_like_carryover(rows: &[Vec<String>]) -> bool {
9294 if rows.len() < 3 {
9295 return false;
9296 }
9297
9298 let key_col_count = infer_leading_key_column_count(&rows[1..]);
9299 if key_col_count == 0 {
9300 return false;
9301 }
9302
9303 let candidate = &rows[1];
9304 if candidate
9305 .iter()
9306 .take(key_col_count)
9307 .any(|cell| !cell.trim().is_empty())
9308 {
9309 return false;
9310 }
9311
9312 let non_empty_cols = candidate
9313 .iter()
9314 .enumerate()
9315 .filter(|(_, cell)| !cell.trim().is_empty())
9316 .map(|(idx, _)| idx)
9317 .collect::<Vec<_>>();
9318 if non_empty_cols.len() != 1 {
9319 return false;
9320 }
9321
9322 let only_col = non_empty_cols[0];
9323 if only_col < key_col_count {
9324 return false;
9325 }
9326
9327 if candidate[only_col].split_whitespace().count() < 4 {
9328 return false;
9329 }
9330
9331 rows[2]
9332 .iter()
9333 .take(key_col_count)
9334 .all(|cell| !cell.trim().is_empty())
9335}
9336
/// Counts how many leading columns of `rows` behave like key columns.
///
/// Columns are examined left to right. A column qualifies when at least 60%
/// of the rows have a non-blank cell in it and the median word count of those
/// cells (upper median for an even count) is at most three. Counting stops at
/// the first column that does not qualify, including a completely blank one.
///
/// Returns 0 when fewer than two rows are given.
fn infer_leading_key_column_count(rows: &[Vec<String>]) -> usize {
    if rows.len() < 2 {
        return 0;
    }

    let widest = rows.iter().map(Vec::len).max().unwrap_or(0);
    (0..widest)
        .take_while(|&col| {
            let mut words_per_cell: Vec<usize> = rows
                .iter()
                .filter_map(|row| row.get(col))
                .map(|cell| cell.trim())
                .filter(|cell| !cell.is_empty())
                .map(|cell| cell.split_whitespace().count())
                .collect();
            if words_per_cell.is_empty() {
                return false;
            }

            words_per_cell.sort_unstable();
            let median_words = words_per_cell[words_per_cell.len() / 2];
            let occupancy_ratio = words_per_cell.len() as f64 / rows.len() as f64;
            occupancy_ratio >= 0.6 && median_words <= 3
        })
        .count()
}
9374
9375fn render_table(out: &mut String, table: &crate::models::semantic::SemanticTable) {
9377 render_table_border(out, &table.table_border);
9379}
9380
/// A contiguous run of top-level document elements (`doc.kids`) that is
/// replaced by a single block of pre-rendered output.
#[derive(Clone, Debug)]
struct GeometricTableRegion {
    /// Index into `doc.kids` of the first element the region covers.
    start_idx: usize,
    /// Index into `doc.kids` of the last element the region covers (inclusive).
    end_idx: usize,
    /// Rendered output for the whole region.
    rendered: String,
}
9387
/// A single text line reduced to the data the table reconstruction needs.
#[derive(Clone)]
struct ChunkLine {
    /// Bounding box of the line.
    bbox: BoundingBox,
    /// Text chunks that make up the line.
    chunks: Vec<TextChunk>,
}
9393
/// A piece of a text line that has been assigned to one column slot.
#[derive(Clone)]
struct SlotFragment {
    /// Index of the slot (position within the slot range list) that receives
    /// the text.
    slot_idx: usize,
    /// Bounding box of the fragment; its width is used to detect lines that
    /// span most of the table. NOTE(review): presumably the union of the
    /// chunks assigned to the slot — confirm in `split_line_into_slot_fragments`.
    bbox: BoundingBox,
    /// Text appended to the slot's cell.
    text: String,
}
9400
9401fn detect_geometric_table_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
9402 let mut regions = Vec::new();
9403 let mut occupied_until = 0usize;
9404
9405 for (idx, element) in doc.kids.iter().enumerate() {
9406 if idx < occupied_until {
9407 continue;
9408 }
9409
9410 let Some(table) = table_border_from_element(element) else {
9411 continue;
9412 };
9413 let Some(region) = build_geometric_table_region(doc, idx, table) else {
9414 continue;
9415 };
9416 occupied_until = region.end_idx.saturating_add(1);
9417 regions.push(region);
9418 }
9419
9420 let mut occupied = regions
9421 .iter()
9422 .flat_map(|region| region.start_idx..=region.end_idx)
9423 .collect::<HashSet<_>>();
9424 for region in detect_footnote_citation_regions(doc) {
9425 if (region.start_idx..=region.end_idx).any(|idx| occupied.contains(&idx)) {
9426 continue;
9427 }
9428 occupied.extend(region.start_idx..=region.end_idx);
9429 regions.push(region);
9430 }
9431
9432 regions.sort_by_key(|region| region.start_idx);
9433 regions
9434}
9435
9436fn detect_footnote_citation_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
9437 let body_font_size = compute_running_body_font_size(doc);
9438 if body_font_size <= 0.0 {
9439 return Vec::new();
9440 }
9441
9442 let mut regions = Vec::new();
9443 let mut idx = 0usize;
9444 while idx < doc.kids.len() {
9445 let Some(region) = build_footnote_citation_region(doc, idx, body_font_size) else {
9446 idx += 1;
9447 continue;
9448 };
9449 idx = region.end_idx.saturating_add(1);
9450 regions.push(region);
9451 }
9452
9453 regions
9454}
9455
9456fn compute_running_body_font_size(doc: &PdfDocument) -> f64 {
9457 doc.kids
9458 .iter()
9459 .filter_map(|element| {
9460 let ContentElement::Paragraph(paragraph) = element else {
9461 return None;
9462 };
9463 let text = paragraph.base.value();
9464 (text.split_whitespace().count() > 10).then_some(paragraph.base.font_size?)
9465 })
9466 .fold(0.0_f64, f64::max)
9467}
9468
/// Tries to build a footnote/citation table region anchored at
/// `doc.kids[start_idx]`.
///
/// The anchor may be either
/// * a small-font element that itself starts with a footnote marker (an
///   earlier lead-in found by `leading_footnote_attachment` is then pulled
///   into the region), or
/// * any other text element whose text ends in a footnote lead (see
///   `split_trailing_footnote_lead`) and is directly followed by a small-font
///   text element on the same page and in the same column.
///
/// Following small-font elements on the same page and column are appended as
/// fragments. The fragments are parsed into `(marker, citation)` rows and the
/// region is accepted only when there are at least three rows, every marker
/// is numeric, and at most one adjacent pair of markers fails to increase by
/// exactly one.
///
/// Returns `None` whenever any of these conditions is not met. On success the
/// rendered output is the optional lead-in text followed by an HTML table
/// with the header `Footnote` / `Citation`.
fn build_footnote_citation_region(
    doc: &PdfDocument,
    start_idx: usize,
    body_font_size: f64,
) -> Option<GeometricTableRegion> {
    let element = doc.kids.get(start_idx)?;
    if !is_geometric_text_candidate(element) {
        return None;
    }

    let start_text = extract_element_text(element);
    let trimmed_start = start_text.trim();
    if trimmed_start.is_empty() {
        return None;
    }

    // "Small" means at most 92% of the body size and at least 0.8 below it,
    // never negative.
    let small_font_threshold = (body_font_size * 0.92).min(body_font_size - 0.8).max(0.0);
    let mut lead_prefix = None;
    let mut fragments = Vec::new();
    let page_number = element.page_number()?;
    let mut column_bbox = element.bbox().clone();
    let mut region_start_idx = start_idx;
    let mut end_idx = start_idx;

    if element_font_size(element).is_some_and(|font_size| font_size <= small_font_threshold)
        && starts_with_footnote_marker(trimmed_start)
    {
        // The anchor is already footnote text; look backwards for the body
        // text that introduced it and any small-font fragments in between.
        if let Some((attach_idx, prefix, leading_fragments)) = leading_footnote_attachment(
            doc,
            start_idx,
            page_number,
            &column_bbox,
            small_font_threshold,
        ) {
            lead_prefix = Some(prefix);
            fragments.extend(leading_fragments);
            region_start_idx = attach_idx;
        }
        fragments.push(footnote_fragment_text(element));
    } else {
        // The anchor is regular text that ends with the first footnote; the
        // very next element must continue in small font in the same column.
        let (prefix, first_tail) = split_trailing_footnote_lead(trimmed_start)?;
        let next = doc.kids.get(start_idx + 1)?;
        if !is_geometric_text_candidate(next)
            || next.page_number() != Some(page_number)
            || !element_font_size(next).is_some_and(|font_size| font_size <= small_font_threshold)
        {
            return None;
        }
        if !same_column_region(&column_bbox, next.bbox()) {
            return None;
        }
        lead_prefix = Some(prefix);
        fragments.push(first_tail);
    }

    // Extend the region over following small-font text in the same column.
    let mut consecutive_small = 0usize;
    for idx in start_idx + 1..doc.kids.len() {
        let candidate = &doc.kids[idx];
        if !is_geometric_text_candidate(candidate) || candidate.page_number() != Some(page_number) {
            break;
        }

        let candidate_text = extract_element_text(candidate);
        let trimmed = candidate_text.trim();
        if trimmed.is_empty() || starts_with_caption_prefix(trimmed) {
            break;
        }

        let Some(font_size) = element_font_size(candidate) else {
            break;
        };
        if font_size > small_font_threshold {
            break;
        }
        if !same_column_region(&column_bbox, candidate.bbox()) {
            break;
        }

        // Grow the column box so later candidates are compared against the
        // whole region collected so far.
        column_bbox = column_bbox.union(candidate.bbox());
        fragments.push(footnote_fragment_text(candidate));
        consecutive_small += 1;
        end_idx = idx;
    }

    // A lead-in without any following small-font text is not a footnote block.
    if consecutive_small == 0 && lead_prefix.is_some() {
        return None;
    }

    let rows = parse_footnote_citation_rows(&fragments);
    if rows.len() < 3 {
        return None;
    }

    // Every marker must be a number.
    let numeric_markers = rows
        .iter()
        .filter_map(|(marker, _)| marker.parse::<u32>().ok())
        .collect::<Vec<_>>();
    if numeric_markers.len() != rows.len() {
        return None;
    }
    // Count adjacent markers that increase by exactly one; tolerate at most
    // one pair that does not.
    let sequential_steps = numeric_markers
        .windows(2)
        .filter(|pair| pair[1] == pair[0] + 1)
        .count();
    if sequential_steps + 1 < rows.len().saturating_sub(1) {
        return None;
    }

    let mut rendered_rows = vec![vec!["Footnote".to_string(), "Citation".to_string()]];
    rendered_rows.extend(
        rows.into_iter()
            .map(|(marker, citation)| vec![marker, citation]),
    );

    // The lead-in text, when present, is emitted as its own paragraph ahead
    // of the table.
    let mut rendered = String::new();
    if let Some(prefix) = lead_prefix {
        rendered.push_str(&escape_md_line_start(prefix.trim()));
        rendered.push_str("\n\n");
    }
    rendered.push_str(&render_html_table(&rendered_rows));

    Some(GeometricTableRegion {
        start_idx: region_start_idx,
        end_idx,
        rendered,
    })
}
9596
9597fn leading_footnote_attachment(
9598 doc: &PdfDocument,
9599 start_idx: usize,
9600 page_number: u32,
9601 column_bbox: &BoundingBox,
9602 small_font_threshold: f64,
9603) -> Option<(usize, String, Vec<String>)> {
9604 let mut idx = start_idx.checked_sub(1)?;
9605 let mut leading_fragments = Vec::new();
9606 let mut scanned = 0usize;
9607
9608 loop {
9609 let candidate = doc.kids.get(idx)?;
9610 scanned += 1;
9611 if scanned > 6 || candidate.page_number() != Some(page_number) {
9612 return None;
9613 }
9614
9615 if !is_geometric_text_candidate(candidate) {
9616 if idx == 0 {
9617 return None;
9618 }
9619 idx -= 1;
9620 continue;
9621 }
9622
9623 let text = extract_element_text(candidate);
9624 let trimmed = text.trim();
9625 if trimmed.is_empty() {
9626 if idx == 0 {
9627 return None;
9628 }
9629 idx -= 1;
9630 continue;
9631 }
9632 if !same_column_region(candidate.bbox(), column_bbox) {
9633 return None;
9634 }
9635
9636 if element_font_size(candidate).is_some_and(|font_size| font_size <= small_font_threshold) {
9637 leading_fragments.push(footnote_fragment_text(candidate));
9638 if idx == 0 {
9639 return None;
9640 }
9641 idx -= 1;
9642 continue;
9643 }
9644
9645 let (prefix, first_tail) = split_trailing_footnote_lead(trimmed)?;
9646 leading_fragments.push(first_tail);
9647 leading_fragments.reverse();
9648 return Some((idx, prefix, leading_fragments));
9649 }
9650}
9651
9652fn parse_footnote_citation_rows(fragments: &[String]) -> Vec<(String, String)> {
9653 let mut rows = Vec::new();
9654 let mut current_marker = None::<String>;
9655 let mut current_citation = String::new();
9656
9657 for fragment in fragments {
9658 let markers = find_footnote_marker_positions(fragment);
9659 if markers.is_empty() {
9660 if current_marker.is_some() {
9661 merge_paragraph_text(&mut current_citation, fragment.trim());
9662 }
9663 continue;
9664 }
9665
9666 let mut cursor = 0usize;
9667 for (pos, marker, skip_len) in markers {
9668 let prefix = fragment[cursor..pos].trim();
9669 if current_marker.is_some() && !prefix.is_empty() {
9670 merge_paragraph_text(&mut current_citation, prefix);
9671 }
9672 if let Some(marker_value) = current_marker.take() {
9673 let trimmed = current_citation.trim();
9674 if !trimmed.is_empty() {
9675 rows.push((marker_value, trimmed.to_string()));
9676 }
9677 current_citation.clear();
9678 }
9679 current_marker = Some(marker);
9680 cursor = pos + skip_len;
9681 }
9682
9683 let tail = fragment[cursor..].trim();
9684 if current_marker.is_some() && !tail.is_empty() {
9685 merge_paragraph_text(&mut current_citation, tail);
9686 }
9687 }
9688
9689 if let Some(marker_value) = current_marker {
9690 let trimmed = current_citation.trim();
9691 if !trimmed.is_empty() {
9692 rows.push((marker_value, trimmed.to_string()));
9693 }
9694 }
9695
9696 rebalance_adjacent_footnote_citations(&mut rows);
9697 rows
9698}
9699
9700fn rebalance_adjacent_footnote_citations(rows: &mut [(String, String)]) {
9701 for idx in 0..rows.len().saturating_sub(1) {
9702 if !rows[idx].1.trim_end().ends_with(',') {
9703 continue;
9704 }
9705
9706 let next = rows[idx + 1].1.trim().to_string();
9707 let Some((stub, remainder)) = split_leading_citation_stub(&next) else {
9708 continue;
9709 };
9710 let Some((first_sentence, trailing)) = split_first_sentence(remainder) else {
9711 continue;
9712 };
9713 if first_sentence.split_whitespace().count() < 2 {
9714 continue;
9715 }
9716
9717 merge_paragraph_text(&mut rows[idx].1, first_sentence);
9718 rows[idx + 1].1 = if trailing.is_empty() {
9719 stub.to_string()
9720 } else {
9721 format!("{stub} {trailing}")
9722 };
9723 }
9724}
9725
/// Splits `text` at its first comma into a short stub and the remainder.
///
/// The stub keeps the comma; both parts are trimmed. Returns `None` when
/// there is no comma, when the comma lies more than eight bytes into the
/// text, or when either part is empty.
fn split_leading_citation_stub(text: &str) -> Option<(&str, &str)> {
    let (head, rest) = text.split_once(',')?;
    if head.len() > 8 {
        return None;
    }

    // `head.len()` is the comma's byte offset; the inclusive slice keeps it.
    let stub = text[..=head.len()].trim();
    let remainder = rest.trim();
    if stub.is_empty() || remainder.is_empty() {
        None
    } else {
        Some((stub, remainder))
    }
}
9735
/// Splits `text` after the first `". "` into the first sentence (including
/// its period) and the trimmed remainder.
///
/// Returns `None` when `text` contains no period followed by a space.
fn split_first_sentence(text: &str) -> Option<(&str, &str)> {
    let (before_period, rest) = text.split_once(". ")?;

    // Extend the slice by one byte so the sentence keeps its period.
    let first = text[..before_period.len() + 1].trim();
    if first.is_empty() {
        return None;
    }
    Some((first, rest.trim()))
}
9742
/// Finds numeric footnote markers inside `text`.
///
/// A marker is a run of one or two ASCII digits that
/// * starts the text or follows whitespace or one of `. , ; : ) ] " ' ”`,
/// * is followed by at least one whitespace character, and
/// * after that whitespace continues with an ASCII uppercase letter, `(`,
///   `[` or `*`.
///
/// Each entry is `(byte offset of the marker, marker digits, byte length of
/// the marker plus the whitespace after it)`. Entries are in text order.
fn find_footnote_marker_positions(text: &str) -> Vec<(usize, String, usize)> {
    let mut found = Vec::new();
    let mut pos = 0usize;

    while pos < text.len() {
        let rest = &text[pos..];
        let Some(ch) = rest.chars().next() else {
            break;
        };
        let step = ch.len_utf8();
        if !ch.is_ascii_digit() {
            pos += step;
            continue;
        }

        let at_boundary = match text[..pos].chars().next_back() {
            None => true,
            Some(prev) => {
                prev.is_whitespace()
                    || matches!(prev, '.' | ',' | ';' | ':' | ')' | ']' | '"' | '\'' | '”')
            }
        };
        if !at_boundary {
            pos += step;
            continue;
        }

        // ASCII digits are one byte each, so byte counts equal char counts.
        let digit_len = rest.bytes().take_while(|b| b.is_ascii_digit()).count();
        let after_digits = &rest[digit_len..];
        let after_gap = after_digits.trim_start_matches(char::is_whitespace);
        let gap_len = after_digits.len() - after_gap.len();

        let starts_citation = after_gap
            .chars()
            .next()
            .is_some_and(|next| next.is_ascii_uppercase() || matches!(next, '(' | '[' | '*'));
        if digit_len > 2 || gap_len == 0 || !starts_citation {
            pos += step;
            continue;
        }

        found.push((pos, rest[..digit_len].to_string(), digit_len + gap_len));
        pos += digit_len + gap_len;
    }

    found
}
9803
9804fn split_trailing_footnote_lead(text: &str) -> Option<(String, String)> {
9805 let markers = find_footnote_marker_positions(text);
9806 let (pos, marker, skip_len) = markers.last()?.clone();
9807 let prefix = text[..pos].trim();
9808 let tail = text[pos + skip_len..].trim();
9809 if prefix.split_whitespace().count() < 6 || tail.split_whitespace().count() > 6 {
9810 return None;
9811 }
9812 Some((prefix.to_string(), format!("{marker} {tail}")))
9813}
9814
9815fn starts_with_footnote_marker(text: &str) -> bool {
9816 find_footnote_marker_positions(text)
9817 .first()
9818 .is_some_and(|(pos, _, _)| *pos == 0)
9819}
9820
9821fn same_column_region(left: &BoundingBox, right: &BoundingBox) -> bool {
9822 let overlap = (left.right_x.min(right.right_x) - left.left_x.max(right.left_x)).max(0.0);
9823 let min_width = left.width().min(right.width()).max(1.0);
9824 overlap / min_width >= 0.35 || (left.left_x - right.left_x).abs() <= 28.0
9825}
9826
9827fn footnote_fragment_text(element: &ContentElement) -> String {
9828 let text = extract_element_text(element);
9829 if element_font_name(element)
9830 .as_deref()
9831 .is_some_and(|name| name.to_ascii_lowercase().contains("italic"))
9832 {
9833 format!("*{}*", text.trim())
9834 } else {
9835 text
9836 }
9837}
9838
9839fn element_font_size(element: &ContentElement) -> Option<f64> {
9840 match element {
9841 ContentElement::Paragraph(p) => p.base.font_size,
9842 ContentElement::Heading(h) => h.base.base.font_size,
9843 ContentElement::NumberHeading(nh) => nh.base.base.base.font_size,
9844 ContentElement::TextBlock(tb) => Some(tb.font_size),
9845 ContentElement::TextLine(tl) => Some(tl.font_size),
9846 _ => None,
9847 }
9848}
9849
9850fn element_font_name(element: &ContentElement) -> Option<String> {
9851 match element {
9852 ContentElement::Paragraph(p) => p.base.font_name.clone(),
9853 ContentElement::Heading(h) => h.base.base.font_name.clone(),
9854 ContentElement::NumberHeading(nh) => nh.base.base.base.font_name.clone(),
9855 _ => None,
9856 }
9857}
9858
9859fn table_border_from_element(
9860 element: &ContentElement,
9861) -> Option<&crate::models::table::TableBorder> {
9862 match element {
9863 ContentElement::TableBorder(table) => Some(table),
9864 ContentElement::Table(table) => Some(&table.table_border),
9865 _ => None,
9866 }
9867}
9868
/// Tries to rebuild a bordered table together with header text that sits
/// above it (and, in one mode, text directly below it) as a single pipe table.
///
/// The table must have at least three columns and at least one row, and there
/// must be candidate header elements directly above it. One of two layouts
/// has to apply:
/// * an external stub column is needed (`infer_left_stub_requirement`), in
///   which case an extra leading slot is added for it, or
/// * the table's own first column can act as the stub
///   (`supports_embedded_stub_header`).
///
/// Header rows are reconstructed by aligning the candidate text to the slot
/// ranges; at least one of them must fill nearly every slot. Returns `None`
/// whenever a precondition fails.
///
/// The returned region starts at the first header candidate and ends at the
/// last footer candidate, or at the table itself when there is none.
fn build_geometric_table_region(
    doc: &PdfDocument,
    table_idx: usize,
    table: &crate::models::table::TableBorder,
) -> Option<GeometricTableRegion> {
    let mut table_rows = collect_table_border_rows(table);
    if table_rows.is_empty() || table.num_columns < 3 {
        return None;
    }
    merge_continuation_rows(&mut table_rows);

    let column_ranges = table_column_ranges(table)?;
    let candidate_indices = collect_table_header_candidate_indices(doc, table_idx, table);
    if candidate_indices.is_empty() {
        return None;
    }

    let needs_external_stub =
        infer_left_stub_requirement(doc, &candidate_indices, &table_rows, &column_ranges);
    let supports_embedded_stub_header =
        supports_embedded_stub_header(&table_rows, &column_ranges, doc, &candidate_indices);
    if !needs_external_stub && !supports_embedded_stub_header {
        return None;
    }
    // With an external stub the slots are the stub plus the table columns;
    // otherwise the slots are exactly the table columns.
    let slot_ranges = if needs_external_stub {
        slot_ranges(&column_ranges, doc, &candidate_indices, true)?
    } else {
        column_ranges.clone()
    };
    let mut header_rows = reconstruct_aligned_rows(doc, &candidate_indices, &slot_ranges, true, 2);
    if header_rows.is_empty() {
        return None;
    }
    if needs_external_stub {
        normalize_leading_stub_header(&mut header_rows);
    } else {
        promote_embedded_stub_header(&mut header_rows, &table_rows);
    }

    // Require at least one header row that fills all but (at most) one slot,
    // and never fewer than two.
    let slot_count = slot_ranges.len();
    let dense_header_rows = header_rows
        .iter()
        .filter(|row| {
            row.iter().filter(|cell| !cell.trim().is_empty()).count()
                >= slot_count.saturating_sub(1).max(2)
        })
        .count();
    if dense_header_rows == 0 {
        return None;
    }

    let mut combined_rows = Vec::new();
    combined_rows.extend(header_rows);

    let following_indices = collect_table_footer_candidate_indices(doc, table_idx, table);
    let body_rows = if needs_external_stub && should_merge_panel_body_rows(&table_rows) {
        // Panel-style body: collapse the table rows and the aligned text
        // below the table into a single body row.
        let trailing_rows =
            reconstruct_aligned_rows(doc, &following_indices, &slot_ranges, false, 1);
        vec![merge_panel_body_row(
            &table_rows,
            &trailing_rows,
            slot_count,
        )]
    } else if needs_external_stub {
        // Shift every row right by one so it lines up under the added stub slot.
        table_rows
            .iter()
            .map(|row| {
                let mut shifted = vec![String::new()];
                shifted.extend(row.iter().cloned());
                shifted
            })
            .collect()
    } else {
        table_rows
    };

    if body_rows.is_empty() {
        return None;
    }
    combined_rows.extend(body_rows);

    let rendered = render_pipe_rows(&combined_rows);
    Some(GeometricTableRegion {
        start_idx: candidate_indices[0],
        end_idx: following_indices.last().copied().unwrap_or(table_idx),
        rendered,
    })
}
9957
9958fn table_column_ranges(table: &crate::models::table::TableBorder) -> Option<Vec<(f64, f64)>> {
9959 if table.num_columns == 0 {
9960 return None;
9961 }
9962
9963 let mut ranges = vec![(f64::INFINITY, f64::NEG_INFINITY); table.num_columns];
9964 for row in &table.rows {
9965 for cell in &row.cells {
9966 if cell.col_number >= table.num_columns {
9967 continue;
9968 }
9969 let range = &mut ranges[cell.col_number];
9970 range.0 = range.0.min(cell.bbox.left_x);
9971 range.1 = range.1.max(cell.bbox.right_x);
9972 }
9973 }
9974
9975 if ranges
9976 .iter()
9977 .any(|(left, right)| !left.is_finite() || !right.is_finite() || right <= left)
9978 {
9979 return None;
9980 }
9981
9982 Some(ranges)
9983}
9984
9985fn collect_table_header_candidate_indices(
9986 doc: &PdfDocument,
9987 table_idx: usize,
9988 table: &crate::models::table::TableBorder,
9989) -> Vec<usize> {
9990 let mut indices = Vec::new();
9991 let table_page = table.bbox.page_number;
9992 let table_top = table.bbox.top_y;
9993 let mut cursor = table_idx;
9994
9995 while let Some(prev_idx) = cursor.checked_sub(1) {
9996 let element = &doc.kids[prev_idx];
9997 if element.page_number() != table_page {
9998 break;
9999 }
10000 if !is_geometric_text_candidate(element) {
10001 break;
10002 }
10003
10004 let bbox = element.bbox();
10005 let vertical_gap = bbox.bottom_y - table_top;
10006 if !(-6.0..=260.0).contains(&vertical_gap) {
10007 break;
10008 }
10009
10010 indices.push(prev_idx);
10011 cursor = prev_idx;
10012 if indices.len() >= 10 {
10013 break;
10014 }
10015 }
10016
10017 indices.reverse();
10018 indices
10019}
10020
10021fn collect_table_footer_candidate_indices(
10022 doc: &PdfDocument,
10023 table_idx: usize,
10024 table: &crate::models::table::TableBorder,
10025) -> Vec<usize> {
10026 let mut indices = Vec::new();
10027 let table_page = table.bbox.page_number;
10028 let table_bottom = table.bbox.bottom_y;
10029
10030 for idx in table_idx + 1..doc.kids.len() {
10031 let element = &doc.kids[idx];
10032 if element.page_number() != table_page {
10033 break;
10034 }
10035 if !is_geometric_text_candidate(element) {
10036 break;
10037 }
10038 if looks_like_margin_page_number(doc, element, &extract_element_text(element)) {
10039 break;
10040 }
10041
10042 let bbox = element.bbox();
10043 let gap = table_bottom - bbox.top_y;
10044 if !(-6.0..=28.0).contains(&gap) {
10045 break;
10046 }
10047 indices.push(idx);
10048 if indices.len() >= 4 {
10049 break;
10050 }
10051 }
10052
10053 indices
10054}
10055
10056fn is_geometric_text_candidate(element: &ContentElement) -> bool {
10057 matches!(
10058 element,
10059 ContentElement::Paragraph(_)
10060 | ContentElement::Heading(_)
10061 | ContentElement::NumberHeading(_)
10062 | ContentElement::TextBlock(_)
10063 | ContentElement::TextLine(_)
10064 )
10065}
10066
10067fn infer_left_stub_requirement(
10068 doc: &PdfDocument,
10069 candidate_indices: &[usize],
10070 table_rows: &[Vec<String>],
10071 column_ranges: &[(f64, f64)],
10072) -> bool {
10073 if column_ranges.is_empty() {
10074 return false;
10075 }
10076
10077 let first_width = (column_ranges[0].1 - column_ranges[0].0).max(1.0);
10078 let has_left_label = candidate_indices.iter().any(|idx| {
10079 let bbox = doc.kids[*idx].bbox();
10080 bbox.right_x <= column_ranges[0].0 + first_width * 0.12
10081 && bbox.width() <= first_width * 0.45
10082 });
10083 if !has_left_label {
10084 return false;
10085 }
10086
10087 let mut first_col_word_counts: Vec<usize> = table_rows
10088 .iter()
10089 .filter_map(|row| row.first())
10090 .map(|cell| cell.split_whitespace().count())
10091 .collect();
10092 if first_col_word_counts.is_empty() {
10093 return false;
10094 }
10095 first_col_word_counts.sort_unstable();
10096 let median = first_col_word_counts[first_col_word_counts.len() / 2];
10097 median >= 5
10098}
10099
10100fn supports_embedded_stub_header(
10101 table_rows: &[Vec<String>],
10102 column_ranges: &[(f64, f64)],
10103 doc: &PdfDocument,
10104 candidate_indices: &[usize],
10105) -> bool {
10106 if table_rows.len() < 2 || column_ranges.len() < 3 {
10107 return false;
10108 }
10109
10110 let first_row = &table_rows[0];
10111 if first_row.len() != column_ranges.len() || first_row[0].trim().is_empty() {
10112 return false;
10113 }
10114 if first_row[0].split_whitespace().count() > 3 || first_row[0].trim().len() > 24 {
10115 return false;
10116 }
10117
10118 let data_fill = first_row
10119 .iter()
10120 .skip(1)
10121 .filter(|cell| !cell.trim().is_empty())
10122 .count();
10123 if data_fill + 1 < column_ranges.len() {
10124 return false;
10125 }
10126
10127 let labeled_rows = table_rows
10128 .iter()
10129 .skip(1)
10130 .filter(|row| row.first().is_some_and(|cell| !cell.trim().is_empty()))
10131 .count();
10132 if labeled_rows == 0 {
10133 return false;
10134 }
10135
10136 let slot_ranges = column_ranges.to_vec();
10137 let header_rows = reconstruct_aligned_rows(doc, candidate_indices, &slot_ranges, true, 2);
10138 header_rows.iter().any(|row| {
10139 row.first().is_none_or(|cell| cell.trim().is_empty())
10140 && row
10141 .iter()
10142 .skip(1)
10143 .filter(|cell| !cell.trim().is_empty())
10144 .count()
10145 >= column_ranges.len().saturating_sub(1)
10146 })
10147}
10148
10149fn slot_ranges(
10150 column_ranges: &[(f64, f64)],
10151 doc: &PdfDocument,
10152 candidate_indices: &[usize],
10153 needs_stub: bool,
10154) -> Option<Vec<(f64, f64)>> {
10155 let mut slots = Vec::new();
10156 if needs_stub {
10157 let first_left = column_ranges.first()?.0;
10158 let left_stub_start = candidate_indices
10159 .iter()
10160 .map(|idx| doc.kids[*idx].bbox().left_x)
10161 .fold(first_left, f64::min);
10162 let stub_right = first_left - 1.0;
10163 if stub_right <= left_stub_start {
10164 return None;
10165 }
10166 slots.push((left_stub_start, stub_right));
10167 }
10168 slots.extend(column_ranges.iter().copied());
10169 Some(slots)
10170}
10171
10172fn reconstruct_aligned_rows(
10173 doc: &PdfDocument,
10174 candidate_indices: &[usize],
10175 slot_ranges: &[(f64, f64)],
10176 drop_wide_singletons: bool,
10177 min_filled_slots: usize,
10178) -> Vec<Vec<String>> {
10179 if candidate_indices.is_empty() || slot_ranges.is_empty() {
10180 return Vec::new();
10181 }
10182
10183 let mut row_bands: Vec<(BoundingBox, Vec<String>)> = Vec::new();
10184
10185 for idx in candidate_indices {
10186 for line in extract_chunk_lines(&doc.kids[*idx]) {
10187 let fragments = split_line_into_slot_fragments(&line, slot_ranges);
10188 if fragments.is_empty() {
10189 continue;
10190 }
10191
10192 if drop_wide_singletons && fragments.len() == 1 {
10193 let only = &fragments[0];
10194 let span_width = only.bbox.width();
10195 let table_width =
10196 slot_ranges.last().map(|(_, right)| *right).unwrap_or(0.0) - slot_ranges[0].0;
10197 if span_width >= table_width * 0.55 {
10198 continue;
10199 }
10200 }
10201
10202 let line_center = line.bbox.center_y();
10203 let tolerance = line
10204 .chunks
10205 .iter()
10206 .map(|chunk| chunk.font_size)
10207 .fold(8.0, f64::max)
10208 * 0.8;
10209
10210 let mut target_row = None;
10211 for (row_idx, (bbox, _)) in row_bands.iter().enumerate() {
10212 if (bbox.center_y() - line_center).abs() <= tolerance {
10213 target_row = Some(row_idx);
10214 break;
10215 }
10216 }
10217
10218 if let Some(row_idx) = target_row {
10219 let (bbox, cells) = &mut row_bands[row_idx];
10220 *bbox = bbox.union(&line.bbox);
10221 for fragment in fragments {
10222 append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
10223 }
10224 } else {
10225 let mut cells = vec![String::new(); slot_ranges.len()];
10226 for fragment in fragments {
10227 append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
10228 }
10229 row_bands.push((line.bbox.clone(), cells));
10230 }
10231 }
10232 }
10233
10234 row_bands.sort_by(|left, right| {
10235 right
10236 .0
10237 .top_y
10238 .partial_cmp(&left.0.top_y)
10239 .unwrap_or(std::cmp::Ordering::Equal)
10240 });
10241
10242 row_bands
10243 .into_iter()
10244 .map(|(_, cells)| cells)
10245 .filter(|cells| {
10246 let filled = cells.iter().filter(|cell| !cell.trim().is_empty()).count();
10247 filled >= min_filled_slots
10248 })
10249 .collect()
10250}
10251
10252fn extract_chunk_lines(element: &ContentElement) -> Vec<ChunkLine> {
10253 match element {
10254 ContentElement::Paragraph(p) => chunk_lines_from_semantic_node(&p.base),
10255 ContentElement::Heading(h) => chunk_lines_from_semantic_node(&h.base.base),
10256 ContentElement::NumberHeading(nh) => chunk_lines_from_semantic_node(&nh.base.base.base),
10257 ContentElement::TextBlock(tb) => tb
10258 .text_lines
10259 .iter()
10260 .map(|line| ChunkLine {
10261 bbox: line.bbox.clone(),
10262 chunks: line.text_chunks.clone(),
10263 })
10264 .collect(),
10265 ContentElement::TextLine(tl) => vec![ChunkLine {
10266 bbox: tl.bbox.clone(),
10267 chunks: tl.text_chunks.clone(),
10268 }],
10269 _ => Vec::new(),
10270 }
10271}
10272
10273fn chunk_lines_from_semantic_node(node: &SemanticTextNode) -> Vec<ChunkLine> {
10274 let mut lines = Vec::new();
10275 for column in &node.columns {
10276 for block in &column.text_blocks {
10277 for line in &block.text_lines {
10278 lines.push(ChunkLine {
10279 bbox: line.bbox.clone(),
10280 chunks: line.text_chunks.clone(),
10281 });
10282 }
10283 }
10284 }
10285 lines
10286}
10287
10288fn split_line_into_slot_fragments(
10289 line: &ChunkLine,
10290 slot_ranges: &[(f64, f64)],
10291) -> Vec<SlotFragment> {
10292 let mut groups: Vec<(usize, Vec<TextChunk>, BoundingBox)> = Vec::new();
10293
10294 for chunk in line
10295 .chunks
10296 .iter()
10297 .filter(|chunk| !chunk.value.trim().is_empty())
10298 .cloned()
10299 {
10300 let slot_idx = assign_chunk_to_slot(&chunk.bbox, slot_ranges);
10301 if let Some((prev_slot, prev_chunks, prev_bbox)) = groups.last_mut() {
10302 let gap = chunk.bbox.left_x - prev_bbox.right_x;
10303 if *prev_slot == slot_idx && gap <= chunk.font_size.max(6.0) * 2.4 {
10304 *prev_bbox = prev_bbox.union(&chunk.bbox);
10305 prev_chunks.push(chunk);
10306 continue;
10307 }
10308 }
10309 groups.push((slot_idx, vec![chunk.clone()], chunk.bbox.clone()));
10310 }
10311
10312 groups
10313 .into_iter()
10314 .filter_map(|(slot_idx, chunks, bbox)| {
10315 let text = normalize_common_ocr_text(
10316 &crate::models::text::TextLine::concatenate_chunks(&chunks),
10317 );
10318 if text.trim().is_empty() {
10319 None
10320 } else {
10321 Some(SlotFragment {
10322 slot_idx,
10323 bbox,
10324 text,
10325 })
10326 }
10327 })
10328 .collect()
10329}
10330
10331fn assign_chunk_to_slot(bbox: &BoundingBox, slot_ranges: &[(f64, f64)]) -> usize {
10332 let mut best_idx = 0usize;
10333 let mut best_overlap = f64::NEG_INFINITY;
10334 let center_x = bbox.center_x();
10335
10336 for (idx, (left, right)) in slot_ranges.iter().enumerate() {
10337 let overlap = (bbox.right_x.min(*right) - bbox.left_x.max(*left)).max(0.0);
10338 let score = if overlap > 0.0 {
10339 overlap / bbox.width().max(1.0)
10340 } else {
10341 -((center_x - ((*left + *right) / 2.0)).abs())
10342 };
10343 if score > best_overlap {
10344 best_overlap = score;
10345 best_idx = idx;
10346 }
10347 }
10348
10349 best_idx
10350}
10351
/// Appends the trimmed `fragment` to `cell`, separated by a single space when
/// the cell already has content. Blank fragments leave the cell untouched.
fn append_cell_text(cell: &mut String, fragment: &str) {
    let addition = fragment.trim();
    if !addition.is_empty() {
        if !cell.is_empty() {
            cell.push(' ');
        }
        cell.push_str(addition);
    }
}
10362
/// Moves a stub-column label from the second row up into the first row.
///
/// Applies only when the first row's leading cell is blank, the second row's
/// leading cell is not, and both rows have at least two non-blank cells after
/// the leading one. The label is trimmed when moved and the second row's
/// leading cell is cleared.
fn normalize_leading_stub_header(rows: &mut [Vec<String>]) {
    let [header, first_body, ..] = rows else {
        return;
    };
    if header.is_empty() || first_body.is_empty() {
        return;
    }

    let header_stub_blank = header[0].trim().is_empty();
    let body_stub_blank = first_body[0].trim().is_empty();
    if !header_stub_blank || body_stub_blank {
        return;
    }

    // Number of non-blank cells to the right of the stub column.
    let data_cells = |row: &[String]| {
        row.iter()
            .skip(1)
            .filter(|cell| !cell.trim().is_empty())
            .count()
    };
    if data_cells(header.as_slice()) < 2 || data_cells(first_body.as_slice()) < 2 {
        return;
    }

    header[0] = first_body[0].trim().to_string();
    first_body[0].clear();
}
10389
/// Copies the first body row's leading cell into a blank leading header cell.
///
/// The promotion happens only when the header's first cell is blank, the
/// candidate label is non-blank, has at most three words and at most 24 bytes,
/// and every cell after the first is non-blank in both the header row and the
/// first body row. The body row itself is not modified.
fn promote_embedded_stub_header(header_rows: &mut [Vec<String>], table_rows: &[Vec<String>]) {
    let (Some(header), Some(body)) = (header_rows.first_mut(), table_rows.first()) else {
        return;
    };
    if header.is_empty() || body.is_empty() || !header[0].trim().is_empty() {
        return;
    }

    let candidate = body[0].trim();
    let too_long = candidate.split_whitespace().count() > 3 || candidate.len() > 24;
    if candidate.is_empty() || too_long {
        return;
    }

    // True when all cells to the right of the stub column carry text.
    let fully_populated = |row: &[String]| {
        let filled = row
            .iter()
            .skip(1)
            .filter(|cell| !cell.trim().is_empty())
            .count();
        filled >= row.len().saturating_sub(1)
    };
    if !(fully_populated(header.as_slice()) && fully_populated(body.as_slice())) {
        return;
    }

    header[0] = candidate.to_string();
}
10427
/// Returns `true` when there are at least three rows and every row is
/// non-empty with no blank (whitespace-only) cell.
fn should_merge_panel_body_rows(rows: &[Vec<String>]) -> bool {
    if rows.len() < 3 {
        return false;
    }
    let has_gap = rows
        .iter()
        .any(|row| row.is_empty() || row.iter().any(|cell| cell.trim().is_empty()));
    !has_gap
}
10434
10435fn merge_panel_body_row(
10436 table_rows: &[Vec<String>],
10437 trailing_rows: &[Vec<String>],
10438 slot_count: usize,
10439) -> Vec<String> {
10440 let mut merged = vec![String::new(); slot_count];
10441 for row in table_rows {
10442 for (col_idx, cell) in row.iter().enumerate() {
10443 if col_idx + 1 >= slot_count {
10444 break;
10445 }
10446 append_cell_text(&mut merged[col_idx + 1], cell);
10447 }
10448 }
10449 for row in trailing_rows {
10450 for (col_idx, cell) in row.iter().enumerate() {
10451 if col_idx >= slot_count {
10452 break;
10453 }
10454 append_cell_text(&mut merged[col_idx], cell);
10455 }
10456 }
10457 merged
10458}
10459
/// Renders `rows` as a Markdown pipe table followed by a blank line.
///
/// The first row is treated as the header and is followed by a `---`
/// separator row. Every row is padded with empty cells up to the widest row.
/// Cell text is trimmed, and literal `|` characters are escaped as `\|` so
/// that cell content cannot be mistaken for a column delimiter.
///
/// Returns an empty string when there are no rows or all rows are empty.
fn render_pipe_rows(rows: &[Vec<String>]) -> String {
    let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
    if num_cols == 0 {
        return String::new();
    }

    let mut out = String::new();
    for (row_idx, row) in rows.iter().enumerate() {
        out.push('|');
        for col_idx in 0..num_cols {
            let cell = row.get(col_idx).map_or("", |text| text.trim());
            out.push(' ');
            for ch in cell.chars() {
                // An unescaped pipe inside a cell would split it into two columns.
                if ch == '|' {
                    out.push('\\');
                }
                out.push(ch);
            }
            out.push_str(" |");
        }
        out.push('\n');

        // The delimiter row directly follows the header row.
        if row_idx == 0 {
            out.push('|');
            for _ in 0..num_cols {
                out.push_str(" --- |");
            }
            out.push('\n');
        }
    }
    out.push('\n');
    out
}
10490
10491fn render_html_table(rows: &[Vec<String>]) -> String {
10492 if rows.is_empty() {
10493 return String::new();
10494 }
10495
10496 let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
10497 if num_cols == 0 {
10498 return String::new();
10499 }
10500
10501 let mut out = String::from("<table>\n");
10502 for (row_idx, row) in rows.iter().enumerate() {
10503 out.push_str("<tr>");
10504 for col_idx in 0..num_cols {
10505 let cell = escape_html_text(row.get(col_idx).map(String::as_str).unwrap_or("").trim());
10506 if row_idx == 0 {
10507 out.push_str("<th>");
10508 out.push_str(&cell);
10509 out.push_str("</th>");
10510 } else {
10511 out.push_str("<td>");
10512 out.push_str(&cell);
10513 out.push_str("</td>");
10514 }
10515 }
10516 out.push_str("</tr>\n");
10517 }
10518 out.push_str("</table>\n\n");
10519 out
10520}
10521
/// Escapes the five HTML-significant characters in `text` so it can be
/// embedded safely in element content or attribute values.
///
/// `&`, `<`, `>`, `"` and `'` become `&amp;`, `&lt;`, `&gt;`, `&quot;` and
/// `&#39;` respectively; all other characters are copied unchanged. The input
/// is processed in a single pass, so an ampersand introduced by one
/// replacement is never escaped a second time.
fn escape_html_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
10529
/// Extracts the ASCII digits of `text` and returns them when there are one or
/// two of them; any other digit count yields `None`.
fn normalized_numeric_marker(text: &str) -> Option<String> {
    let mut digits = String::new();
    for ch in text.chars() {
        if ch.is_ascii_digit() {
            digits.push(ch);
        }
    }
    match digits.len() {
        1 | 2 => Some(digits),
        _ => None,
    }
}
10537
10538fn render_infographic_card_rows(rows: &[Vec<String>]) -> Option<String> {
10539 if rows.is_empty() || !rows.iter().all(|row| row.len() == 2) {
10540 return None;
10541 }
10542
10543 let marker = normalized_numeric_marker(rows[0][0].trim())?;
10544 if rows[0][1].split_whitespace().count() < 4 {
10545 return None;
10546 }
10547 if rows
10548 .iter()
10549 .skip(1)
10550 .any(|row| normalized_numeric_marker(row[0].trim()).is_some())
10551 {
10552 return None;
10553 }
10554 if rows
10555 .iter()
10556 .skip(1)
10557 .any(|row| !row[0].trim().is_empty() && row[0].trim().len() > 2)
10558 {
10559 return None;
10560 }
10561
10562 let body = rows
10563 .iter()
10564 .filter_map(|row| row.get(1))
10565 .map(|cell| cell.trim())
10566 .filter(|cell| !cell.is_empty())
10567 .collect::<Vec<_>>()
10568 .join(" ");
10569 if body.split_whitespace().count() < 8 {
10570 return None;
10571 }
10572
10573 Some(format!("{marker}. {body}\n\n"))
10574}
10575
10576fn extract_element_text(element: &ContentElement) -> String {
10577 match element {
10578 ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
10579 ContentElement::Heading(h) => clean_paragraph_text(&h.base.base.value()),
10580 ContentElement::NumberHeading(nh) => clean_paragraph_text(&nh.base.base.base.value()),
10581 ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
10582 ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
10583 _ => String::new(),
10584 }
10585}
10586
10587fn collect_table_border_rows(table: &crate::models::table::TableBorder) -> Vec<Vec<String>> {
10589 let num_cols = table.num_columns.max(1);
10590 let mut rendered_rows: Vec<Vec<String>> = Vec::new();
10591 for row in &table.rows {
10592 let cell_texts: Vec<String> = (0..num_cols)
10593 .map(|col| {
10594 row.cells
10595 .iter()
10596 .find(|c| c.col_number == col)
10597 .map(cell_text_content)
10598 .unwrap_or_default()
10599 })
10600 .collect();
10601 if !cell_texts.iter().all(|t| t.trim().is_empty()) {
10602 rendered_rows.push(cell_texts);
10603 }
10604 }
10605 rendered_rows
10606}
10607
10608fn render_table_border(out: &mut String, table: &crate::models::table::TableBorder) {
10614 if table.rows.is_empty() {
10615 return;
10616 }
10617
10618 let mut rendered_rows = collect_table_border_rows(table);
10620
10621 if rendered_rows.is_empty() {
10622 return;
10623 }
10624
10625 if let Some(rendered) = render_infographic_card_rows(&rendered_rows) {
10626 out.push_str(&rendered);
10627 return;
10628 }
10629
10630 merge_continuation_rows(&mut rendered_rows);
10632 trim_leading_table_carryover_rows(&mut rendered_rows);
10633
10634 if is_toc_table(&rendered_rows) {
10636 render_toc_rows(out, &rendered_rows);
10637 return;
10638 }
10639
10640 out.push_str(&render_pipe_rows(&rendered_rows));
10641}
10642
/// Returns `true` when the trimmed text looks like a page number: either up to
/// five ASCII digits, or up to ten characters drawn (case-insensitively) from
/// the Roman-numeral letters `i v x l c d m`. Blank text is never a match.
fn is_page_number_like(text: &str) -> bool {
    let candidate = text.trim();
    if candidate.is_empty() {
        return false;
    }

    let is_arabic = candidate.len() <= 5 && candidate.chars().all(|ch| ch.is_ascii_digit());
    let is_roman = candidate.len() <= 10
        && candidate.chars().all(|ch| {
            matches!(
                ch.to_ascii_lowercase(),
                'i' | 'v' | 'x' | 'l' | 'c' | 'd' | 'm'
            )
        });

    is_arabic || is_roman
}
10660
10661fn is_toc_table(rows: &[Vec<String>]) -> bool {
10664 if rows.is_empty() {
10665 return false;
10666 }
10667 if rows.len() < 2 {
10669 return false;
10670 }
10671 if !rows.iter().all(|r| r.len() == 2) {
10673 return false;
10674 }
10675
10676 let non_empty_right = rows.iter().filter(|r| !r[1].trim().is_empty()).count();
10677 if non_empty_right < 2 {
10678 return false;
10679 }
10680
10681 let page_like = rows.iter().filter(|r| is_page_number_like(&r[1])).count();
10682 page_like >= 2 && page_like * 10 >= non_empty_right * 9 && page_like * 2 >= rows.len()
10683}
10684
/// Writes table-of-contents rows to `out`, one `title page` line per row,
/// followed by a blank line.
///
/// Rows with both cells blank are skipped; a row with only one non-blank cell
/// is written without a separating space. Each row is expected to have at
/// least two cells (indexing panics otherwise).
fn render_toc_rows(out: &mut String, rows: &[Vec<String>]) {
    for row in rows {
        let (title, page) = (row[0].trim(), row[1].trim());
        match (title.is_empty(), page.is_empty()) {
            (true, true) => continue,
            (false, false) => {
                out.push_str(title);
                out.push(' ');
                out.push_str(page);
            }
            (false, true) => out.push_str(title),
            (true, false) => out.push_str(page),
        }
        out.push('\n');
    }
    out.push('\n');
}
10705
10706fn cell_text_content(cell: &crate::models::table::TableBorderCell) -> String {
10708 if !cell.content.is_empty() {
10712 let chunks: Vec<_> = cell.content.iter().map(|t| t.base.clone()).collect();
10713 return normalize_common_ocr_text(&crate::models::text::TextLine::concatenate_chunks(
10714 &chunks,
10715 ));
10716 }
10717 let mut text = String::new();
10719 for elem in &cell.contents {
10720 match elem {
10721 ContentElement::Paragraph(p) => text.push_str(&p.base.value()),
10722 ContentElement::TextBlock(tb) => text.push_str(&tb.value()),
10723 ContentElement::TextLine(tl) => text.push_str(&tl.value()),
10724 ContentElement::TextChunk(tc) => text.push_str(&tc.value),
10725 _ => {}
10726 }
10727 }
10728 normalize_common_ocr_text(&repair_fragmented_words(&text))
10729}
10730
/// Merges consecutive Markdown pipe tables that appear to be pieces of one table.
///
/// The input is scanned for table blocks (a pipe row directly followed by a
/// separator row, plus the pipe rows after it). A block is merged into the
/// group of the block before it when the lines between them are all blank, are
/// one or two short `#` headings, or are a single short text fragment, and the
/// later block does not start with a clearly different header row.
///
/// In the output, blank gap lines and the merged block's separator row are
/// dropped, non-blank gap lines become a pipe row with their text in the first
/// cell, and rows are padded to the widest column count of their group.
///
/// Inputs with fewer than four lines or fewer than two table blocks are
/// returned unchanged. Otherwise the result is rebuilt line by line and every
/// emitted line ends with `\n`.
fn merge_adjacent_pipe_tables(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    // Two tables need at least a header and a separator row each.
    if lines.len() < 4 {
        return markdown.to_string();
    }

    /// Number of cells in a pipe row; 0 when the line is not pipe-delimited on both ends.
    fn count_pipe_cols(line: &str) -> usize {
        let t = line.trim();
        if !t.starts_with('|') || !t.ends_with('|') {
            return 0;
        }
        // Splitting yields one empty piece before the first and after the last pipe.
        t.split('|').count().saturating_sub(2)
    }

    /// True for a delimiter row: every cell is non-empty and made only of `-` and `:`.
    fn is_separator(line: &str) -> bool {
        let t = line.trim();
        if !t.starts_with('|') || !t.ends_with('|') {
            return false;
        }
        let cells: Vec<&str> = t.split('|').collect();
        if cells.len() < 3 {
            return false;
        }
        cells[1..cells.len() - 1].iter().all(|c| {
            let s = c.trim();
            !s.is_empty() && s.chars().all(|ch| ch == '-' || ch == ':')
        })
    }

    /// True when the trimmed line starts and ends with `|` and is longer than two bytes.
    fn is_pipe_row(line: &str) -> bool {
        let t = line.trim();
        t.starts_with('|') && t.ends_with('|') && t.len() > 2
    }

    /// Trimmed cell texts of a pipe row; empty when the line is not a pipe row.
    fn pipe_cells(line: &str) -> Vec<String> {
        let t = line.trim();
        if !is_pipe_row(t) {
            return Vec::new();
        }
        let parts = t.split('|').collect::<Vec<_>>();
        parts[1..parts.len() - 1]
            .iter()
            .map(|cell| cell.trim().to_string())
            .collect()
    }

    /// Lowercased alphanumeric characters of a header cell, used for comparisons.
    fn normalize_header_cell(cell: &str) -> String {
        cell.chars()
            .filter(|ch| ch.is_alphanumeric())
            .flat_map(|ch| ch.to_lowercase())
            .collect()
    }

    /// Heuristic: at least two non-empty cells, each containing a letter, at most
    /// four words and at most 28 bytes long.
    fn looks_like_header_row(line: &str) -> bool {
        let cells = pipe_cells(line);
        if cells.len() < 2 {
            return false;
        }

        let non_empty = cells
            .iter()
            .filter(|cell| !cell.trim().is_empty())
            .collect::<Vec<_>>();
        if non_empty.len() < 2 {
            return false;
        }

        let headerish = non_empty.iter().all(|cell| {
            let trimmed = cell.trim();
            let word_count = trimmed.split_whitespace().count();
            let has_alpha = trimmed.chars().any(|ch| ch.is_alphabetic());
            has_alpha && word_count <= 4 && trimmed.len() <= 28
        });
        headerish
    }

    /// Fraction of positions (over the shorter row) whose normalized cells are
    /// both non-empty and equal; 0.0 when either row has no cells.
    fn header_overlap_ratio(left: &str, right: &str) -> f64 {
        let left_cells = pipe_cells(left)
            .into_iter()
            .map(|cell| normalize_header_cell(&cell))
            .collect::<Vec<_>>();
        let right_cells = pipe_cells(right)
            .into_iter()
            .map(|cell| normalize_header_cell(&cell))
            .collect::<Vec<_>>();
        let width = left_cells.len().min(right_cells.len());
        if width == 0 {
            return 0.0;
        }

        let matches = (0..width)
            .filter(|idx| {
                !left_cells[*idx].is_empty()
                    && !right_cells[*idx].is_empty()
                    && left_cells[*idx] == right_cells[*idx]
            })
            .count();
        matches as f64 / width as f64
    }

    /// True when both rows have the same cell count (at least two), no position
    /// with two non-empty normalized cells differs, and at least two positions
    /// have both cells non-empty.
    fn header_schema_matches(left: &str, right: &str) -> bool {
        let left_cells = pipe_cells(left)
            .into_iter()
            .map(|cell| normalize_header_cell(&cell))
            .collect::<Vec<_>>();
        let right_cells = pipe_cells(right)
            .into_iter()
            .map(|cell| normalize_header_cell(&cell))
            .collect::<Vec<_>>();
        if left_cells.len() != right_cells.len() || left_cells.len() < 2 {
            return false;
        }

        let mut aligned_non_empty = 0usize;
        for (left, right) in left_cells.iter().zip(right_cells.iter()) {
            // Positions where either side is empty are neither a match nor a mismatch.
            if left.is_empty() || right.is_empty() {
                continue;
            }
            aligned_non_empty += 1;
            if left != right {
                return false;
            }
        }

        aligned_non_empty >= 2
    }

    /// Trims the row and appends empty cells until it has `target_cols` cells.
    /// Rows that are already wide enough are returned trimmed but otherwise unchanged.
    fn pad_pipe_row(line: &str, target_cols: usize) -> String {
        let t = line.trim();
        let current_cols = count_pipe_cols(t);
        if current_cols >= target_cols {
            return t.to_string();
        }
        let mut result = t.to_string();
        for _ in current_cols..target_cols {
            result.push_str(" |");
        }
        result
    }

    // One detected table, expressed as line indices into `lines`.
    struct Block {
        // Index of the header row.
        start: usize,
        // Index of the separator row (always `start + 1`).
        sep: usize,
        // Index of the last row of the block (equals `sep` when there are no body rows).
        end: usize,
        // Cell count of the header row.
        cols: usize,
    }

    // Pass 1: locate table blocks.
    let mut blocks: Vec<Block> = Vec::new();
    let mut i = 0;
    while i < lines.len() {
        if i + 1 < lines.len() && is_pipe_row(lines[i]) && is_separator(lines[i + 1]) {
            let cols = count_pipe_cols(lines[i]);
            let sep = i + 1;
            let mut end = sep;
            let mut j = sep + 1;
            // Body rows run until the first non-pipe line or the next separator row.
            while j < lines.len() && is_pipe_row(lines[j]) && !is_separator(lines[j]) {
                end = j;
                j += 1;
            }
            blocks.push(Block {
                start: i,
                sep,
                end,
                cols,
            });
            i = end + 1;
        } else {
            i += 1;
        }
    }

    if blocks.len() < 2 {
        return markdown.to_string();
    }

    // Pass 2: decide which blocks join the group of the block before them.
    // `merge_leader[bi]` is the index of the first block of the group `bi` was merged
    // into, or `None` when `bi` starts its own group.
    let mut merge_leader: Vec<Option<usize>> = vec![None; blocks.len()];
    // Widest column count per group, stored at the leader's index.
    let mut group_cols: Vec<usize> = blocks.iter().map(|b| b.cols).collect();
    for bi in 1..blocks.len() {
        let prev = &blocks[bi - 1];
        let curr = &blocks[bi];
        let gap_range = prev.end + 1..curr.start;
        // An empty gap also counts as "all blank".
        let gap_all_blank = gap_range.clone().all(|li| lines[li].trim().is_empty());
        let leader_idx = merge_leader[bi - 1].unwrap_or(bi - 1);
        // Use the width the previous group has grown to, not just the previous block's.
        let effective_prev_cols = group_cols[leader_idx];
        // Gap holds only one or two `#` heading lines, each shorter than 100 bytes.
        let gap_heading_only = if !gap_all_blank && effective_prev_cols >= 2 && curr.cols >= 2 {
            let non_blank: Vec<usize> = gap_range
                .clone()
                .filter(|li| !lines[*li].trim().is_empty())
                .collect();
            !non_blank.is_empty()
                && non_blank.len() <= 2
                && non_blank.iter().all(|li| {
                    let t = lines[*li].trim();
                    t.starts_with('#') && t.len() < 100
                })
        } else {
            false
        };
        // Gap holds exactly one short line that is not a heading, not a `-`/`*`
        // line, and contains neither `:` nor the word "TABLE".
        let gap_short_fragment =
            if !gap_all_blank && !gap_heading_only && effective_prev_cols >= 2 && curr.cols >= 2 {
                let non_blank: Vec<usize> = gap_range
                    .clone()
                    .filter(|li| !lines[*li].trim().is_empty())
                    .collect();
                non_blank.len() == 1 && {
                    let t = lines[non_blank[0]].trim();
                    t.len() < 30
                        && !t.starts_with('#')
                        && !t.starts_with('-')
                        && !t.starts_with('*')
                        && !t.contains(':')
                        && !t.contains("TABLE")
                }
            } else {
                false
            };
        let prev_has_header = looks_like_header_row(lines[prev.start]);
        // The current block's first row only counts as a header when the block has
        // at least two body rows.
        let curr_has_header = curr.end >= curr.sep + 2 && looks_like_header_row(lines[curr.start]);
        // Both blocks have header-like first rows that disagree: treat as separate tables.
        let curr_has_distinct_header = prev_has_header
            && curr_has_header
            && !header_schema_matches(lines[prev.start], lines[curr.start])
            && (curr.cols != prev.cols
                || header_overlap_ratio(lines[prev.start], lines[curr.start]) < 1.0);

        if (gap_all_blank || gap_heading_only || gap_short_fragment)
            && prev.cols > 0
            && curr.cols > 0
            && !curr_has_distinct_header
        {
            merge_leader[bi] = Some(leader_idx);
            if curr.cols > group_cols[leader_idx] {
                group_cols[leader_idx] = curr.cols;
            }
        }
    }

    // Column count every block's rows are padded to (its group's widest block).
    let mut pad_target: Vec<usize> = vec![0; blocks.len()];
    for bi in 0..blocks.len() {
        let leader = merge_leader[bi].unwrap_or(bi);
        pad_target[bi] = group_cols[leader];
    }

    // Pass 3: mark gap lines of merged blocks. Blank gap lines and the merged
    // block's separator are dropped; non-blank gap lines become pipe rows.
    let mut skip = vec![false; lines.len()];
    let mut convert_to_pipe_row = vec![false; lines.len()];
    for (bi, leader) in merge_leader.iter().enumerate() {
        if leader.is_none() {
            continue;
        }
        // `bi >= 1` here: only blocks after the first can have a leader.
        let prev_end = blocks[bi - 1].end;
        let curr = &blocks[bi];
        for li in (prev_end + 1)..curr.start {
            if lines[li].trim().is_empty() {
                skip[li] = true;
            } else {
                convert_to_pipe_row[li] = true;
            }
        }
        skip[curr.sep] = true;
    }

    // Map every table line to its block so the right padding width can be looked up.
    let mut line_to_block: Vec<Option<usize>> = vec![None; lines.len()];
    for (bi, block) in blocks.iter().enumerate() {
        line_to_block[block.start..=block.end].fill(Some(bi));
    }
    // Converted gap lines are attributed to the block that precedes the gap.
    for (bi, leader) in merge_leader.iter().enumerate() {
        if leader.is_none() {
            continue;
        }
        let prev_end = blocks[bi - 1].end;
        let curr = &blocks[bi];
        for li in (prev_end + 1)..curr.start {
            if convert_to_pipe_row[li] {
                line_to_block[li] = Some(bi - 1);
            }
        }
    }

    // Pass 4: emit the rewritten document.
    let mut result = String::new();
    for (li, line) in lines.iter().enumerate() {
        if skip[li] {
            continue;
        }
        if convert_to_pipe_row[li] {
            // Strip heading markers so only the text ends up in the first cell.
            let text = line.trim().trim_start_matches('#').trim();
            if let Some(bi) = line_to_block[li] {
                let target = pad_target[bi];
                if target > 0 && !text.is_empty() {
                    result.push_str(&format!("| {} ", text));
                    for _ in 1..target {
                        result.push_str("| ");
                    }
                    result.push_str("|\n");
                    continue;
                }
            }
            // Nothing usable to convert: keep the original line.
            result.push_str(line);
            result.push('\n');
            continue;
        }
        if let Some(bi) = line_to_block[li] {
            let target = pad_target[bi];
            if target > 0 && is_pipe_row(line) && !is_separator(line) {
                result.push_str(&pad_pipe_row(line, target));
                result.push('\n');
            } else if target > 0 && is_separator(line) {
                // NOTE(review): separators are regenerated as plain `---` cells, so any
                // alignment colons in the original separator are not preserved.
                result.push('|');
                for _ in 0..target {
                    result.push_str(" --- |");
                }
                result.push('\n');
            } else {
                result.push_str(line);
                result.push('\n');
            }
        } else {
            result.push_str(line);
            result.push('\n');
        }
    }

    result
}
11084
11085#[cfg(test)]
11086mod tests {
11087 use super::*;
11088 use crate::models::bbox::BoundingBox;
11089 use crate::models::chunks::TextChunk;
11090 use crate::models::content::ContentElement;
11091 use crate::models::enums::{PdfLayer, TextFormat, TextType};
11092 use crate::models::list::{ListBody, ListItem, ListLabel, PDFList};
11093 use crate::models::semantic::{SemanticHeading, SemanticParagraph, SemanticTextNode};
11094 use crate::models::table::{
11095 TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
11096 };
11097 use crate::models::text::{TextBlock, TextColumn, TextLine};
11098
11099 #[test]
11100 fn test_empty_doc() {
11101 let doc = PdfDocument::new("test.pdf".to_string());
11102 let md = to_markdown(&doc).unwrap();
11103 assert!(md.contains("No content extracted"));
11104 }
11105
11106 #[test]
11107 fn test_with_title() {
11108 let mut doc = PdfDocument::new("test.pdf".to_string());
11109 doc.title = Some("My Title".to_string());
11110 let md = to_markdown(&doc).unwrap();
11111 assert!(md.starts_with("# My Title\n"));
11112 }
11113
11114 #[test]
11115 fn test_empty_title_not_rendered() {
11116 let mut doc = PdfDocument::new("test.pdf".to_string());
11117 doc.title = Some(" ".to_string());
11118 let md = to_markdown(&doc).unwrap();
11119 assert!(
11120 !md.contains("# "),
11121 "Empty/whitespace title should not produce a heading"
11122 );
11123 }
11124
11125 #[test]
11126 fn test_repair_fragmented_words() {
11127 assert_eq!(
11128 repair_fragmented_words("Jurisdic tion Fore ign Req uire me nts"),
11129 "Jurisdiction Foreign Requirements"
11130 );
11131 }
11132
11133 #[test]
11134 fn test_normalize_common_ocr_text_repairs_units() {
11135 assert_eq!(
11136 normalize_common_ocr_text("10 ߤL at 37 C and -20 oC"),
11137 "10 μL at 37°C and -20°C"
11138 );
11139 }
11140
11141 #[cfg(not(target_arch = "wasm32"))]
11142 #[test]
11143 fn test_build_layout_anchor_rows_reconstructs_four_column_matrix() {
11144 let lines = vec![
11145 "Key Functions by Main Service Flow".to_string(),
11146 "".to_string(),
11147 " Service Stage Function Name Explanation Expected Benefit".to_string(),
11148 "".to_string(),
11149 " 1. Project creation Project creation and Select document type to automatically run project creation, Pipeline configuration with The intuitive UI environment allows the the person in charge to quickly proceed with".to_string(),
11150 "".to_string(),
11151 " management recommended Modelset and Endpoint deployment the entire process from project creation to deployment, improving work efficiency".to_string(),
11152 "".to_string(),
11153 " Conveniently manage raw data to be used for OCR Pack and actual date from live".to_string(),
11154 " 2. Data labeling and Data storage management Provides convenient functions for uploading raw data, viewer, and data management".to_string(),
11155 " (search using image metadata, sorting, filtering, hashtags settings on image data) service".to_string(),
11156 " fine-tuning".to_string(),
11157 " Image data bookmark for Qualitative Evaluation".to_string(),
11158 "".to_string(),
11159 " Create and manage Labeling Creating a Labeling Space to manage raw data annotation, managing labeling resources Labeling work can be outsourced within the pack. Labeled data is continuously".to_string(),
11160 " (Ontology, Characters to be Recognized), data set dump, data set version management supplied from which data sets can be created with ease. The Auto Labeling function".to_string(),
11161 " Space".to_string(),
11162 " 3 increases both efficiency and convenience.".to_string(),
11163 " Various basic models for each selected 5".to_string(),
11164 " document, information comparison between".to_string(),
11165 " Model training Providing a foundation for customers to implement, manage, and upgrade their own".to_string(),
11166 " models, basic model training, training pause function, re-training, cancel function, and OCR model specialized to the customers’ needs".to_string(),
11167 " configuration support for Characters to be Recognized and Ontology that is frequently".to_string(),
11168 " modified while developing specialized models".to_string(),
11169 ];
11170
11171 let header = find_layout_header_candidate(&lines).unwrap();
11172 let rows =
11173 build_layout_anchor_rows(&lines, &extract_layout_entries(&lines, &header)).unwrap();
11174
11175 assert_eq!(
11176 header.headers,
11177 vec![
11178 "Service Stage".to_string(),
11179 "Function Name".to_string(),
11180 "Explanation".to_string(),
11181 "Expected Benefit".to_string()
11182 ]
11183 );
11184 assert_eq!(rows.len(), 4);
11185 assert_eq!(rows[0][0], "1. Project creation");
11186 assert_eq!(rows[0][1], "Project creation and management");
11187 assert!(rows[1][0].contains("fine-tuning"));
11188 assert_eq!(rows[2][1], "Create and manage Labeling Space");
11189 assert_eq!(rows[3][1], "Model training");
11190 assert!(rows[3][2].contains("Various basic models for each selected document"));
11191 }
11192
11193 #[cfg(not(target_arch = "wasm32"))]
11194 #[test]
11195 fn test_build_layout_panel_stub_rows_reconstructs_left_stub_table() {
11196 let lines = vec![
11197 "AI Pack".to_string(),
11198 "Upstage offers 3 AI packs that process unstructured information and data".to_string(),
11199 "".to_string(),
11200 " OCR Recommendation Product semantic search".to_string(),
11201 "".to_string(),
11202 " A solution that recognizes characters in an A solution that recommends the best products and A solution that enables semantic search, analyzes and".to_string(),
11203 " image and extracts necessary information contents organizes key information in unstructured text data".to_string(),
11204 " Pack".to_string(),
11205 " into a standardized form (DB)".to_string(),
11206 "".to_string(),
11207 " Applicable to all fields that require text extraction Applicable to all fields that use any form of Applicable to all fields that deal with various types of".to_string(),
11208 " from standardized documents, such as receipts, recommendation including alternative products, unstructured data containing text information that".to_string(),
11209 "Application bills, credit cards, ID cards, certificates, and medical products and contents that are likely to be require semantic search and conversion into a DB".to_string(),
11210 " receipts purchased next".to_string(),
11211 "".to_string(),
11212 " Achieved 1st place in the OCR World Competition Team with specialists and technologies that Creation of the first natural language evaluation".to_string(),
11213 " The team includes specialists who have received Kaggle’s Gold Medal recommendation system in Korean (KLUE)".to_string(),
11214 " presented 14 papers in the world’s most (Education platform) World’s No.1 in Kaggle text embedding competition in".to_string(),
11215 " Highlight".to_string(),
11216 " renowned AI conferences Proven superior performance of more than 170% E-commerce subject (Shopee)".to_string(),
11217 " compared to other global top-tier recommendation".to_string(),
11218 " models".to_string(),
11219 ];
11220
11221 let header = find_layout_panel_header_candidate(&lines).unwrap();
11222 let rows = build_layout_panel_stub_rows(&lines, &header).unwrap();
11223
11224 assert_eq!(
11225 header.headers,
11226 vec![
11227 "OCR".to_string(),
11228 "Recommendation".to_string(),
11229 "Product semantic search".to_string()
11230 ]
11231 );
11232 assert_eq!(rows.len(), 3);
11233 assert_eq!(rows[0][0], "Pack");
11234 assert!(rows[0][1].contains("image and extracts necessary information"));
11235 assert_eq!(rows[1][0], "Application");
11236 assert!(rows[1][3].contains("require semantic search and conversion into a DB"));
11237 assert_eq!(rows[2][0], "Highlight");
11238 assert!(rows[2][2].contains("top-tier recommendation models"));
11239 }
11240
/// A table-of-contents entry that wraps onto a second physical line
/// ("... and Election 18" followed by "Campaign") must come back as one entry.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_extract_layout_toc_entries_merges_wrapped_entry() {
    let layout_lines: Vec<String> = [
        "Table of Contents",
        "",
        "Executive Summary 4",
        "Legal Framework 6",
        "Election Administration 11",
        "Civil Society Engagement 15",
        "Political Parties, Candidates Registration and Election 18",
        "Campaign",
        "Media Freedom and Access to Information 25",
        "Voter Education and Awareness 29",
        "Participation of Marginalized Sectors 31",
        "Recommendations 39",
    ]
    .iter()
    .map(|raw| raw.to_string())
    .collect();

    let (toc_title, toc_entries) = extract_layout_toc_entries(&layout_lines).unwrap();
    assert_eq!(toc_title, "Table of Contents");
    // Ten entry-bearing lines collapse to nine entries once the wrap is merged.
    assert_eq!(toc_entries.len(), 9);

    let first = &toc_entries[0];
    assert_eq!(first.title, "Executive Summary");
    assert_eq!(first.page, "4");

    let wrapped = &toc_entries[4];
    assert_eq!(
        wrapped.title,
        "Political Parties, Candidates Registration and Election Campaign"
    );
    assert_eq!(wrapped.page, "18");
}
11270
11271 #[cfg(not(target_arch = "wasm32"))]
11272 fn make_bbox_layout_line(words: &[(&str, f64, f64)], bottom: f64, top: f64) -> BBoxLayoutLine {
11273 make_bbox_layout_line_in_block(0, words, bottom, top)
11274 }
11275
11276 #[cfg(not(target_arch = "wasm32"))]
11277 fn make_bbox_layout_line_in_block(
11278 block_id: usize,
11279 words: &[(&str, f64, f64)],
11280 bottom: f64,
11281 top: f64,
11282 ) -> BBoxLayoutLine {
11283 BBoxLayoutLine {
11284 block_id,
11285 bbox: BoundingBox::new(
11286 Some(1),
11287 words.first().map(|(_, left, _)| *left).unwrap_or(72.0),
11288 bottom,
11289 words.last().map(|(_, _, right)| *right).unwrap_or(320.0),
11290 top,
11291 ),
11292 words: words
11293 .iter()
11294 .map(|(text, left, right)| BBoxLayoutWord {
11295 bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
11296 text: (*text).to_string(),
11297 })
11298 .collect(),
11299 }
11300 }
11301
/// Feeds `detect_layout_open_plate` a heading, four two-column species rows and
/// a caption, and checks the heading, header row, row count, one row's cells
/// and the caption prefix of the detected plate.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_detect_layout_open_plate_recovers_two_column_species_rows() {
    // Builds the owned cell vectors the plate fields are compared against.
    let owned = |cells: &[&str]| -> Vec<String> {
        cells.iter().map(|cell| cell.to_string()).collect()
    };

    // Heading line at the top of the plate.
    let heading_line = make_bbox_layout_line(
        &[
            ("Fish", 60.0, 76.0),
            ("species", 78.0, 107.0),
            ("on", 109.0, 119.0),
            ("IUCN", 121.0, 142.0),
            ("Red", 144.0, 159.0),
            ("List", 161.0, 176.0),
        ],
        649.0,
        660.0,
    );

    // Species rows: rows 1, 3 and 4 arrive as two separate lines sharing one
    // vertical band, row 2 arrives as a single line holding both columns.
    let species_lines = vec![
        make_bbox_layout_line(
            &[("Potosi", 60.0, 84.0), ("Pupfish", 86.0, 114.0)],
            632.0,
            643.0,
        ),
        make_bbox_layout_line(
            &[("Cyprinodon", 132.0, 176.0), ("alvarezi", 178.0, 207.0)],
            632.0,
            643.0,
        ),
        make_bbox_layout_line(
            &[
                ("La", 60.0, 69.0),
                ("Palma", 71.0, 94.0),
                ("Pupfish", 96.0, 124.0),
                ("Cyprinodon", 132.0, 176.0),
                ("longidorsalis", 178.0, 224.0),
            ],
            616.0,
            627.0,
        ),
        make_bbox_layout_line(
            &[("Butterfly", 60.0, 94.0), ("Splitfin", 96.0, 123.0)],
            600.0,
            611.0,
        ),
        make_bbox_layout_line(
            &[("Ameca", 132.0, 156.0), ("splendens", 158.0, 194.0)],
            600.0,
            611.0,
        ),
        make_bbox_layout_line(
            &[("Golden", 60.0, 88.0), ("Skiffia", 90.0, 113.0)],
            584.0,
            595.0,
        ),
        make_bbox_layout_line(
            &[("Skiffia", 132.0, 155.0), ("francesae", 158.0, 193.0)],
            584.0,
            595.0,
        ),
    ];

    // Caption line below the rows; note the horizontal gap after "List".
    let caption_line = make_bbox_layout_line(
        &[
            ("Table", 56.0, 74.0),
            ("6.1:", 76.0, 87.0),
            ("Four", 89.0, 105.0),
            ("fish", 107.0, 119.0),
            ("species", 121.0, 145.0),
            ("on", 147.0, 155.0),
            ("IUCN", 157.0, 176.0),
            ("Red", 178.0, 190.0),
            ("List", 192.0, 205.0),
            ("held", 279.0, 293.0),
            ("in", 295.0, 302.0),
            ("public", 304.0, 325.0),
            ("aquariums.", 327.0, 365.0),
        ],
        556.0,
        566.0,
    );

    let mut page_lines = vec![heading_line];
    page_lines.extend(species_lines);
    page_lines.push(caption_line);

    let detected = detect_layout_open_plate(576.0, &page_lines).unwrap();
    assert_eq!(detected.heading, "Fish species on IUCN Red List");
    assert_eq!(
        detected.header_row,
        owned(&["Fish species on IUCN Red List", "Scientific name"])
    );
    assert_eq!(detected.rows.len(), 4);
    assert_eq!(
        detected.rows[1],
        owned(&["La Palma Pupfish", "Cyprinodon longidorsalis"])
    );
    let caption_prefix = "Table 6.1: Four fish species on IUCN Red List";
    assert!(detected.caption.starts_with(caption_prefix));
}
11401
/// Runs `extract_layout_narrative_bridge` over prose and figure-caption lines
/// and checks that a bridge paragraph containing the first prose lines is
/// returned and that exactly two captions are deferred.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_extract_layout_narrative_bridge_recovers_left_prose_and_defers_captions() {
    let plate = OpenPlateCandidate {
        heading: "Fish species on IUCN Red List".to_string(),
        header_row: ["Fish species on IUCN Red List", "Scientific name"]
            .iter()
            .map(|cell| cell.to_string())
            .collect(),
        rows: vec![],
        caption: "Table 6.1".to_string(),
        cutoff_top_y: 560.0,
    };

    // Lines in block 0 are 12 units tall; lines in block 1 are 11 units tall.
    let block0_line = |words: &[(&str, f64, f64)], bottom: f64| {
        make_bbox_layout_line(words, bottom, bottom + 12.0)
    };
    let block1_line = |words: &[(&str, f64, f64)], bottom: f64| {
        make_bbox_layout_line_in_block(1, words, bottom, bottom + 11.0)
    };

    let page_lines = vec![
        // First prose row, split into two line fragments.
        block0_line(
            &[
                ("Public", 56.0, 83.0),
                ("aquariums,", 88.0, 135.0),
                ("because", 140.0, 174.0),
            ],
            509.0,
        ),
        block0_line(
            &[
                ("of", 180.0, 188.0),
                ("their", 194.0, 214.0),
                ("in-", 220.0, 233.0),
            ],
            509.0,
        ),
        // Second prose row, again in two fragments.
        block0_line(
            &[
                ("house", 56.0, 82.0),
                ("expertise,", 84.0, 125.0),
                ("can", 128.0, 143.0),
            ],
            495.0,
        ),
        block0_line(&[("act", 146.0, 159.0), ("quickly", 161.0, 191.0)], 495.0),
        // "Figure 6.3" caption, two fragments in block 1.
        block1_line(
            &[
                ("Figure", 242.0, 265.0),
                ("6.3:", 267.0, 280.0),
                ("Photo", 282.0, 303.0),
            ],
            355.0,
        ),
        block1_line(
            &[
                ("of", 305.0, 312.0),
                ("the", 314.0, 325.0),
                ("species.", 327.0, 360.0),
            ],
            355.0,
        ),
        // Later prose row; the second fragment has a wide gap after "Splitfin".
        block0_line(
            &[
                ("The", 56.0, 73.0),
                ("breeding", 77.0, 114.0),
                ("colonies", 118.0, 153.0),
            ],
            330.0,
        ),
        block0_line(
            &[
                ("of", 157.0, 165.0),
                ("the", 169.0, 183.0),
                ("Butterfly", 187.0, 224.0),
                ("Splitfin", 228.0, 258.0),
                ("at", 314.0, 323.0),
                ("the", 327.0, 341.0),
                ("London", 345.0, 377.0),
                ("Zoo", 381.0, 397.0),
                ("and", 401.0, 416.0),
                ("elsewhere", 420.0, 463.0),
                ("serve", 467.0, 489.0),
                ("as", 493.0, 502.0),
                ("ark", 506.0, 519.0),
            ],
            330.0,
        ),
        // "Figure 6.4" caption near the bottom of the page.
        block0_line(
            &[
                ("Figure", 56.0, 79.0),
                ("6.4:", 81.0, 94.0),
                ("Lake", 96.0, 116.0),
                ("Sturgeon", 118.0, 158.0),
            ],
            104.0,
        ),
    ];

    let bridge = extract_layout_narrative_bridge(576.0, &page_lines, &plate).unwrap();

    // A missing paragraph is treated as empty text, which fails the check below.
    let paragraph = bridge.bridge_paragraph.as_deref().unwrap_or("");
    assert!(paragraph.contains("Public aquariums") && paragraph.contains("expertise"));

    assert_eq!(bridge.deferred_captions.len(), 2);
    let first_caption = &bridge.deferred_captions[0];
    assert!(first_caption.contains("Figure 6.3:"));
    assert!(first_caption.contains("species."));
}
11517
/// Reads benchmark PDF `01030000000199.pdf` through
/// `read_pdftotext_bbox_layout_lines` and checks the title, left columns,
/// selected rows and notes of the detected OCR benchmark dashboard.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_detect_layout_ocr_benchmark_dashboard_on_real_pdf() {
    let owned = |cells: &[&str]| -> Vec<String> {
        cells.iter().map(|cell| cell.to_string()).collect()
    };

    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000199.pdf");
    let (page_width, layout_lines) = read_pdftotext_bbox_layout_lines(&pdf_path).unwrap();
    let dashboard = detect_layout_ocr_benchmark_dashboard(page_width, &layout_lines).unwrap();

    assert_eq!(
        dashboard.title,
        "Base Model Performance Evaluation of Upstage OCR Pack"
    );

    // Left-hand table.
    assert_eq!(dashboard.left_columns.len(), 2);
    assert_eq!(
        dashboard.left_columns[0],
        "Scene (Photographed document image)"
    );
    assert_eq!(
        dashboard.left_rows[0],
        owned(&["Company A²", "70.23", "80.41"])
    );

    // Right-hand table.
    assert_eq!(
        dashboard.right_rows[0],
        owned(&["OCR-Recall³", "73.2", "94.2", "94.1"])
    );
    let parsing_row = &dashboard.right_rows[3];
    let expected_cells = ["Parsing-F¹", "68.0", "82.65", "82.65"];
    for (column, expected) in expected_cells.iter().enumerate() {
        assert_eq!(parsing_row[column], *expected);
    }

    assert!(!dashboard.definition_notes.is_empty());
    assert!(!dashboard.source_notes.is_empty());
}
11559
/// `split_layout_line_spans` must cope with multi-byte characters (curly
/// quotes) in the line and still return three spans.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_split_layout_line_spans_handles_unicode_boundaries() {
    let spans =
        split_layout_line_spans("Title “Podcast #EP32: SDGs dan Anak Muda” 2024");
    assert_eq!(spans.len(), 3);

    // Leading label.
    assert_eq!(spans[0].1, "Title");

    // Quoted middle span keeps its closing curly quote.
    let quoted = &spans[1].1;
    assert!(quoted.contains("Podcast #EP32: SDGs dan Anak Muda"));
    assert!(quoted.ends_with('”'));

    // Trailing year.
    let trailing = &spans[2].1;
    assert!(trailing.ends_with("24"));
}
11571
/// Converts benchmark PDF `01030000000037.pdf`, renders it with
/// `render_layout_single_caption_chart_document`, and checks that headings,
/// body text and the figure caption appear while no "Lockdown Period" table
/// header is emitted.
///
/// Every assertion attaches the rendered output, matching the sibling
/// `to_markdown` tests, so a failure shows what was actually produced.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_render_layout_single_caption_chart_document_on_real_pdf() {
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000037.pdf");
    let doc = PdfDocument {
        title: None,
        source_path: Some(path.to_string_lossy().to_string()),
        number_of_pages: 1,
        kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
            .unwrap()
            .kids,
        ..PdfDocument::new("01030000000037.pdf".to_string())
    };
    let rendered = render_layout_single_caption_chart_document(&doc).unwrap();

    let expected_fragments = [
        "# 3. Impact on Business Operations",
        "## 3.1. Status of Business Operations",
        "As shown in Figure 3.1.1, the number of MSMEs",
        "Figure 3.1.1: Status of operations during each survey phase (%)",
        "lockdown period. In the handicraft/textile sector, 30% of MSMEs",
    ];
    for fragment in expected_fragments {
        assert!(
            rendered.contains(fragment),
            "missing {fragment:?} in:\n{rendered}"
        );
    }

    // The chart must not be rendered as a markdown table.
    assert!(
        !rendered.contains("| Lockdown Period |"),
        "unexpected table header in:\n{rendered}"
    );
}
11598
/// Converts benchmark PDF `01030000000072.pdf` and checks that `to_markdown`
/// emits the diagram heading, the bold chart title, the body sentence and the
/// figure label with its italic caption.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_captioned_media_document_on_real_pdf_72() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000072.pdf");
    let converted =
        crate::convert(&pdf_path, &crate::api::config::ProcessingConfig::default()).unwrap();

    let mut doc = PdfDocument::new("01030000000072.pdf".to_string());
    doc.title = None;
    doc.source_path = Some(pdf_path.to_string_lossy().into_owned());
    doc.number_of_pages = 1;
    doc.kids = converted.kids;

    let md = to_markdown(&doc).unwrap();
    let expected_fragments = [
        "## Diagram 5",
        "**Distribution of Komnas HAM’s YouTube Content (2019-2020)**",
        "As of 1 December 2021, the Komnas HAM’s YouTube channel has 2,290 subscribers",
        "**Figure 4**",
        "*Komnas HAM’s YouTube channel as of 1 December 2021*",
    ];
    for fragment in expected_fragments {
        assert!(md.contains(fragment), "{md}");
    }
}
11631
/// Converts benchmark PDF `01030000000073.pdf` and checks the opening heading
/// plus the image placeholder, figure label, bold captions, footnote marker
/// and footnote URL in the `to_markdown` output.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_captioned_media_document_on_real_pdf_73() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000073.pdf");
    let converted =
        crate::convert(&pdf_path, &crate::api::config::ProcessingConfig::default()).unwrap();

    let mut doc = PdfDocument::new("01030000000073.pdf".to_string());
    doc.title = None;
    doc.source_path = Some(pdf_path.to_string_lossy().into_owned());
    doc.number_of_pages = 1;
    doc.kids = converted.kids;

    let md = to_markdown(&doc).unwrap();

    // The document must open with the promoted heading.
    assert!(
        md.starts_with("# In this content, DPN Argentina provides a brief explanation"),
        "{md}"
    );

    let expected_fragments = [
        "Examples of such greetings are as follows:",
        "*Image*",
        "**Figure 6**",
        "**DPN Argentina**",
        "**Content: World Health Day Celebration (7 April 2021).**^98",
        "**Footnote:**",
        "https://twitter.com/DPNArgentina/status/1379765916259483648.",
    ];
    for fragment in expected_fragments {
        assert!(md.contains(fragment), "{md}");
    }
}
11668
/// Negative case: `render_layout_captioned_media_document` must return `None`
/// for benchmark PDF `01030000000014.pdf`.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_render_layout_captioned_media_document_does_not_fire_on_real_pdf_14() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
    let converted =
        crate::convert(&pdf_path, &crate::api::config::ProcessingConfig::default()).unwrap();

    let mut doc = PdfDocument::new("01030000000014.pdf".to_string());
    doc.title = None;
    doc.source_path = Some(pdf_path.to_string_lossy().into_owned());
    doc.number_of_pages = 1;
    doc.kids = converted.kids;

    let rendered = render_layout_captioned_media_document(&doc);
    assert!(rendered.is_none());
}
11685
/// `to_markdown` on benchmark PDF `01030000000014.pdf` must still contain a
/// known sentence from the body text.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_real_pdf_14_preserves_body_paragraphs() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
    let converted =
        crate::convert(&pdf_path, &crate::api::config::ProcessingConfig::default()).unwrap();

    let mut doc = PdfDocument::new("01030000000014.pdf".to_string());
    doc.title = None;
    doc.source_path = Some(pdf_path.to_string_lossy().into_owned());
    doc.number_of_pages = 1;
    doc.kids = converted.kids;

    let md = to_markdown(&doc).unwrap();
    let body_sentence =
        "These images also show that different areas are used by men and by women";
    assert!(md.contains(body_sentence), "{md}");
}
11706
/// Renders benchmark PDF `01030000000183.pdf` with
/// `render_layout_recommendation_infographic_document` and checks the
/// headings, table rows and trailing note of the output.
///
/// The document is built with an empty `kids` list and only `source_path`
/// pointing at the PDF. Every assertion attaches the rendered output,
/// matching the sibling `to_markdown` tests, so a failure shows what was
/// actually produced.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_render_layout_recommendation_infographic_on_real_pdf() {
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000183.pdf");
    let doc = PdfDocument {
        title: None,
        source_path: Some(path.to_string_lossy().to_string()),
        number_of_pages: 1,
        kids: Vec::new(),
        ..PdfDocument::new("01030000000183.pdf".to_string())
    };
    let rendered = render_layout_recommendation_infographic_document(&doc).unwrap();

    let expected_fragments = [
        "# Recommendation Pack: Track Record",
        "## Comparison with Beauty Commerce Recommendation Models",
        "| Graph-RecSys | 0.4048 |",
        "| Current Service Recommendation Algorithm | 0.159 |",
        "## Education Content Platform PoC Case",
        "| DKT Model | 0.882 |",
        "Compared to regular model",
    ];
    for fragment in expected_fragments {
        assert!(
            rendered.contains(fragment),
            "missing {fragment:?} in:\n{rendered}"
        );
    }
}
11728
/// Renders benchmark PDF `01030000000038.pdf` with
/// `render_layout_stacked_bar_report_document` and checks the figure heading,
/// one table row and the following section heading.
///
/// If rendering yields `None`, the intermediate detection state is written to
/// stderr before the final `unwrap` fails the test.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_render_layout_stacked_bar_report_on_real_pdf() {
    /// Re-runs the individual detection steps on `path` and prints what each
    /// one found, to help diagnose a failed render.
    fn report_detection_state(path: &Path) {
        let (page_width, lines) = read_pdftotext_bbox_layout_lines(path).unwrap();
        let blocks = collect_bbox_layout_blocks(&lines);
        let figures = collect_layout_figure_captions(&blocks);
        let narrative = detect_layout_stacked_bar_narrative(&blocks);

        eprintln!("page_width={page_width} figures={}", figures.len());
        // Only the first two figure captions are reported.
        for (label, figure) in ["figure1", "figure2"].iter().zip(figures.iter()) {
            eprintln!("{label}={}", bbox_layout_block_text(figure));
        }

        eprintln!("narrative={}", narrative.is_some());
        if let Some(found) = narrative.as_ref() {
            eprintln!("heading={}", found.heading);
            eprintln!("paragraphs={}", found.paragraphs.len());
            eprintln!("footnote={:?}", found.footnote);
        }

        // Print geometry for blocks that look like chart labels or headings.
        for block in &blocks {
            let text = bbox_layout_block_text(block);
            let mentions_marker = ["July", "October", "January", "Will ", "Don’t"]
                .iter()
                .any(|marker| text.contains(*marker));
            let starts_numbered = ["6.2.", "5."]
                .iter()
                .any(|prefix| text.starts_with(*prefix));
            if mentions_marker || starts_numbered {
                eprintln!(
                    "block top={:.1} bottom={:.1} left={:.1} right={:.1} text={}",
                    block.bbox.top_y,
                    block.bbox.bottom_y,
                    block.bbox.left_x,
                    block.bbox.right_x,
                    text
                );
            }
        }

        if figures.len() < 2 {
            return;
        }
        let figure_one = detect_layout_three_month_stacked_figure(
            &blocks,
            &lines,
            page_width,
            figures[0].clone(),
            figures[1].bbox.top_y,
        );
        eprintln!("figure_one_ok={}", figure_one.is_some());
        if let Some(found) = narrative.as_ref() {
            let figure_two = detect_layout_sector_bar_figure(
                &blocks,
                &lines,
                page_width,
                figures[1].clone(),
                found.top_y,
            );
            eprintln!("figure_two_ok={}", figure_two.is_some());
        }
    }

    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000038.pdf");
    let mut doc = PdfDocument::new("01030000000038.pdf".to_string());
    doc.title = None;
    doc.source_path = Some(pdf_path.to_string_lossy().into_owned());
    doc.number_of_pages = 1;
    doc.kids = Vec::new();

    let outcome = render_layout_stacked_bar_report_document(&doc);
    if outcome.is_none() {
        report_detection_state(&pdf_path);
    }

    let rendered = outcome.unwrap();
    assert!(rendered.contains("# Figure 6.1.1:"));
    assert!(rendered.contains("| Will not terminate employment | 51 | 81 | 73 |"));
    assert!(rendered.contains("# 6.2. Expectations for Re-Hiring Employees"));
}
11806
/// Renders benchmark PDF `01030000000076.pdf` with
/// `render_layout_multi_figure_chart_document` and checks the document
/// heading, both figure headings, sample data rows and the source note.
///
/// The document is built with an empty `kids` list and only `source_path`
/// pointing at the PDF. Every assertion attaches the rendered output,
/// matching the sibling `to_markdown` tests, so a failure shows what was
/// actually produced.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_render_layout_multi_figure_chart_document_on_real_pdf() {
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000076.pdf");
    let doc = PdfDocument {
        title: None,
        source_path: Some(path.to_string_lossy().to_string()),
        number_of_pages: 1,
        kids: Vec::new(),
        ..PdfDocument::new("01030000000076.pdf".to_string())
    };
    let rendered = render_layout_multi_figure_chart_document(&doc).unwrap();

    let expected_fragments = [
        "# Figures from the Document",
        "## Figure 1.7. Non-citizen population in Malaysia (in thousands)",
        "| 2016 | 3,230 |",
        "| 2021 | 2,693 |",
        "## Figure 1.8. Singapore foreign workforce stock (in thousands)",
        "| 2016 (Dec) | 1,393 |",
        "| 2021 (Dec) | 1,200 |",
        "Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.",
    ];
    for fragment in expected_fragments {
        assert!(
            rendered.contains(fragment),
            "missing {fragment:?} in:\n{rendered}"
        );
    }
}
11835
/// Converts benchmark PDF `01030000000132.pdf`, renders it with
/// `render_layout_open_plate_document`, and checks the heading, two species
/// rows, the italic table caption, a `---` separator and the bridged prose.
///
/// Every assertion attaches the rendered output, matching the sibling
/// `to_markdown` tests, so a failure shows what was actually produced.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_render_layout_open_plate_document_on_real_pdf() {
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
    let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
    let rendered = render_layout_open_plate_document(&doc).unwrap();

    let expected_fragments = [
        "# Fish species on IUCN Red List",
        "| Potosi Pupfish | Cyprinodon alvarezi |",
        "| Golden Skiffia | Skiffia francesae |",
        "*Table 6.1: Four fish species on IUCN Red List",
        "---",
        // The expected text has "in-" + "house" joined into "inhouse".
        "Public aquariums, because of their inhouse expertise",
    ];
    for fragment in expected_fragments {
        assert!(
            rendered.contains(fragment),
            "missing {fragment:?} in:\n{rendered}"
        );
    }
}
11850
/// End-to-end check on benchmark PDF `01030000000132.pdf`: `to_markdown` must
/// contain the plate heading, two species rows, the table caption and the
/// later prose about the breeding colonies.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_open_plate_document_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    let expected_fragments = [
        "# Fish species on IUCN Red List",
        "| Potosi Pupfish | Cyprinodon alvarezi |",
        "| Golden Skiffia | Skiffia francesae |",
        "*Table 6.1: Four fish species on IUCN Red List",
        "The breeding colonies of the Butterfly Splitfin",
    ];
    for fragment in expected_fragments {
        assert!(md.contains(fragment), "{md}");
    }
}
11877
/// Regression check on benchmark PDF `01030000000036.pdf`: the section heading
/// and body sentences must be present, and the output must not open with a
/// heading made from the "Business characteristics" paragraph.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_does_not_misclassify_open_plate_pdf_36() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000036.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    let expected_fragments = [
        "# 2. General Profile of MSMEs",
        "In July 2020, the survey established a general profile",
        "The tourism sub-sectors interviewed included lodging, restaurants and bars",
    ];
    for fragment in expected_fragments {
        assert!(md.contains(fragment), "{md}");
    }

    let wrong_opening = "# Business characteristics. Business size was";
    assert!(!md.starts_with(wrong_opening), "{md}");
}
11902
/// Regression check on benchmark PDF `01030000000040.pdf`: body text, the
/// figure caption and the report title must be present, and the output must
/// not open with a heading made from the "Thailand, Philippines ..." paragraph.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_does_not_misclassify_open_plate_pdf_40() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000040.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    let expected_fragments = [
        "Thailand, Philippines and Indonesia in particular, identifying known experts",
        "Figure 1: Age by gender of respondents",
        "Gender Analysis of Violent Extremism",
    ];
    for fragment in expected_fragments {
        assert!(md.contains(fragment), "{md}");
    }

    let wrong_opening = "# Thailand, Philippines and Indonesia in";
    assert!(!md.starts_with(wrong_opening), "{md}");
}
11927
/// Regression check on benchmark PDF `01030000000064.pdf`: body text and the
/// MANILA table row must be present, and the output must not open with a
/// "CAGAYAN DE ORO" heading.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_does_not_misclassify_open_plate_pdf_64() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000064.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    let expected_fragments = [
        "estuarine influenced areas.",
        "| MANILA | 2454 | 6,125 |",
        "The port of Manila has been documented",
    ];
    for fragment in expected_fragments {
        assert!(md.contains(fragment), "{md}");
    }

    assert!(!md.starts_with("# CAGAYAN DE ORO"), "{md}");
}
11944
/// `detect_footnote_citation_regions` on benchmark PDF `01030000000008.pdf`
/// must return at least one region, including one rendered table holding
/// footnotes 25 and 29 and one holding footnotes 30 and 33.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_detect_footnote_citation_regions_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();

    let regions = detect_footnote_citation_regions(&doc);
    assert!(!regions.is_empty(), "{regions:?}");

    // Each pair names two cells that must appear in the same rendered table.
    let expected_cell_pairs = [
        ("<td>25</td>", "<td>29</td>"),
        ("<td>30</td>", "<td>33</td>"),
    ];
    for (first_cell, last_cell) in expected_cell_pairs {
        let found = regions.iter().any(|region| {
            let html = &region.rendered;
            html.contains("<table>") && html.contains(first_cell) && html.contains(last_cell)
        });
        assert!(found, "{regions:#?}");
    }
}
11970
/// `to_markdown` on benchmark PDF `01030000000008.pdf` must render the
/// footnote citations as an HTML table with the expected header and rows.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_renders_footnote_citation_tables_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    let expected_fragments = [
        "<table>",
        "<th>Footnote</th><th>Citation</th>",
        "<td>25</td><td>Wiliam Beckford",
        "<td>29</td><td>Pope, The Rape of the Lock, 69.</td>",
        "<td>30</td><td>Beawes, Lex Mercatoria Rediviva, 791.</td>",
        "<td>32</td><td>Beawes, Lex Mercatoria Rediviva, 792.</td>",
        "<td>33</td><td>M.M., Pharmacopoia Reformata:",
    ];
    for fragment in expected_fragments {
        assert!(md.contains(fragment), "{md}");
    }
}
11999
/// `to_markdown` on benchmark PDF `01030000000128.pdf` must produce the
/// projection-sheet layout: title, lettered header row, a data row, the
/// figure caption, the template link and the italic page footer.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_projection_sheet_document_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000128.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    let expected_fragments = [
        "# Table and Figure from the Document",
        "| A | B | C | D | E |",
        "| 10 | 8 | 19.73214458 | 17.99 | 21.47 |",
        "**Figure 13.3. Graph of Projection Estimates**",
        "[Open Template in Microsoft Excel](#)",
        "*298 | Ch. 13. Homogeneous Investment Types*",
    ];
    for fragment in expected_fragments {
        assert!(md.contains(fragment), "{md}");
    }
}
12024
/// `to_markdown` on benchmark PDF `01030000000082.pdf` must produce the
/// appendix heading, both table headings with their header rows and a sample
/// data row each, and the two italic source notes.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_appendix_tables_document_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000082.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    let expected_fragments = [
        "# Appendices",
        // Table 28.
        "## TABLE 28: BREAKDOWN OF IMPRISONMENT CLAUSES IN STATE LAWS",
        "| Imprisonment terms | Number of clauses | Percentage of all states | Percentage of total |",
        "| Less than 3 months | 4,448 | 21.3% | 17.0% |",
        // Table 29.
        "## TABLE 29: STATES WITH MORE THAN 1,000 IMPRISONMENT CLAUSES",
        "| State | Number of clauses | GSDP (In Rs lakh crore) | GSDP (In $ billion) |",
        "| Gujarat | 1469 | 15.6 | 200.4 |",
        // Notes.
        "*Sources: TeamLease Regtech, and Reserve Bank of India for GSDPs*",
        "*Exchange rate: Rs 75 to USD*",
    ];
    for fragment in expected_fragments {
        assert!(md.contains(fragment), "{md}");
    }
}
12060
/// `to_markdown` on benchmark PDF `01030000000084.pdf` must open with the
/// document title and contain both table headings, a sample row from each
/// table and the italic note.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_titled_dual_table_document_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000084.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    assert!(md.starts_with("# Jailed for Doing Business"), "{md}");

    let expected_fragments = [
        "## TABLE 38: THREE CASE STUDIES ON NBFC COMPLIANCES*",
        "| Percentage of imprisonment clauses | 20% | 30% | 37% |",
        "## TABLE 39: BREAKDOWN OF IMPRISONMENT CLAUSES IN NBFC CASE STUDIES*",
        "| 5 years to 10 years | 19 | 19 | 19 |",
        "*These are real data from three NBFCs*",
    ];
    for fragment in expected_fragments {
        assert!(md.contains(fragment), "{md}");
    }
}
12091
/// `to_markdown` on benchmark PDF `01030000000047.pdf` must open with the
/// report title, keep the wrapped party name in a single table row, include
/// the totals row, and not emit a stray "Democracy Party" row.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_registration_report_document_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000047.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    assert!(
        md.starts_with("# ANFREL Pre-Election Assessment Mission Report"),
        "{md}"
    );

    let expected_rows = [
        "| 14 | Cambodian Indigeneous Peoples Democracy Party | 19 | 194 | 19 | 202 | +8 |",
        "| | Total | | 84,208 | | 86,092 | +1,884 |",
    ];
    for row in expected_rows {
        assert!(md.contains(row), "{md}");
    }

    let orphaned_row = "| | Democracy Party |";
    assert!(!md.contains(orphaned_row), "{md}");
}
12116
/// `to_markdown` on benchmark PDF `01030000000190.pdf` must open with the
/// Table 6 heading, include the Table 6 caption and Table 7 heading, and must
/// not duplicate the caption label or emit a "Merge v1" table row.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_dual_table_article_document_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000190.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    assert!(
        md.starts_with("# Table 6: Performance comparison amongst the merge candidates"),
        "{md}"
    );

    let expected_fragments = [
        "*Table 6*: Performance comparison amongst the merge candidates.",
        "# Table 7: Ablation studies on the different merge methods used for obtaining the final model",
    ];
    for fragment in expected_fragments {
        assert!(md.contains(fragment), "{md}");
    }

    let forbidden_fragments = ["*Table 6*: Table 6:", "| Merge v1"];
    for fragment in forbidden_fragments {
        assert!(!md.contains(fragment), "{md}");
    }
}
12137
/// A leading bullet glyph is removed by `normalize_list_text`, and a lone
/// bullet glyph is recognised by `is_pure_bullet_marker`.
#[test]
fn test_normalize_list_text_strips_redundant_bullets() {
    let normalized = normalize_list_text("• Collected via surveys");
    assert_eq!(normalized, "Collected via surveys");

    let lone_bullet = "•";
    assert!(is_pure_bullet_marker(lone_bullet));
}
12146
/// A bibliography entry followed by its "arXiv preprint ..." continuation line
/// must be reported as mergeable by `should_merge_paragraph_text`.
#[test]
fn test_reference_continuation_detected() {
    let reference_title = "Scaling laws for transfer.";
    let continuation = "arXiv preprint arXiv:2102.01293.";
    assert!(should_merge_paragraph_text(reference_title, continuation));
}
12154
/// `starts_with_enumerated_marker` accepts roman-numeral, numeric and lettered
/// list markers, and rejects a figure caption and plain prose.
#[test]
fn test_enumerated_markers_are_detected() {
    let is_marker = starts_with_enumerated_marker;

    // Accepted marker styles.
    assert!(is_marker("iii. Third item"));
    assert!(is_marker("1) First item"));
    assert!(is_marker("a. Lettered item"));

    // Text that only resembles a marker, or has none.
    assert!(!is_marker("Figure 1. Caption"));
    assert!(!is_marker("Natural dispersal"));
}
12163
12164 fn make_heading(text: &str) -> ContentElement {
12165 let bbox = BoundingBox::new(Some(1), 72.0, 700.0, 300.0, 712.0);
12166 let chunk = TextChunk {
12167 value: text.to_string(),
12168 bbox: bbox.clone(),
12169 font_name: "Lato-Bold".to_string(),
12170 font_size: 12.0,
12171 font_weight: 700.0,
12172 italic_angle: 0.0,
12173 font_color: "#000000".to_string(),
12174 contrast_ratio: 21.0,
12175 symbol_ends: vec![],
12176 text_format: TextFormat::Normal,
12177 text_type: TextType::Regular,
12178 pdf_layer: PdfLayer::Main,
12179 ocg_visible: true,
12180 index: None,
12181 page_number: Some(1),
12182 level: None,
12183 mcid: None,
12184 };
12185 let line = TextLine {
12186 bbox: bbox.clone(),
12187 index: None,
12188 level: None,
12189 font_size: 12.0,
12190 base_line: 702.0,
12191 slant_degree: 0.0,
12192 is_hidden_text: false,
12193 text_chunks: vec![chunk],
12194 is_line_start: true,
12195 is_line_end: true,
12196 is_list_line: false,
12197 connected_line_art_label: None,
12198 };
12199 let block = TextBlock {
12200 bbox: bbox.clone(),
12201 index: None,
12202 level: None,
12203 font_size: 12.0,
12204 base_line: 702.0,
12205 slant_degree: 0.0,
12206 is_hidden_text: false,
12207 text_lines: vec![line],
12208 has_start_line: true,
12209 has_end_line: true,
12210 text_alignment: None,
12211 };
12212 let column = TextColumn {
12213 bbox: bbox.clone(),
12214 index: None,
12215 level: None,
12216 font_size: 12.0,
12217 base_line: 702.0,
12218 slant_degree: 0.0,
12219 is_hidden_text: false,
12220 text_blocks: vec![block],
12221 };
12222 ContentElement::Heading(SemanticHeading {
12223 base: SemanticParagraph {
12224 base: SemanticTextNode {
12225 bbox,
12226 index: None,
12227 level: None,
12228 semantic_type: crate::models::enums::SemanticType::Heading,
12229 correct_semantic_score: None,
12230 columns: vec![column],
12231 font_weight: Some(700.0),
12232 font_size: Some(12.0),
12233 text_color: None,
12234 italic_angle: None,
12235 font_name: Some("Lato-Bold".to_string()),
12236 text_format: None,
12237 max_font_size: Some(12.0),
12238 background_color: None,
12239 is_hidden_text: false,
12240 },
12241 enclosed_top: false,
12242 enclosed_bottom: false,
12243 indentation: 0,
12244 },
12245 heading_level: Some(1),
12246 })
12247 }
12248
12249 fn make_heading_at(left: f64, bottom: f64, right: f64, top: f64, text: &str) -> ContentElement {
12250 let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
12251 let chunk = TextChunk {
12252 value: text.to_string(),
12253 bbox: bbox.clone(),
12254 font_name: "Lato-Bold".to_string(),
12255 font_size: top - bottom,
12256 font_weight: 700.0,
12257 italic_angle: 0.0,
12258 font_color: "#000000".to_string(),
12259 contrast_ratio: 21.0,
12260 symbol_ends: vec![],
12261 text_format: TextFormat::Normal,
12262 text_type: TextType::Regular,
12263 pdf_layer: PdfLayer::Main,
12264 ocg_visible: true,
12265 index: None,
12266 page_number: Some(1),
12267 level: None,
12268 mcid: None,
12269 };
12270 let line = TextLine {
12271 bbox: bbox.clone(),
12272 index: None,
12273 level: None,
12274 font_size: top - bottom,
12275 base_line: bottom + 2.0,
12276 slant_degree: 0.0,
12277 is_hidden_text: false,
12278 text_chunks: vec![chunk],
12279 is_line_start: true,
12280 is_line_end: true,
12281 is_list_line: false,
12282 connected_line_art_label: None,
12283 };
12284 let block = TextBlock {
12285 bbox: bbox.clone(),
12286 index: None,
12287 level: None,
12288 font_size: top - bottom,
12289 base_line: bottom + 2.0,
12290 slant_degree: 0.0,
12291 is_hidden_text: false,
12292 text_lines: vec![line],
12293 has_start_line: true,
12294 has_end_line: true,
12295 text_alignment: None,
12296 };
12297 let column = TextColumn {
12298 bbox: bbox.clone(),
12299 index: None,
12300 level: None,
12301 font_size: top - bottom,
12302 base_line: bottom + 2.0,
12303 slant_degree: 0.0,
12304 is_hidden_text: false,
12305 text_blocks: vec![block],
12306 };
12307 ContentElement::Heading(SemanticHeading {
12308 base: SemanticParagraph {
12309 base: SemanticTextNode {
12310 bbox,
12311 index: None,
12312 level: None,
12313 semantic_type: crate::models::enums::SemanticType::Heading,
12314 correct_semantic_score: None,
12315 columns: vec![column],
12316 font_weight: Some(700.0),
12317 font_size: Some(top - bottom),
12318 text_color: None,
12319 italic_angle: None,
12320 font_name: Some("Lato-Bold".to_string()),
12321 text_format: None,
12322 max_font_size: Some(top - bottom),
12323 background_color: None,
12324 is_hidden_text: false,
12325 },
12326 enclosed_top: false,
12327 enclosed_bottom: false,
12328 indentation: 0,
12329 },
12330 heading_level: None,
12331 })
12332 }
12333
12334 fn make_paragraph(text: &str, bottom: f64, top: f64) -> ContentElement {
12335 make_paragraph_at(72.0, bottom, 300.0, top, text)
12336 }
12337
12338 fn make_paragraph_at(
12339 left: f64,
12340 bottom: f64,
12341 right: f64,
12342 top: f64,
12343 text: &str,
12344 ) -> ContentElement {
12345 let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
12346 let chunk = TextChunk {
12347 value: text.to_string(),
12348 bbox: bbox.clone(),
12349 font_name: "Lato-Regular".to_string(),
12350 font_size: (top - bottom).max(1.0),
12351 font_weight: 400.0,
12352 italic_angle: 0.0,
12353 font_color: "#000000".to_string(),
12354 contrast_ratio: 21.0,
12355 symbol_ends: vec![],
12356 text_format: TextFormat::Normal,
12357 text_type: TextType::Regular,
12358 pdf_layer: PdfLayer::Main,
12359 ocg_visible: true,
12360 index: None,
12361 page_number: Some(1),
12362 level: None,
12363 mcid: None,
12364 };
12365 let line = TextLine {
12366 bbox: bbox.clone(),
12367 index: None,
12368 level: None,
12369 font_size: chunk.font_size,
12370 base_line: bottom + 2.0,
12371 slant_degree: 0.0,
12372 is_hidden_text: false,
12373 text_chunks: vec![chunk],
12374 is_line_start: true,
12375 is_line_end: true,
12376 is_list_line: false,
12377 connected_line_art_label: None,
12378 };
12379 let block = TextBlock {
12380 bbox: bbox.clone(),
12381 index: None,
12382 level: None,
12383 font_size: line.font_size,
12384 base_line: line.base_line,
12385 slant_degree: 0.0,
12386 is_hidden_text: false,
12387 text_lines: vec![line],
12388 has_start_line: true,
12389 has_end_line: true,
12390 text_alignment: None,
12391 };
12392 let column = TextColumn {
12393 bbox: bbox.clone(),
12394 index: None,
12395 level: None,
12396 font_size: block.font_size,
12397 base_line: block.base_line,
12398 slant_degree: 0.0,
12399 is_hidden_text: false,
12400 text_blocks: vec![block],
12401 };
12402 ContentElement::Paragraph(SemanticParagraph {
12403 base: SemanticTextNode {
12404 bbox,
12405 index: None,
12406 level: None,
12407 semantic_type: crate::models::enums::SemanticType::Paragraph,
12408 correct_semantic_score: None,
12409 columns: vec![column],
12410 font_weight: Some(400.0),
12411 font_size: Some(top - bottom),
12412 text_color: None,
12413 italic_angle: None,
12414 font_name: Some("Lato-Regular".to_string()),
12415 text_format: None,
12416 max_font_size: Some(top - bottom),
12417 background_color: None,
12418 is_hidden_text: false,
12419 },
12420 enclosed_top: false,
12421 enclosed_bottom: false,
12422 indentation: 0,
12423 })
12424 }
12425
12426 fn make_fallback_list(items: &[&str]) -> ContentElement {
12427 let mut list_items = Vec::new();
12428 for (idx, text) in items.iter().enumerate() {
12429 let top = 700.0 - idx as f64 * 18.0;
12430 let bottom = top - 12.0;
12431 let bbox = BoundingBox::new(Some(1), 72.0, bottom, 320.0, top);
12432 list_items.push(ListItem {
12433 bbox: bbox.clone(),
12434 index: None,
12435 level: None,
12436 label: ListLabel {
12437 bbox: bbox.clone(),
12438 content: vec![],
12439 semantic_type: None,
12440 },
12441 body: ListBody {
12442 bbox: bbox.clone(),
12443 content: vec![],
12444 semantic_type: None,
12445 },
12446 label_length: 0,
12447 contents: vec![make_paragraph_at(72.0, bottom, 320.0, top, text)],
12448 semantic_type: None,
12449 });
12450 }
12451
12452 ContentElement::List(PDFList {
12453 bbox: BoundingBox::new(
12454 Some(1),
12455 72.0,
12456 700.0 - items.len() as f64 * 18.0,
12457 320.0,
12458 700.0,
12459 ),
12460 index: None,
12461 level: None,
12462 list_items,
12463 numbering_style: Some("bullets".to_string()),
12464 common_prefix: None,
12465 previous_list_id: None,
12466 next_list_id: None,
12467 })
12468 }
12469
12470 fn make_toc_table(rows: &[(&str, &str)]) -> ContentElement {
12471 let mut table_rows = Vec::new();
12472 for (ri, (title, page)) in rows.iter().enumerate() {
12473 let top = 680.0 - ri as f64 * 18.0;
12474 let bottom = top - 12.0;
12475 let left_bbox = BoundingBox::new(Some(1), 72.0, bottom, 280.0, top);
12476 let right_bbox = BoundingBox::new(Some(1), 320.0, bottom, 360.0, top);
12477 table_rows.push(TableBorderRow {
12478 bbox: BoundingBox::new(Some(1), 72.0, bottom, 360.0, top),
12479 index: None,
12480 level: None,
12481 row_number: ri,
12482 cells: vec![
12483 TableBorderCell {
12484 bbox: left_bbox.clone(),
12485 index: None,
12486 level: None,
12487 row_number: ri,
12488 col_number: 0,
12489 row_span: 1,
12490 col_span: 1,
12491 content: vec![TableToken {
12492 base: TextChunk {
12493 value: (*title).to_string(),
12494 bbox: left_bbox,
12495 font_name: "Lato-Regular".to_string(),
12496 font_size: 10.0,
12497 font_weight: 400.0,
12498 italic_angle: 0.0,
12499 font_color: "#000000".to_string(),
12500 contrast_ratio: 21.0,
12501 symbol_ends: vec![],
12502 text_format: TextFormat::Normal,
12503 text_type: TextType::Regular,
12504 pdf_layer: PdfLayer::Main,
12505 ocg_visible: true,
12506 index: None,
12507 page_number: Some(1),
12508 level: None,
12509 mcid: None,
12510 },
12511 token_type: TableTokenType::Text,
12512 }],
12513 contents: vec![],
12514 semantic_type: None,
12515 },
12516 TableBorderCell {
12517 bbox: right_bbox.clone(),
12518 index: None,
12519 level: None,
12520 row_number: ri,
12521 col_number: 1,
12522 row_span: 1,
12523 col_span: 1,
12524 content: vec![TableToken {
12525 base: TextChunk {
12526 value: (*page).to_string(),
12527 bbox: right_bbox,
12528 font_name: "Lato-Regular".to_string(),
12529 font_size: 10.0,
12530 font_weight: 400.0,
12531 italic_angle: 0.0,
12532 font_color: "#000000".to_string(),
12533 contrast_ratio: 21.0,
12534 symbol_ends: vec![],
12535 text_format: TextFormat::Normal,
12536 text_type: TextType::Regular,
12537 pdf_layer: PdfLayer::Main,
12538 ocg_visible: true,
12539 index: None,
12540 page_number: Some(1),
12541 level: None,
12542 mcid: None,
12543 },
12544 token_type: TableTokenType::Text,
12545 }],
12546 contents: vec![],
12547 semantic_type: None,
12548 },
12549 ],
12550 semantic_type: None,
12551 });
12552 }
12553
12554 ContentElement::TableBorder(TableBorder {
12555 bbox: BoundingBox::new(Some(1), 72.0, 620.0, 360.0, 680.0),
12556 index: None,
12557 level: Some("1".to_string()),
12558 x_coordinates: vec![72.0, 320.0, 360.0],
12559 x_widths: vec![0.0, 0.0, 0.0],
12560 y_coordinates: vec![680.0, 662.0, 644.0, 626.0],
12561 y_widths: vec![0.0, 0.0, 0.0, 0.0],
12562 rows: table_rows,
12563 num_rows: rows.len(),
12564 num_columns: 2,
12565 is_bad_table: false,
12566 is_table_transformer: false,
12567 previous_table: None,
12568 next_table: None,
12569 })
12570 }
12571
/// A "CONTENTS" heading followed by a title/page table must render as a
/// heading plus one `- title page` bullet per table row.
#[test]
fn test_contents_document_renders_toc_table_rows() {
    let toc_entries = [
        ("Experiment #1: Hydrostatic Pressure", "3"),
        ("Experiment #2: Bernoulli's Theorem Demonstration", "13"),
        ("Experiment #3: Energy Loss in Pipe Fittings", "24"),
        ("Experiment #4: Energy Loss in Pipes", "33"),
        ("Experiment #5: Impact of a Jet", "43"),
        ("Experiment #6: Orifice and Free Jet Flow", "50"),
        ("Experiment #7: Osborne Reynolds' Demonstration", "59"),
        ("References", "101"),
    ];

    let mut document = PdfDocument::new(String::from("contents.pdf"));
    document.kids.push(make_heading("CONTENTS"));
    document.kids.push(make_toc_table(&toc_entries));

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.starts_with("# CONTENTS\n\n"));

    let expected_rows = [
        "- Experiment #1: Hydrostatic Pressure 3\n",
        "- Experiment #2: Bernoulli's Theorem Demonstration 13\n",
        "- Experiment #7: Osborne Reynolds' Demonstration 59\n",
        "- References 101\n",
    ];
    for expected in expected_rows {
        assert!(rendered.contains(expected));
    }
}
12594
/// Consecutive paragraphs tagged `SemanticType::TableOfContent` must be
/// emitted on adjacent lines, separated by a single newline.
#[test]
fn test_toc_semantic_paragraphs_render_without_blank_lines() {
    let toc_lines = [
        (
            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
            700.0,
            712.0,
        ),
        ("Section 5.1: The Linear Model 35", 684.0, 696.0),
    ];

    let mut document = PdfDocument::new(String::from("toc-semantic.pdf"));
    for (text, bottom, top) in toc_lines {
        let mut element = make_paragraph(text, bottom, top);
        // Re-tag the plain paragraph fixture as a table-of-contents entry.
        if let ContentElement::Paragraph(paragraph) = &mut element {
            paragraph.base.semantic_type = SemanticType::TableOfContent;
        }
        document.kids.push(element);
    }

    let rendered = to_markdown(&document).unwrap();
    let expected = "Part V. Chapter Five - Comparing Associations Between Multiple Variables\nSection 5.1: The Linear Model 35\n";
    assert!(rendered.contains(expected));
}
12618
/// Feeds eight tightly stacked "Part"/"Section"/"References" paragraphs
/// through `to_markdown` and checks the heading structure of the output.
#[test]
fn test_compact_toc_document_renders_without_blank_lines() {
    // (text, bottom, top) for each paragraph, top of the page first.
    let entries = [
        (
            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
            700.0,
            712.0,
        ),
        ("Section 5.1: The Linear Model 35", 684.0, 696.0),
        (
            "Part VI. Chapter Six - Comparing Three or More Group Means",
            668.0,
            680.0,
        ),
        (
            "Section 6.1: Between Versus Within Group Analyses 49",
            652.0,
            664.0,
        ),
        (
            "Part VII. Chapter Seven - Moderation and Mediation Analyses",
            636.0,
            648.0,
        ),
        (
            "Section 7.1: Mediation and Moderation Models 64",
            620.0,
            632.0,
        ),
        ("References 101", 604.0, 616.0),
        ("Section 8.1: Factor Analysis Definitions 75", 588.0, 600.0),
    ];

    let mut document = PdfDocument::new(String::from("compact-toc.pdf"));
    for (text, bottom, top) in entries {
        document.kids.push(make_paragraph(text, bottom, top));
    }

    let rendered = to_markdown(&document).unwrap();
    let expected_fragments = [
        "# Part V. Chapter Five - Comparing Associations Between Multiple Variables\n\n## Section 5.1: The Linear Model",
        "# Part VI. Chapter Six - Comparing Three or More Group Means\n\n## Section 6.1: Between Versus Within Group Analyses",
        "References 101\n\n## Section 8.1: Factor Analysis Definitions",
    ];
    for fragment in expected_fragments {
        assert!(rendered.contains(fragment));
    }
}
12669
/// A single paragraph that fuses a figure caption with the following body
/// sentence must be split at the end of the caption's credit parenthesis.
#[test]
fn test_merged_caption_and_body_paragraph_renders_as_two_paragraphs() {
    let fused_text = "Figure 1. This image shows the Western hemisphere as viewed from space 35,400 kilometers above Earth. (credit: modification of work by R. Stockli, NASA/ GSFC/ NOAA/ USGS) Our nearest astronomical neighbor is Earth's satellite, commonly called the Moon.";

    let mut document = PdfDocument::new(String::from("caption-body.pdf"));
    document.kids.push(make_paragraph(fused_text, 500.0, 540.0));

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains("USGS)\n\nOur nearest astronomical neighbor"));
}
12682
/// A short "Diagram N" label followed by a paragraph that fuses the caption
/// tail with body text must render as label + tail on adjacent lines, then a
/// blank line, then the body.
#[test]
fn test_short_caption_label_merges_with_following_tail_and_body() {
    let label = make_paragraph("Diagram 5", 540.0, 552.0);
    let tail_and_body = make_paragraph(
        "Distribution of Komnas HAM's YouTube Content (2019- 2020) As of 1 December 2021, the channel has 2,290 subscribers and 185,676 total views.",
        520.0,
        532.0,
    );

    let mut document = PdfDocument::new(String::from("diagram-caption.pdf"));
    for element in [label, tail_and_body] {
        document.kids.push(element);
    }

    let rendered = to_markdown(&document).unwrap();
    let expected = "Diagram 5\nDistribution of Komnas HAM's YouTube Content (2019- 2020)\n\nAs of 1 December 2021, the channel has 2,290 subscribers";
    assert!(rendered.contains(expected));
}
12698
/// A "Figure N" label, its caption tail, and a trailing year on its own line
/// must render as three adjacent lines with no blank line before the year.
#[test]
fn test_short_caption_label_merges_with_tail_and_year() {
    let pieces = [
        ("Figure 4", 540.0, 552.0),
        (
            "Komnas HAM's YouTube channel as of 1 December",
            520.0,
            532.0,
        ),
        ("2021", 500.0, 512.0),
    ];

    let mut document = PdfDocument::new(String::from("figure-caption.pdf"));
    for (text, bottom, top) in pieces {
        document.kids.push(make_paragraph(text, bottom, top));
    }

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains("Figure 4\nKomnas HAM's YouTube channel as of 1 December\n2021"));
    assert!(!rendered.contains("\n\n2021"));
}
12714
/// A bare number in the middle of the page ("100") must survive rendering,
/// while a bare number at the very bottom ("36") must not appear as its own
/// line.
#[test]
fn test_mid_page_numeric_labels_are_not_dropped_as_page_numbers() {
    let paragraphs = [
        ("Figure 1", 760.0, 772.0),
        ("100", 520.0, 528.0),
        ("Body text continues here.", 400.0, 412.0),
        ("36", 20.0, 28.0),
    ];

    let mut document = PdfDocument::new(String::from("chart.pdf"));
    for (text, bottom, top) in paragraphs {
        document.kids.push(make_paragraph(text, bottom, top));
    }

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains("100"));
    assert!(rendered.lines().all(|line| line.trim() != "36"));
}
12728
/// Two complete-sentence paragraphs must stay separate, divided by a blank
/// line, in the rendered markdown.
#[test]
fn test_semantic_paragraphs_are_not_remerged_in_markdown() {
    let first = make_paragraph("First semantic paragraph ends here.", 520.0, 532.0);
    let second = make_paragraph("Second semantic paragraph starts here.", 500.0, 512.0);

    let mut document = PdfDocument::new(String::from("paragraphs.pdf"));
    for element in [first, second] {
        document.kids.push(element);
    }

    let rendered = to_markdown(&document).unwrap();
    let expected = "First semantic paragraph ends here.\n\nSecond semantic paragraph starts here.";
    assert!(rendered.contains(expected));
}
12748
/// A paragraph that ends without terminal punctuation followed by a
/// lowercase fragment must be joined into one sentence with a single space.
#[test]
fn test_lowercase_semantic_paragraph_continuation_is_merged() {
    let opening = make_paragraph(
        "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference",
        520.0,
        532.0,
    );
    let continuation = make_paragraph("of interest.", 500.0, 512.0);

    let mut document = PdfDocument::new(String::from("continuation.pdf"));
    for element in [opening, continuation] {
        document.kids.push(element);
    }

    let rendered = to_markdown(&document).unwrap();
    let expected = "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference of interest.";
    assert!(rendered.contains(expected));
}
12764
/// Paragraphs that each start with a roman-numeral marker must remain
/// separate paragraphs divided by a blank line.
#[test]
fn test_semantic_enumerated_paragraphs_are_not_merged() {
    let enumerated = [
        (
            "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
            520.0,
            532.0,
        ),
        (
            "iv. This business model will be operating cost-oriented not capital cost-oriented.",
            500.0,
            512.0,
        ),
    ];

    let mut document = PdfDocument::new(String::from("enumerated-paragraphs.pdf"));
    for (text, bottom, top) in enumerated {
        document.kids.push(make_paragraph(text, bottom, top));
    }

    let rendered = to_markdown(&document).unwrap();
    let expected = "iii. Looking at cost items, the cost of raw woods procurement will be highest share.\n\niv. This business model will be operating cost-oriented not capital cost-oriented.";
    assert!(rendered.contains(expected));
}
12784
/// A figure caption sitting above the first numbered heading of a one-page
/// document must be omitted so that the output starts with the heading.
#[test]
fn test_leading_figure_carryover_is_skipped_before_first_numbered_heading() {
    let carried_caption = make_paragraph_at(
        72.0,
        742.0,
        540.0,
        756.0,
        "Figure 6. Mytella strigata biofouling green mussel farms in Bacoor City, Cavite, Manila Bay",
    );
    let numbered_heading = make_heading_at(72.0, 680.0, 260.0, 696.0, "5. Natural dispersal");
    let body = make_paragraph_at(
        72.0,
        640.0,
        540.0,
        654.0,
        "Dispersal by purely natural means is not included as a pathway of biological invasions.",
    );

    let mut document = PdfDocument::new(String::from("leading-figure-carryover.pdf"));
    document.number_of_pages = 1;
    for element in [carried_caption, numbered_heading, body] {
        document.kids.push(element);
    }

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.starts_with("# 5. Natural dispersal"));
    assert!(!rendered.contains("Figure 6. Mytella strigata"));
}
12815
/// List items whose text already begins with a bullet glyph must not get a
/// doubled marker, and items that are only a bullet or only a number must
/// not be emitted as list entries.
#[test]
fn test_list_renderer_strips_duplicate_bullets_and_skips_bullet_only_items() {
    let raw_items = ["• First item", "•", "• Second item", "133"];

    let mut document = PdfDocument::new(String::from("bullets.pdf"));
    document.kids.push(make_fallback_list(&raw_items));

    let rendered = to_markdown(&document).unwrap();
    for kept in ["- First item", "- Second item"] {
        assert!(rendered.contains(kept));
    }
    for dropped in ["- • First item", "\n- •\n", "\n- 133\n"] {
        assert!(!rendered.contains(dropped));
    }
}
12833
/// A list item that is really the wrapped continuation of the previous item
/// must be folded into that item rather than rendered as its own bullet.
#[test]
fn test_list_renderer_merges_wrapped_continuation_items() {
    let raw_items = [
        "Use a micropipette to add 2 μL of loading dye",
        "and down a couple of times to mix the loading dye with the digested DNA.",
        "Use a fresh pipet tip for each reaction tube.",
    ];

    let mut document = PdfDocument::new(String::from("wrapped-list.pdf"));
    document.kids.push(make_fallback_list(&raw_items));

    let rendered = to_markdown(&document).unwrap();
    let merged_item = "- Use a micropipette to add 2 μL of loading dye and down a couple of times to mix the loading dye with the digested DNA.";
    assert!(rendered.contains(merged_item));
    assert!(rendered.contains("- Use a fresh pipet tip for each reaction tube."));
    assert!(!rendered.contains("\n- and down"));
}
12850
/// List items that carry their own roman-numeral markers must be emitted one
/// per line without an added `- ` bullet prefix.
#[test]
fn test_list_renderer_keeps_enumerated_items_separate() {
    let raw_items = [
        "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
        "iv. This business model will be operating cost-oriented not capital cost-oriented.",
        "v. Assumed selling price of wood pellet is $100 per tonne and appropriate.",
    ];

    let mut document = PdfDocument::new(String::from("enumerated-list.pdf"));
    document.kids.push(make_fallback_list(&raw_items));

    let rendered = to_markdown(&document).unwrap();
    let expected = "iii. Looking at cost items, the cost of raw woods procurement will be highest share.\niv. This business model will be operating cost-oriented not capital cost-oriented.\nv. Assumed selling price of wood pellet is $100 per tonne and appropriate.";
    assert!(rendered.contains(expected));
    assert!(!rendered.contains("- iii."));
}
12864
/// `drop_isolated_noise_lines` must remove stand-alone single-character
/// lines ("1", "o") while leaving real sentences untouched.
#[test]
fn test_postprocess_drops_isolated_single_char_noise_lines() {
    let input = "# The Data Journey\n\n1\n\nTo get started.\n\no\n\nNOTE: Keep going.\n";
    let output = drop_isolated_noise_lines(input);

    for noise in ["\n1\n", "\no\n"] {
        assert!(!output.contains(noise));
    }
    for sentence in ["To get started.", "NOTE: Keep going."] {
        assert!(output.contains(sentence));
    }
}
12874
12875 fn make_two_column_table(rows: &[(&str, &str)]) -> ContentElement {
12876 let mut table_rows = Vec::new();
12877 for (row_number, (left, right)) in rows.iter().enumerate() {
12878 let top = 656.0 - row_number as f64 * 18.0;
12879 let bottom = top - 16.0;
12880 let mut cells = Vec::new();
12881 for (col_number, (text, left_x, right_x)) in
12882 [(*left, 72.0, 220.0), (*right, 220.0, 420.0)]
12883 .into_iter()
12884 .enumerate()
12885 {
12886 let content = if text.is_empty() {
12887 Vec::new()
12888 } else {
12889 vec![TableToken {
12890 base: TextChunk {
12891 value: text.to_string(),
12892 bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
12893 font_name: "Test".to_string(),
12894 font_size: 11.0,
12895 font_weight: 400.0,
12896 italic_angle: 0.0,
12897 font_color: "[0.0]".to_string(),
12898 contrast_ratio: 21.0,
12899 symbol_ends: Vec::new(),
12900 text_format: TextFormat::Normal,
12901 text_type: TextType::Regular,
12902 pdf_layer: PdfLayer::Main,
12903 ocg_visible: true,
12904 index: None,
12905 page_number: Some(1),
12906 level: None,
12907 mcid: None,
12908 },
12909 token_type: TableTokenType::Text,
12910 }]
12911 };
12912 cells.push(TableBorderCell {
12913 bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
12914 index: None,
12915 level: None,
12916 row_number,
12917 col_number,
12918 row_span: 1,
12919 col_span: 1,
12920 content,
12921 contents: vec![],
12922 semantic_type: None,
12923 });
12924 }
12925
12926 table_rows.push(TableBorderRow {
12927 bbox: BoundingBox::new(Some(1), 72.0, bottom, 420.0, top),
12928 index: None,
12929 level: None,
12930 row_number,
12931 cells,
12932 semantic_type: None,
12933 });
12934 }
12935
12936 ContentElement::TableBorder(TableBorder {
12937 bbox: BoundingBox::new(
12938 Some(1),
12939 72.0,
12940 656.0 - rows.len() as f64 * 18.0 - 16.0,
12941 420.0,
12942 656.0,
12943 ),
12944 index: None,
12945 level: Some("1".to_string()),
12946 x_coordinates: vec![72.0, 220.0, 420.0],
12947 x_widths: vec![0.0; 3],
12948 y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
12949 y_widths: vec![0.0; rows.len() + 1],
12950 rows: table_rows,
12951 num_rows: rows.len(),
12952 num_columns: 2,
12953 is_bad_table: false,
12954 is_table_transformer: false,
12955 previous_table: None,
12956 next_table: None,
12957 })
12958 }
12959
12960 fn make_chunked_paragraph_line(
12961 segments: &[(&str, f64, f64)],
12962 bottom: f64,
12963 top: f64,
12964 ) -> ContentElement {
12965 let bbox = BoundingBox::new(
12966 Some(1),
12967 segments.first().map(|(_, left, _)| *left).unwrap_or(72.0),
12968 bottom,
12969 segments.last().map(|(_, _, right)| *right).unwrap_or(320.0),
12970 top,
12971 );
12972
12973 let chunks = segments
12974 .iter()
12975 .map(|(text, left, right)| TextChunk {
12976 value: (*text).to_string(),
12977 bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
12978 font_name: "Lato-Regular".to_string(),
12979 font_size: top - bottom,
12980 font_weight: 400.0,
12981 italic_angle: 0.0,
12982 font_color: "#000000".to_string(),
12983 contrast_ratio: 21.0,
12984 symbol_ends: vec![],
12985 text_format: TextFormat::Normal,
12986 text_type: TextType::Regular,
12987 pdf_layer: PdfLayer::Main,
12988 ocg_visible: true,
12989 index: None,
12990 page_number: Some(1),
12991 level: None,
12992 mcid: None,
12993 })
12994 .collect::<Vec<_>>();
12995
12996 let line = TextLine {
12997 bbox: bbox.clone(),
12998 index: None,
12999 level: None,
13000 font_size: top - bottom,
13001 base_line: bottom + 2.0,
13002 slant_degree: 0.0,
13003 is_hidden_text: false,
13004 text_chunks: chunks,
13005 is_line_start: true,
13006 is_line_end: true,
13007 is_list_line: false,
13008 connected_line_art_label: None,
13009 };
13010 let block = TextBlock {
13011 bbox: bbox.clone(),
13012 index: None,
13013 level: None,
13014 font_size: line.font_size,
13015 base_line: line.base_line,
13016 slant_degree: 0.0,
13017 is_hidden_text: false,
13018 text_lines: vec![line],
13019 has_start_line: true,
13020 has_end_line: true,
13021 text_alignment: None,
13022 };
13023 let column = TextColumn {
13024 bbox: bbox.clone(),
13025 index: None,
13026 level: None,
13027 font_size: block.font_size,
13028 base_line: block.base_line,
13029 slant_degree: 0.0,
13030 is_hidden_text: false,
13031 text_blocks: vec![block],
13032 };
13033
13034 ContentElement::Paragraph(SemanticParagraph {
13035 base: SemanticTextNode {
13036 bbox,
13037 index: None,
13038 level: None,
13039 semantic_type: SemanticType::Paragraph,
13040 correct_semantic_score: None,
13041 columns: vec![column],
13042 font_weight: Some(400.0),
13043 font_size: Some(top - bottom),
13044 text_color: None,
13045 italic_angle: None,
13046 font_name: Some("Lato-Regular".to_string()),
13047 text_format: None,
13048 max_font_size: Some(top - bottom),
13049 background_color: None,
13050 is_hidden_text: false,
13051 },
13052 enclosed_top: false,
13053 enclosed_bottom: false,
13054 indentation: 0,
13055 })
13056 }
13057
13058 fn make_n_column_table(rows: &[Vec<&str>], column_bounds: &[(f64, f64)]) -> ContentElement {
13059 let mut table_rows = Vec::new();
13060 for (row_number, row_values) in rows.iter().enumerate() {
13061 let top = 656.0 - row_number as f64 * 18.0;
13062 let bottom = top - 16.0;
13063 let mut cells = Vec::new();
13064 for (col_number, (left_x, right_x)) in column_bounds.iter().enumerate() {
13065 let text = row_values.get(col_number).copied().unwrap_or("");
13066 let content = if text.is_empty() {
13067 Vec::new()
13068 } else {
13069 vec![TableToken {
13070 base: TextChunk {
13071 value: text.to_string(),
13072 bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
13073 font_name: "Test".to_string(),
13074 font_size: 11.0,
13075 font_weight: 400.0,
13076 italic_angle: 0.0,
13077 font_color: "[0.0]".to_string(),
13078 contrast_ratio: 21.0,
13079 symbol_ends: Vec::new(),
13080 text_format: TextFormat::Normal,
13081 text_type: TextType::Regular,
13082 pdf_layer: PdfLayer::Main,
13083 ocg_visible: true,
13084 index: None,
13085 page_number: Some(1),
13086 level: None,
13087 mcid: None,
13088 },
13089 token_type: TableTokenType::Text,
13090 }]
13091 };
13092 cells.push(TableBorderCell {
13093 bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
13094 index: None,
13095 level: None,
13096 row_number,
13097 col_number,
13098 row_span: 1,
13099 col_span: 1,
13100 content,
13101 contents: vec![],
13102 semantic_type: None,
13103 });
13104 }
13105
13106 table_rows.push(TableBorderRow {
13107 bbox: BoundingBox::new(
13108 Some(1),
13109 column_bounds.first().map(|(left, _)| *left).unwrap_or(72.0),
13110 bottom,
13111 column_bounds
13112 .last()
13113 .map(|(_, right)| *right)
13114 .unwrap_or(420.0),
13115 top,
13116 ),
13117 index: None,
13118 level: None,
13119 row_number,
13120 cells,
13121 semantic_type: None,
13122 });
13123 }
13124
13125 let left = column_bounds
13126 .first()
13127 .map(|(value, _)| *value)
13128 .unwrap_or(72.0);
13129 let right = column_bounds
13130 .last()
13131 .map(|(_, value)| *value)
13132 .unwrap_or(420.0);
13133 let x_coordinates = std::iter::once(left)
13134 .chain(column_bounds.iter().map(|(_, right)| *right))
13135 .collect::<Vec<_>>();
13136
13137 ContentElement::TableBorder(TableBorder {
13138 bbox: BoundingBox::new(
13139 Some(1),
13140 left,
13141 656.0 - rows.len() as f64 * 18.0 - 16.0,
13142 right,
13143 656.0,
13144 ),
13145 index: None,
13146 level: Some("1".to_string()),
13147 x_coordinates,
13148 x_widths: vec![0.0; column_bounds.len() + 1],
13149 y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
13150 y_widths: vec![0.0; rows.len() + 1],
13151 rows: table_rows,
13152 num_rows: rows.len(),
13153 num_columns: column_bounds.len(),
13154 is_bad_table: false,
13155 is_table_transformer: false,
13156 previous_table: None,
13157 next_table: None,
13158 })
13159 }
13160
/// A two-column data table whose right column is numeric must still render
/// as a markdown pipe table.
#[test]
fn test_numeric_two_column_table_is_not_misrendered_as_toc() {
    let table_rows = [
        ("Mineral or colloid type", "CEC of pure colloid"),
        ("", "cmolc/kg"),
        ("kaolinite", "10"),
        ("illite", "30"),
    ];

    let mut document = PdfDocument::new(String::from("cec-table.pdf"));
    document.number_of_pages = 1;
    document.kids.push(make_two_column_table(&table_rows));

    let rendered = to_markdown(&document).unwrap();
    for expected in ["| --- | --- |", "| kaolinite | 10 |"] {
        assert!(rendered.contains(expected));
    }
}
13176
/// A two-column table whose right column is empty below the header must
/// still render as a markdown pipe table with blank cells.
#[test]
fn test_blank_right_column_table_is_not_misrendered_as_toc() {
    let table_rows = [
        (
            "Added cation",
            "Relative Size & Settling Rates of Floccules",
        ),
        ("K+", ""),
        ("Na+", ""),
        ("Ca2+", ""),
    ];

    let mut document = PdfDocument::new(String::from("flocculation-table.pdf"));
    document.number_of_pages = 1;
    document.kids.push(make_two_column_table(&table_rows));

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains("| Added cation | Relative Size & Settling Rates of Floccules |"));
    assert!(rendered.contains("| K+ | |"));
}
13195
/// A two-column "card" table with a lone number on the left and prose on the
/// right must render as a single numbered item instead of a pipe table.
#[test]
fn test_infographic_card_table_renders_as_numbered_item() {
    let card_rows = [
        (
            "1",
            "We're all both consumers and creators of creative work.",
        ),
        (
            "",
            "As consumers, we watch movies, listen to music, read books, and more.",
        ),
    ];

    let mut document = PdfDocument::new(String::from("infographic-card.pdf"));
    document.number_of_pages = 1;
    document.kids.push(make_two_column_table(&card_rows));

    let rendered = to_markdown(&document).unwrap();
    let expected = "1. We're all both consumers and creators of creative work. As consumers, we watch movies, listen to music, read books, and more.";
    assert!(rendered.contains(expected));
    assert!(!rendered.contains("| 1 |"));
}
13217
/// A seven-column table with a grouped header row ("Instruction" and
/// "Alignment" each covering several columns) must keep the group row and
/// the sub-header row as two separate rows, with group labels repeated
/// across their columns rather than concatenated with the sub-headers.
#[test]
fn test_grouped_header_rows_are_preserved_without_flattening() {
    let table_rows = [
        vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
        vec![
            "",
            "Alpaca-GPT4",
            "OpenOrca",
            "Synth. Math-Instruct",
            "Orca DPO Pairs",
            "Ultrafeedback Cleaned",
            "Synth. Math-Alignment",
        ],
        vec![
            "Total # Samples",
            "52K",
            "2.91M",
            "126K",
            "12.9K",
            "60.8K",
            "126K",
        ],
    ];
    let column_bounds = [
        (72.0, 120.0),
        (120.0, 170.0),
        (170.0, 220.0),
        (220.0, 280.0),
        (280.0, 340.0),
        (340.0, 410.0),
        (410.0, 470.0),
    ];

    let mut document = PdfDocument::new(String::from("grouped-header.pdf"));
    document.number_of_pages = 1;
    document
        .kids
        .push(make_n_column_table(&table_rows, &column_bounds));

    let rendered = to_markdown(&document).unwrap();

    let expected_rows = [
        "| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |",
        "| | Alpaca-GPT4 | OpenOrca | Synth. Math-Instruct | Orca DPO Pairs | Ultrafeedback Cleaned | Synth. Math-Alignment |",
    ];
    for expected in expected_rows {
        assert!(rendered.contains(expected));
    }

    let flattened_labels = ["Instruction OpenOrca", "Alignment Ultrafeedback"];
    for flattened in flattened_labels {
        assert!(!rendered.contains(flattened));
    }
}
13265
/// A page made of two short title lines, a seven-column table, its caption
/// lines and a trailing body paragraph must keep the caption and the table but
/// omit the body paragraph.
#[test]
fn test_top_table_plate_renderer_stops_before_article_body() {
    let table_rows = [
        vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
        vec![
            "",
            "Alpaca-GPT4",
            "OpenOrca",
            "Synth. Math-Instruct",
            "Orca DPO Pairs",
            "Ultrafeedback Cleaned",
            "Synth. Math-Alignment",
        ],
        vec![
            "Total # Samples",
            "52K",
            "2.91M",
            "126K",
            "12.9K",
            "60.8K",
            "126K",
        ],
        vec![
            "Maximum # Samples Used",
            "52K",
            "100K",
            "52K",
            "12.9K",
            "60.8K",
            "20.1K",
        ],
        vec!["Open Source", "O", "O", "✗", "O", "O", "✗"],
    ];
    let column_slots = [
        (78.0, 125.0),
        (125.0, 175.0),
        (175.0, 225.0),
        (225.0, 285.0),
        (285.0, 345.0),
        (345.0, 415.0),
        (415.0, 490.0),
    ];

    let mut document = PdfDocument::new("table-plate.pdf".to_string());
    document.number_of_pages = 1;

    // Paragraphs pushed ahead of the table.
    for (x0, y0, x1, y1, text) in [
        (72.0, 724.0, 200.0, 736.0, "SOLAR 10.7B"),
        (72.0, 704.0, 220.0, 716.0, "Training datasets"),
    ] {
        document.kids.push(make_paragraph_at(x0, y0, x1, y1, text));
    }

    document
        .kids
        .push(make_n_column_table(&table_rows, &column_slots));

    // Paragraphs pushed after the table: two caption lines, then article body.
    for (x0, y0, x1, y1, text) in [
        (
            72.0,
            500.0,
            310.0,
            514.0,
            "Table 1: Training datasets used for the instruction and alignment tuning stages, respectively.",
        ),
        (
            286.0,
            484.0,
            526.0,
            498.0,
            "Open source indicates whether the dataset is open-sourced.",
        ),
        (
            72.0,
            360.0,
            290.0,
            388.0,
            "Comparison to other up-scaling methods. Unlike Komatsuzaki et al. (2022)...",
        ),
    ] {
        document.kids.push(make_paragraph_at(x0, y0, x1, y1, text));
    }

    let rendered = to_markdown(&document).unwrap();

    let header_row = "| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |";
    assert!(rendered.contains("Table 1: Training datasets used for the instruction"));
    assert!(rendered.contains(header_row));
    assert!(!rendered.contains("Comparison to other up-scaling methods"));
}
13348
/// When a numbered section heading appears after a run of equation lead-in
/// paragraphs, the output must start at that heading, join the three paragraph
/// pieces that follow it into one sentence, and drop both the earlier lead-in
/// text and the trailing "EXPERIMENT #6 ..." line.
#[test]
fn test_late_section_boundary_renderer_drops_equation_carryover() {
    let mut document = PdfDocument::new("late-section.pdf".to_string());
    document.number_of_pages = 1;

    // Paragraphs that precede the section heading.
    for (x0, y0, x1, y1, text) in [
        (
            72.0,
            700.0,
            540.0,
            714.0,
            "The horizontal distance traveled by the jet is equal to:",
        ),
        (
            72.0,
            640.0,
            540.0,
            654.0,
            "The vertical position of the jet may be calculated as:",
        ),
        (72.0, 580.0, 260.0, 594.0, "Rearranging Equation (8) gives:"),
        (
            72.0,
            520.0,
            420.0,
            534.0,
            "Substitution into Equation 7 results in:",
        ),
        (
            72.0,
            460.0,
            280.0,
            474.0,
            "Equations (10) can be rearranged to find Cv:",
        ),
    ] {
        document.kids.push(make_paragraph_at(x0, y0, x1, y1, text));
    }

    document.kids.push(make_heading_at(
        72.0,
        350.0,
        420.0,
        366.0,
        "7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE",
    ));

    // Paragraphs that follow the heading; the last one is the line expected to be dropped.
    for (x0, y0, x1, y1, text) in [
        (
            72.0,
            326.0,
            380.0,
            340.0,
            "If C_d is assumed to be constant, then a graph of Q plotted against",
        ),
        (400.0, 326.0, 540.0, 340.0, "(Equation 6) will be linear, and"),
        (72.0, 310.0, 240.0, 324.0, "the slope of this graph will be:"),
        (
            360.0,
            36.0,
            550.0,
            48.0,
            "EXPERIMENT #6: ORIFICE AND FREE JET FLOW 53",
        ),
    ] {
        document.kids.push(make_paragraph_at(x0, y0, x1, y1, text));
    }

    let rendered = to_markdown(&document).unwrap();

    let joined_sentence = "If C_d is assumed to be constant, then a graph of Q plotted against (Equation 6) will be linear, and the slope of this graph will be:";
    assert!(rendered.starts_with("# 7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE"));
    assert!(rendered.contains(joined_sentence));
    for dropped in [
        "The horizontal distance traveled by the jet",
        "EXPERIMENT #6",
    ] {
        assert!(!rendered.contains(dropped));
    }
}
13432
/// A row directly below the header that only holds a sentence fragment in one
/// cell must not appear in the output, while the regular data rows still do.
#[test]
fn test_leading_table_carryover_row_is_trimmed_from_general_renderer() {
    let table_rows = [
        vec![
            "Jurisdiction",
            "GATS XVII Reservation (1994)",
            "Foreign Ownership Permitted",
            "Restrictions on Foreign Ownership",
            "Foreign Ownership Reporting Requirements",
        ],
        // Row carrying only a fragment in the fourth column.
        vec![
            "",
            "",
            "",
            "right required to acquire desert lands and continue the prior page",
            "",
        ],
        vec!["Finland", "N", "Y", "Prior approval may be required.", ""],
        vec!["France", "N", "Y", "None.", ""],
    ];
    let column_slots = [
        (72.0, 150.0),
        (150.0, 235.0),
        (235.0, 330.0),
        (330.0, 500.0),
        (500.0, 560.0),
    ];

    let mut document = PdfDocument::new("carryover-table.pdf".to_string());
    document.number_of_pages = 1;
    document
        .kids
        .push(make_n_column_table(&table_rows, &column_slots));

    let rendered = to_markdown(&document).unwrap();

    assert!(!rendered.contains("right required to acquire desert lands"));
    assert!(rendered.contains("| Finland | N | Y | Prior approval may be required. | |"));
}
13469
/// A page consisting of a title paragraph, one table and a trailing
/// "The Law Library of Congress 7" line must render the title as a level-one
/// heading at the very start, omit that trailing line and the fragment-only
/// table row, and keep the regular data rows.
#[test]
fn test_single_table_report_renderer_promotes_title_and_skips_footer() {
    let report_title = "Restrictions on Land Ownership by Foreigners in Selected Jurisdictions";
    let table_rows = [
        vec![
            "Jurisdiction",
            "GATS XVII Reservation (1994)",
            "Foreign Ownership Permitted",
            "Restrictions on Foreign Ownership",
            "Foreign Ownership Reporting Requirements",
        ],
        vec![
            "",
            "",
            "",
            "right required to acquire desert lands and continue the prior page",
            "",
        ],
        vec![
            "Finland",
            "N",
            "Y",
            "Prior approval from the Government of Aland may be required.",
            "",
        ],
        vec!["France", "N", "Y", "None.", ""],
    ];
    let column_slots = [
        (72.0, 150.0),
        (150.0, 235.0),
        (235.0, 330.0),
        (330.0, 500.0),
        (500.0, 560.0),
    ];

    let mut document = PdfDocument::new("single-table-report.pdf".to_string());
    document.number_of_pages = 1;
    document
        .kids
        .push(make_paragraph_at(140.0, 674.0, 474.0, 688.0, report_title));
    document
        .kids
        .push(make_n_column_table(&table_rows, &column_slots));
    document.kids.push(make_paragraph_at(
        350.0,
        36.0,
        548.0,
        48.0,
        "The Law Library of Congress 7",
    ));

    let rendered = to_markdown(&document).unwrap();

    assert!(rendered.starts_with(
        "# Restrictions on Land Ownership by Foreigners in Selected Jurisdictions"
    ));
    for dropped in [
        "right required to acquire desert lands",
        "The Law Library of Congress 7",
    ] {
        assert!(!rendered.contains(dropped));
    }
    assert!(rendered.contains(
        "| Finland | N | Y | Prior approval from the Government of Aland may be required. | |"
    ));
}
13532
/// When a paragraph ends in "Eco-" and the next short paragraph reads
/// "Circle's Competence Framework:" directly before a one-column table, that
/// second paragraph must be rendered as a level-one heading.
#[test]
fn test_hyphenated_table_title_continuation_renders_as_heading() {
    let table_rows = [
        vec!["Eco-Circle Competence Framework"],
        vec!["#1: The 3 Rs: Recycle-Reuse-Reduce"],
        vec!["#2: Lifecycle of Circular Economy"],
    ];
    let column_slots = [(140.0, 460.0)];

    let mut document = PdfDocument::new("hyphenated-table-title.pdf".to_string());
    document.number_of_pages = 1;
    for (x0, y0, x1, y1, text) in [
        (
            72.0,
            724.0,
            520.0,
            738.0,
            "With this in mind, here we have the 7 key competence areas selected to form a part of Eco-",
        ),
        (72.0, 704.0, 260.0, 718.0, "Circle's Competence Framework:"),
    ] {
        document.kids.push(make_paragraph_at(x0, y0, x1, y1, text));
    }
    document
        .kids
        .push(make_n_column_table(&table_rows, &column_slots));

    let md = to_markdown(&document).unwrap();

    let has_heading = md.contains("# Circle's Competence Framework:");
    assert!(has_heading, "{md}");
}
13563
/// A heading element whose text repeats the column labels of an earlier table
/// must still appear in the output, but not as a `#` heading.
#[test]
fn test_duplicate_table_header_heading_is_demoted() {
    let repeated_header = "Saccharometer DI Water Glucose Solution Yeast Suspension";
    let table_rows = [
        vec![
            "",
            "Saccharometer",
            "DI Water",
            "Glucose Solution",
            "Yeast Suspension",
        ],
        vec!["1", "", "8 ml", "6 ml", "0 ml"],
        vec!["2", "", "12 ml", "0 ml", "2 ml"],
        vec!["3", "", "6 ml", "6 ml", "2 ml"],
    ];
    let column_slots = [
        (72.0, 110.0),
        (110.0, 210.0),
        (210.0, 300.0),
        (300.0, 430.0),
        (430.0, 540.0),
    ];

    let mut document = PdfDocument::new("duplicate-table-header-heading.pdf".to_string());
    document.number_of_pages = 1;
    document
        .kids
        .push(make_heading("MOHAVE COMMUNITY COLLEGE BIO181"));
    document
        .kids
        .push(make_n_column_table(&table_rows, &column_slots));
    document
        .kids
        .push(make_heading_at(72.0, 92.0, 390.0, 108.0, repeated_header));
    for (x0, y0, x1, y1, text) in [
        (72.0, 72.0, 120.0, 88.0, "below"),
        (72.0, 56.0, 240.0, 72.0, "1 16 ml 12 ml"),
        (296.0, 56.0, 340.0, 72.0, "0 ml"),
    ] {
        document.kids.push(make_paragraph_at(x0, y0, x1, y1, text));
    }

    let md = to_markdown(&document).unwrap();

    let text_present = md.contains("Saccharometer DI Water Glucose Solution Yeast Suspension");
    let rendered_as_heading =
        md.contains("# Saccharometer DI Water Glucose Solution Yeast Suspension");
    assert!(text_present, "{md}");
    assert!(!rendered_as_heading, "{md}");
}
13615
/// Free-standing single-chunk lines placed before a three-column table must be
/// folded into it: the first group becomes a "Pack | OCR | ..." header row, the
/// second group becomes a table row, and the stray "models" line after the
/// table is appended to the matching cell text.
#[test]
fn test_geometric_panel_headers_are_promoted_into_table() {
    let mut document = PdfDocument::new("ai-pack-panel.pdf".to_string());

    // (text, chunk start, chunk end, line arg 1, line arg 2) for the lines before the table.
    for (text, start, end, line_a, line_b) in [
        ("OCR", 220.0, 250.0, 720.0, 732.0),
        ("Recommendation", 430.0, 540.0, 720.0, 732.0),
        ("Product semantic search", 660.0, 860.0, 720.0, 732.0),
        ("Pack", 72.0, 110.0, 684.0, 696.0),
        (
            "A solution that recognizes characters",
            140.0,
            340.0,
            684.0,
            696.0,
        ),
        (
            "A solution that recommends the best products",
            390.0,
            620.0,
            684.0,
            696.0,
        ),
        (
            "A solution that enables semantic search",
            650.0,
            900.0,
            684.0,
            696.0,
        ),
    ] {
        document.kids.push(make_chunked_paragraph_line(
            &[(text, start, end)],
            line_a,
            line_b,
        ));
    }

    let table_rows = [
        vec![
            "Achieved 1st place in the OCR World Competition",
            "Team with specialists and technologies",
            "Creation of the first natural language evaluation",
        ],
        vec![
            "The team includes specialists who have",
            "received Kaggle's Gold Medal recommendation",
            "system in Korean (KLUE)",
        ],
        vec![
            "presented 14 papers in renowned AI conferences",
            "top-tier recommendation",
            "Shopee subject",
        ],
    ];
    let column_slots = [(120.0, 360.0), (360.0, 630.0), (630.0, 910.0)];
    document
        .kids
        .push(make_n_column_table(&table_rows, &column_slots));

    // Trailing single-word line pushed after the table.
    document.kids.push(make_chunked_paragraph_line(
        &[("models", 430.0, 490.0)],
        552.0,
        564.0,
    ));

    let rendered = to_markdown(&document).unwrap();

    for expected in [
        "| Pack | OCR | Recommendation | Product semantic search |",
        "| A solution that recognizes characters | A solution that recommends the best products | A solution that enables semantic search |",
        "received Kaggle's Gold Medal recommendation top-tier recommendation models",
    ] {
        assert!(rendered.contains(expected));
    }
}
13687
/// Three free-standing label lines followed by a four-column table whose first
/// column holds "Pack" / "Application" / "Highlight" must yield a
/// "Pack | OCR | Recommendation | Product semantic search" header row, keep the
/// remaining rows, and not leave the labels behind as separate paragraphs.
#[test]
fn test_embedded_stub_header_is_promoted_from_first_table_column() {
    let mut document = PdfDocument::new("embedded-stub-header.pdf".to_string());

    for (label, start, end) in [
        ("OCR", 220.0, 250.0),
        ("Recommendation", 430.0, 540.0),
        ("Product semantic search", 660.0, 860.0),
    ] {
        document.kids.push(make_chunked_paragraph_line(
            &[(label, start, end)],
            720.0,
            732.0,
        ));
    }

    let table_rows = [
        vec![
            "Pack",
            "A solution that recognizes characters in an image and extracts necessary information",
            "A solution that recommends the best products and contents",
            "A solution that enables semantic search and organizes key information",
        ],
        vec![
            "Application",
            "Applicable to all fields that require text extraction",
            "Applicable to all fields that use any form of recommendation",
            "Applicable to all fields that deal with unstructured data",
        ],
        vec![
            "Highlight",
            "Achieved 1st place in the OCR World Competition",
            "Received Kaggle's Gold Medal recommendation",
            "Creation of the first natural language evaluation system in Korean",
        ],
    ];
    let column_slots = [
        (72.0, 120.0),
        (120.0, 360.0),
        (360.0, 630.0),
        (630.0, 910.0),
    ];
    document
        .kids
        .push(make_n_column_table(&table_rows, &column_slots));

    let rendered = to_markdown(&document).unwrap();

    for expected in [
        "| Pack | OCR | Recommendation | Product semantic search |",
        "| Application | Applicable to all fields that require text extraction |",
        "| Highlight | Achieved 1st place in the OCR World Competition |",
    ] {
        assert!(rendered.contains(expected));
    }
    assert!(!rendered.contains("OCR\n\nRecommendation\n\nProduct semantic search"));
}
13743
/// A line with three chunks, split against seven slots, must produce exactly
/// three fragments assigned to slots 0, 1 and 4 with their texts unchanged.
#[test]
fn test_geometric_chunk_alignment_splits_header_line_into_columns() {
    let header_chunks = [
        ("Properties", 72.0, 145.0),
        ("Instruction", 180.0, 255.0),
        ("Alignment", 480.0, 545.0),
    ];
    let column_slots = [
        (72.0, 170.0),
        (170.0, 280.0),
        (280.0, 380.0),
        (380.0, 480.0),
        (480.0, 600.0),
        (600.0, 720.0),
        (720.0, 850.0),
    ];

    let header_line = make_chunked_paragraph_line(&header_chunks, 720.0, 732.0);
    let chunk_lines = extract_chunk_lines(&header_line);
    let fragments = split_line_into_slot_fragments(&chunk_lines[0], &column_slots);

    // Expected (slot index, text) for each fragment, in order.
    let expected = [(0, "Properties"), (1, "Instruction"), (4, "Alignment")];
    assert_eq!(fragments.len(), 3);
    for (idx, &(slot_idx, text)) in expected.iter().enumerate() {
        assert_eq!(fragments[idx].slot_idx, slot_idx);
        assert_eq!(fragments[idx].text, text);
    }
}
13777
/// Two pipe tables separated only by a `#` heading: after merging, the heading
/// text must appear as a pipe row, the `#` marker must be gone, and the content
/// of the second table must survive.
#[test]
fn test_merge_tables_across_heading() {
    let input = concat!(
        "some text\n\n",
        "| Area | Competence |\n",
        "| --- | --- |\n",
        "| Row1 | Val1 |\n",
        "| Row2 | Val2 |\n",
        "\n",
        "# Heading Between\n",
        "\n",
        "| Row3 | Val3 |\n",
        "| --- | --- |\n",
        "\n",
        "more text\n",
    );

    let merged = merge_adjacent_pipe_tables(input);

    let heading_as_row = merged.contains("| Heading Between |");
    assert!(heading_as_row, "Heading should be in pipe row: {}", merged);

    let heading_marker_left = merged.contains("# Heading Between");
    assert!(
        !heading_marker_left,
        "Heading marker should be removed: {}",
        merged
    );

    // "| Row3 |" contains "Row3", so checking the bare token covers both forms.
    let row3_present = merged.contains("Row3");
    assert!(row3_present, "Row3 should exist: {}", merged);
}
13812
/// Two tables with different header rows, separated by a "Table 6: ..." caption
/// line, must both be kept as they are; the caption must stay plain text and
/// must not be turned into a pipe row.
#[test]
fn test_merge_tables_does_not_cross_distinct_headers() {
    let input = concat!(
        "| Model | Score |\n",
        "| --- | --- |\n",
        "| A | 1 |\n",
        "\n",
        "Table 6: Performance comparison amongst the merge candidates.\n",
        "\n",
        "| Model | Method | Score |\n",
        "| --- | --- | --- |\n",
        "| B | Avg | 2 |\n",
    );

    let merged = merge_adjacent_pipe_tables(input);

    for kept in [
        "Table 6: Performance comparison amongst the merge candidates.",
        "| Model | Score |",
        "| Model | Method | Score |",
    ] {
        assert!(merged.contains(kept));
    }
    assert!(
        !merged.contains("| Table 6: Performance comparison amongst the merge candidates. |")
    );
}
13833
/// A figure caption with value and axis numbers run into it, followed by a line
/// of years plus a source note, must become a `##` caption heading, a
/// year/value table, an italic source line, and no trailing
/// "ASEAN Migration Outlook 19" line.
#[test]
fn test_normalize_chart_like_markdown_extracts_series_tables() {
    let input = concat!(
        "Figure 1.7. Non-citizen population in Malaysia (in thousands) 3,323 3,500 3,288 3,230 3,140 2,907 3,000 2,693 2,500 2,000 1,500 1,000 500 0\n\n",
        "2016 2017 2018 2019 2020 2021 Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.\n\n",
        "ASEAN Migration Outlook 19\n",
    );

    let output = normalize_chart_like_markdown(input);

    for expected in [
        "## Figure 1.7. Non-citizen population in Malaysia (in thousands)",
        "| 2016 | 3,323 |",
        "| 2021 | 2,693 |",
        "*Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.*",
    ] {
        assert!(output.contains(expected));
    }
    assert!(!output.contains("ASEAN Migration Outlook 19"));
}
13851
/// A figure caption split across two paragraphs must be joined and promoted to
/// a `##` heading, while the body paragraph after it is kept.
#[test]
fn test_normalize_chart_like_markdown_promotes_structural_captions() {
    let input = concat!(
        "Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or\n\n",
        "The Wonderful Lamp.\n\n",
        "Body paragraph.\n",
    );

    let output = normalize_chart_like_markdown(input);

    let joined_caption =
        "## Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or The Wonderful Lamp";
    assert!(output.contains(joined_caption));
    assert!(output.contains("Body paragraph."));
}
13864
/// A one-row table whose cells pair a value with a year ("126 2014", ...) under
/// a figure caption must be rebuilt as a "Year | <caption title>" table, and
/// the original garbled header row must disappear.
#[test]
fn test_normalize_chart_like_markdown_reconstructs_header_pair_chart_table() {
    let input = concat!(
        "Figure 4.8. Domestic Wood Pellets Production\n\n",
        "| 8 | 800 200 | 126 2014 | 120 2015 | 120 2016 | 127 2017 | 131 2018 | 147 2019 |\n",
        "| --- | --- | --- | --- | --- | --- | --- | --- |\n\n",
        "Source: Forestry Agency, Ministry of Agriculture, Forestry and Fishery (MAFF), 2020.\n",
    );

    let output = normalize_chart_like_markdown(input);

    for expected in [
        "# Figure 4.8. Domestic Wood Pellets Production",
        "| Year | Domestic Wood Pellets Production |",
        "| 2014 | 126 |",
        "| 2019 | 147 |",
    ] {
        assert!(output.contains(expected));
    }
    assert!(!output.contains("| 8 | 800 200 |"));
}
13879
/// A single-cell table made of axis numbers and legend words must be removed,
/// leaving the caption line that follows it in place.
#[test]
fn test_normalize_chart_like_markdown_drops_numeric_axis_artifact_table() {
    let input = concat!(
        "| 31 1 0 2 23 2 2 2 0 5 10 15 20 25 30 35 Event Celebration Information Videograph 2019 2020 |\n",
        "| --- |\n\n",
        "Distribution of Komnas HAM's YouTube Content (2019-2020)\n",
    );

    let output = normalize_chart_like_markdown(input);

    let separator_left = output.contains("| --- |");
    let caption_kept = output.contains("Distribution of Komnas HAM's YouTube Content (2019-2020)");
    assert!(!separator_left);
    assert!(caption_kept);
}
13890
/// A single-cell table that holds only a URL path fragment must be removed,
/// while the footnote line after it is kept.
#[test]
fn test_normalize_chart_like_markdown_drops_url_fragment_table() {
    let input = concat!(
        "## Figure 6 DPN Argentina Content: World Health Day Celebration\n\n",
        "| na/status/1379765916259483648 |\n",
        "| --- |\n\n",
        "98 DPN Argentina, accessed on 5 December 2021.\n",
    );

    let output = normalize_chart_like_markdown(input);

    let fragment_left = output.contains("/status/1379765916259483648 |");
    let footnote_kept = output.contains("98 DPN Argentina, accessed on 5 December 2021.");
    assert!(!fragment_left);
    assert!(footnote_kept);
}
13902
/// A mostly empty eight-column table sitting directly above a "Figure 8.6: ..."
/// caption must be removed, and the caption itself must be kept.
#[test]
fn test_normalize_chart_like_markdown_drops_sparse_table_before_caption() {
    let input = concat!(
        "What’s unique about the growth of Alligator Gars is their fast growth.\n\n",
        "| in | cm | | Length | of | Gar | Fish | Age |\n",
        "| --- | --- | --- | --- | --- | --- | --- | --- |\n",
        "| 120) | 300 | | | | | | |\n",
        "| 100+ | 250 | | | | | | |\n",
        "| 80+ | 200 | | | | | | |\n",
        "| 20. | 50 | G | | | | | Vi |\n",
        "| 0 | 0 | | | | | | |\n",
        "| | 0 | 10 | 30 | | 40 | 50 | 60 |\n\n",
        "Figure 8.6: Growth in length of Alligator Gar in Texas.\n",
    );

    let output = normalize_chart_like_markdown(input);

    let table_left = output.contains("| in | cm |");
    let caption_kept = output.contains("Figure 8.6: Growth in length of Alligator Gar in Texas.");
    assert!(!table_left);
    assert!(caption_kept);
}
13920
/// When the input opens with an eight-column table of eight data rows, the
/// output must still start with that table, and the caption, heading and prose
/// that follow it must all be gone.
#[test]
fn test_normalize_chart_like_markdown_trims_large_top_table_plate() {
    let input = concat!(
        "| A | B | C | D | E | F | G | H |\n",
        "| --- | --- | --- | --- | --- | --- | --- | --- |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\n",
        "Table 2: Evaluation results for SOLAR 10.7B and SOLAR 10.7B-Instruct along with other top-performing models in the paper.\n\n",
        "# 4.2 Main Results\n\n",
        "The surrounding prose should be dropped.\n",
    );

    let output = normalize_chart_like_markdown(input);

    assert!(output.starts_with("| A | B | C | D | E | F | G | H |"));
    for dropped in ["Table 2:", "4.2 Main Results", "surrounding prose"] {
        assert!(!output.contains(dropped));
    }
}
13942 }
13943}