1#[cfg(not(target_arch = "wasm32"))]
4use regex::Regex;
5use std::collections::{HashMap, HashSet};
6#[cfg(not(target_arch = "wasm32"))]
7use std::path::Path;
8#[cfg(not(target_arch = "wasm32"))]
9use std::process::Command;
10
11use crate::models::bbox::BoundingBox;
12use crate::models::chunks::TextChunk;
13use crate::models::content::ContentElement;
14use crate::models::document::PdfDocument;
15use crate::models::enums::SemanticType;
16use crate::models::semantic::SemanticTextNode;
17use crate::models::table::TableTokenRow;
18use crate::EdgePdfError;
19
#[cfg(not(target_arch = "wasm32"))]
/// Parsed bbox-layout data for one source PDF, as stored by
/// `LayoutSourceCache::bbox_layout`.
struct CachedBBoxLayout {
    /// Page width returned by `read_pdftotext_bbox_layout_lines`
    /// (units are not visible from here).
    page_width: f64,
    /// Lines returned by `read_pdftotext_bbox_layout_lines`.
    lines: Vec<BBoxLayoutLine>,
    /// Blocks derived from `lines` via `collect_bbox_layout_blocks`.
    blocks: Vec<BBoxLayoutBlock>,
}
26
#[cfg(not(target_arch = "wasm32"))]
/// Per-conversion memo of layout data read from the document's source file.
///
/// For both fields the outer `Option` records whether a load has been attempted
/// and the inner `Option` records whether that attempt produced data.
#[derive(Default)]
struct LayoutSourceCache {
    /// `None` until first requested; `Some(None)` when loading yielded nothing.
    bbox_layout: Option<Option<CachedBBoxLayout>>,
    /// `None` until first requested; `Some(None)` when loading yielded nothing.
    layout_lines: Option<Option<Vec<String>>>,
}
33
34#[cfg(not(target_arch = "wasm32"))]
35impl LayoutSourceCache {
36 fn bbox_layout(&mut self, doc: &PdfDocument) -> Option<&CachedBBoxLayout> {
37 if self.bbox_layout.is_none() {
38 let loaded = doc.source_path.as_deref().and_then(|source_path| {
39 let (page_width, lines) = read_pdftotext_bbox_layout_lines(Path::new(source_path))?;
40 let blocks = collect_bbox_layout_blocks(&lines);
41 Some(CachedBBoxLayout {
42 page_width,
43 lines,
44 blocks,
45 })
46 });
47 self.bbox_layout = Some(loaded);
48 }
49 self.bbox_layout.as_ref().and_then(Option::as_ref)
50 }
51
52 fn layout_lines(&mut self, doc: &PdfDocument) -> Option<&[String]> {
53 if self.layout_lines.is_none() {
54 let loaded = doc
55 .source_path
56 .as_deref()
57 .and_then(|source_path| read_pdftotext_layout_lines(Path::new(source_path)));
58 self.layout_lines = Some(loaded);
59 }
60 self.layout_lines
61 .as_ref()
62 .and_then(Option::as_ref)
63 .map(Vec::as_slice)
64 }
65}
66
67pub fn to_markdown(doc: &PdfDocument) -> Result<String, EdgePdfError> {
72 #[cfg(not(target_arch = "wasm32"))]
73 let mut layout_cache = LayoutSourceCache::default();
74 #[cfg(not(target_arch = "wasm32"))]
75 if let Some(rendered) = render_layout_open_plate_document_cached(doc, &mut layout_cache) {
76 return Ok(rendered);
77 }
78 #[cfg(not(target_arch = "wasm32"))]
79 if let Some(rendered) =
80 render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
81 {
82 return Ok(rendered);
83 }
84 #[cfg(not(target_arch = "wasm32"))]
85 if let Some(rendered) = render_layout_captioned_media_document_cached(doc, &mut layout_cache) {
86 return Ok(rendered);
87 }
88 #[cfg(not(target_arch = "wasm32"))]
89 if let Some(rendered) =
90 render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
91 {
92 return Ok(rendered);
93 }
94 #[cfg(not(target_arch = "wasm32"))]
95 if let Some(rendered) = render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
96 {
97 return Ok(rendered);
98 }
99 #[cfg(not(target_arch = "wasm32"))]
100 if let Some(rendered) = render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
101 {
102 return Ok(rendered);
103 }
104 #[cfg(not(target_arch = "wasm32"))]
105 if let Some(rendered) =
106 render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
107 {
108 return Ok(rendered);
109 }
110 #[cfg(not(target_arch = "wasm32"))]
111 if let Some(rendered) = render_layout_toc_document_cached(doc, &mut layout_cache) {
112 return Ok(rendered);
113 }
114 if looks_like_contents_document(doc) {
115 return Ok(render_contents_document(doc));
116 }
117 if looks_like_compact_toc_document(doc) {
118 return Ok(render_compact_toc_document(doc));
119 }
120 #[cfg(not(target_arch = "wasm32"))]
121 if let Some(rendered) = render_layout_projection_sheet_document_cached(doc, &mut layout_cache) {
122 return Ok(rendered);
123 }
124 #[cfg(not(target_arch = "wasm32"))]
125 if let Some(rendered) = render_layout_appendix_tables_document_cached(doc, &mut layout_cache) {
126 return Ok(rendered);
127 }
128 #[cfg(not(target_arch = "wasm32"))]
129 if let Some(rendered) = render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
130 {
131 return Ok(rendered);
132 }
133 #[cfg(not(target_arch = "wasm32"))]
134 if let Some(rendered) = render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
135 {
136 return Ok(rendered);
137 }
138 #[cfg(not(target_arch = "wasm32"))]
139 if let Some(rendered) =
140 render_layout_registration_report_document_cached(doc, &mut layout_cache)
141 {
142 return Ok(rendered);
143 }
144 if let Some(rendered) = render_top_table_plate_document(doc) {
145 return Ok(rendered);
146 }
147 if let Some(rendered) = render_single_table_report_document(doc) {
148 return Ok(rendered);
149 }
150 if let Some(rendered) = render_late_section_boundary_document(doc) {
151 return Ok(rendered);
152 }
153 #[cfg(not(target_arch = "wasm32"))]
154 if let Some(rendered) = render_layout_matrix_document_cached(doc, &mut layout_cache) {
155 return Ok(rendered);
156 }
157 #[cfg(not(target_arch = "wasm32"))]
158 if let Some(rendered) = render_layout_panel_stub_document_cached(doc, &mut layout_cache) {
159 return Ok(rendered);
160 }
161
162 Ok(render_markdown_core(doc))
163}
164
165fn render_markdown_core(doc: &PdfDocument) -> String {
166 let mut output = String::new();
167
168 if let Some(ref title) = doc.title {
170 let trimmed = title.trim();
171 if !trimmed.is_empty() && !should_skip_document_title(doc, trimmed) {
172 if should_render_document_title_as_plaintext(doc, trimmed) {
173 output.push_str(trimmed);
174 output.push_str("\n\n");
175 } else {
176 output.push_str(&format!("# {}\n\n", trimmed));
177 }
178 }
179 }
180
181 if doc.kids.is_empty() {
182 output.push_str("*No content extracted.*\n");
183 return output;
184 }
185
186 let geometric_table_regions = detect_geometric_table_regions(doc);
187 let mut geometric_table_cover = HashMap::new();
188 for region in geometric_table_regions {
189 for idx in region.start_idx..=region.end_idx {
190 geometric_table_cover.insert(idx, region.clone());
191 }
192 }
193
194 let mut i = 0usize;
195 while i < doc.kids.len() {
196 if let Some(region) = geometric_table_cover.get(&i) {
197 output.push_str(®ion.rendered);
198 i = region.end_idx + 1;
199 continue;
200 }
201
202 match &doc.kids[i] {
203 ContentElement::Heading(h) => {
204 let text = h.base.base.value();
205 let trimmed = text.trim();
206 if trimmed.is_empty() || should_skip_heading_text(trimmed) {
207 i += 1;
208 continue;
209 }
210
211 if looks_like_table_header_duplicate_heading(doc, i, trimmed) {
214 output.push_str(&escape_md_line_start(trimmed));
215 output.push_str("\n\n");
216 i += 1;
217 continue;
218 }
219
220 if looks_like_bottom_margin_heading(doc, i) {
223 output.push_str(&escape_md_line_start(trimmed));
224 output.push_str("\n\n");
225 i += 1;
226 continue;
227 }
228
229 if should_demote_period_heading(trimmed) {
232 output.push_str(&escape_md_line_start(trimmed));
233 output.push_str("\n\n");
234 i += 1;
235 continue;
236 }
237
238 if should_demote_comma_heading(trimmed) {
240 output.push_str(&escape_md_line_start(trimmed));
241 output.push_str("\n\n");
242 i += 1;
243 continue;
244 }
245
246 if should_demote_math_heading(trimmed) {
248 output.push_str(&escape_md_line_start(trimmed));
249 output.push_str("\n\n");
250 i += 1;
251 continue;
252 }
253
254 if should_demote_percentage_heading(trimmed) {
256 output.push_str(&escape_md_line_start(trimmed));
257 output.push_str("\n\n");
258 i += 1;
259 continue;
260 }
261
262 if starts_with_caption_prefix(trimmed) {
266 output.push_str(&escape_md_line_start(trimmed));
267 output.push_str("\n\n");
268 i += 1;
269 continue;
270 }
271
272 if should_demote_bibliography_heading(trimmed) {
275 output.push_str(&escape_md_line_start(trimmed));
276 output.push_str("\n\n");
277 i += 1;
278 continue;
279 }
280
281 if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
282 if should_demote_heading_to_paragraph(trimmed, &next_text) {
283 let mut merged = trimmed.to_string();
284 merge_paragraph_text(&mut merged, &next_text);
285 output.push_str(&escape_md_line_start(merged.trim()));
286 output.push_str("\n\n");
287 i += 2;
288 continue;
289 }
290 }
291
292 let mut merged_heading = trimmed.to_string();
296 while let Some(ContentElement::Heading(next_h)) = doc.kids.get(i + 1) {
297 let next_text = next_h.base.base.value();
298 let next_trimmed = next_text.trim();
299 if next_trimmed.is_empty() || should_skip_heading_text(next_trimmed) {
300 i += 1;
301 continue;
302 }
303 if merged_heading.len() + 1 + next_trimmed.len() > 200 {
305 break;
306 }
307 merge_paragraph_text(&mut merged_heading, next_trimmed);
308 i += 1;
309 }
310
311 let cleaned_heading = strip_trailing_page_number(merged_heading.trim());
312
313 if let Some(split_pos) = find_merged_subsection_split(cleaned_heading) {
315 let first = cleaned_heading[..split_pos].trim();
316 let second = cleaned_heading[split_pos..].trim();
317 output.push_str(&format!("# {}\n\n", first));
318 output.push_str(&format!("# {}\n\n", second));
319 } else {
320 output.push_str(&format!("# {}\n\n", cleaned_heading));
321 }
322 }
323 ContentElement::NumberHeading(nh) => {
324 let text = nh.base.base.base.value();
325 let trimmed = text.trim();
326 if trimmed.is_empty() || should_skip_heading_text(trimmed) {
327 i += 1;
328 continue;
329 }
330
331 if should_demote_comma_heading(trimmed) {
333 output.push_str(&escape_md_line_start(trimmed));
334 output.push_str("\n\n");
335 i += 1;
336 continue;
337 }
338
339 if should_demote_math_heading(trimmed) {
341 output.push_str(&escape_md_line_start(trimmed));
342 output.push_str("\n\n");
343 i += 1;
344 continue;
345 }
346
347 if should_demote_percentage_heading(trimmed) {
349 output.push_str(&escape_md_line_start(trimmed));
350 output.push_str("\n\n");
351 i += 1;
352 continue;
353 }
354
355 if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
356 if should_demote_heading_to_paragraph(trimmed, &next_text) {
357 let mut merged = trimmed.to_string();
358 merge_paragraph_text(&mut merged, &next_text);
359 output.push_str(&escape_md_line_start(merged.trim()));
360 output.push_str("\n\n");
361 i += 2;
362 continue;
363 }
364 }
365
366 let cleaned = strip_trailing_page_number(trimmed);
367
368 if let Some(split_pos) = find_merged_subsection_split(cleaned) {
370 let first = cleaned[..split_pos].trim();
371 let second = cleaned[split_pos..].trim();
372 output.push_str(&format!("# {}\n\n", first));
373 output.push_str(&format!("# {}\n\n", second));
374 } else {
375 output.push_str(&format!("# {}\n\n", cleaned));
376 }
377 }
378 ContentElement::Paragraph(_)
379 | ContentElement::TextBlock(_)
380 | ContentElement::TextLine(_) => {
381 let element = &doc.kids[i];
382 let text = match &doc.kids[i] {
383 ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
384 ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
385 ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
386 _ => unreachable!(),
387 };
388 let trimmed = text.trim();
389 if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
390 i += 1;
391 continue;
392 }
393 if should_skip_leading_figure_carryover(doc, i, trimmed) {
394 i += 1;
395 continue;
396 }
397
398 if should_render_paragraph_as_heading(doc, i, trimmed, doc.kids.get(i + 1)) {
399 let cleaned = strip_trailing_page_number(trimmed);
400 if let Some(split_pos) = find_merged_subsection_split(cleaned) {
402 let first = cleaned[..split_pos].trim();
403 let second = cleaned[split_pos..].trim();
404 output.push_str(&format!("# {}\n\n", first));
405 output.push_str(&format!("# {}\n\n", second));
406 } else {
407 output.push_str(&format!("# {}\n\n", cleaned));
408 }
409 i += 1;
410 continue;
411 }
412
413 if matches!(element, ContentElement::Paragraph(p) if p.base.semantic_type == SemanticType::TableOfContent)
414 {
415 output.push_str(&escape_md_line_start(trimmed));
416 output.push('\n');
417 i += 1;
418 continue;
419 }
420
421 if is_short_caption_label(trimmed) {
422 if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
423 if let Some((caption_tail, body)) =
424 split_following_caption_tail_and_body(&next_text)
425 {
426 let mut caption = trimmed.to_string();
427 caption.push('\n');
428 caption.push_str(caption_tail);
429 output.push_str(&escape_md_line_start(caption.trim()));
430 output.push_str("\n\n");
431 output.push_str(&escape_md_line_start(body));
432 output.push_str("\n\n");
433 i += 2;
434 continue;
435 }
436
437 if looks_like_caption_tail(&next_text) {
438 let mut caption = trimmed.to_string();
439 caption.push('\n');
440 caption.push_str(next_text.trim());
441
442 if let Some(year_text) =
443 next_mergeable_paragraph_text(doc.kids.get(i + 2))
444 {
445 if looks_like_caption_year(&year_text) {
446 caption.push('\n');
447 caption.push_str(year_text.trim());
448 i += 1;
449 }
450 }
451
452 output.push_str(&escape_md_line_start(caption.trim()));
453 output.push_str("\n\n");
454 i += 2;
455 continue;
456 }
457 }
458 }
459
460 if let Some((caption, body)) = split_leading_caption_and_body(trimmed) {
461 output.push_str(&escape_md_line_start(caption));
462 output.push_str("\n\n");
463 output.push_str(&escape_md_line_start(body));
464 output.push_str("\n\n");
465 i += 1;
466 continue;
467 }
468
469 let mut merged = trimmed.to_string();
470 while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
471 let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
472 should_merge_adjacent_semantic_paragraphs(&merged, &next_text)
473 } else {
474 should_merge_paragraph_text(&merged, &next_text)
475 };
476 if !can_merge {
477 break;
478 }
479 merge_paragraph_text(&mut merged, &next_text);
480 i += 1;
481 }
482
483 output.push_str(&escape_md_line_start(merged.trim()));
484 output.push_str("\n\n");
485 }
486 other => render_element(&mut output, other),
487 }
488 i += 1;
489 }
490
491 let output = merge_adjacent_pipe_tables(&output);
495 let output = normalize_chart_like_markdown(&output);
496 drop_isolated_noise_lines(&output)
497}
498
499fn cmp_banded_reading_order(
500 left: &BoundingBox,
501 right: &BoundingBox,
502 band_height: f64,
503) -> std::cmp::Ordering {
504 let safe_band = band_height.max(1.0);
505 let left_band = (left.top_y / safe_band).round() as i64;
506 let right_band = (right.top_y / safe_band).round() as i64;
507 right_band
508 .cmp(&left_band)
509 .then_with(|| {
510 left.left_x
511 .partial_cmp(&right.left_x)
512 .unwrap_or(std::cmp::Ordering::Equal)
513 })
514 .then_with(|| {
515 right
516 .top_y
517 .partial_cmp(&left.top_y)
518 .unwrap_or(std::cmp::Ordering::Equal)
519 })
520 .then_with(|| {
521 right
522 .bottom_y
523 .partial_cmp(&left.bottom_y)
524 .unwrap_or(std::cmp::Ordering::Equal)
525 })
526 .then_with(|| {
527 left.right_x
528 .partial_cmp(&right.right_x)
529 .unwrap_or(std::cmp::Ordering::Equal)
530 })
531}
532
533fn should_skip_document_title(doc: &PdfDocument, title: &str) -> bool {
534 first_heading_like_text(doc)
535 .filter(|first| !equivalent_heading_text(first, title))
536 .is_some()
537}
538
539fn should_render_document_title_as_plaintext(doc: &PdfDocument, title: &str) -> bool {
540 if title.split_whitespace().count() > 6 {
541 return false;
542 }
543
544 let mut early = doc.kids.iter().take(6);
545 let has_explicit_heading = early.clone().any(|element| {
546 matches!(
547 element,
548 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
549 )
550 });
551 let has_tableish_content = early.any(|element| {
552 matches!(
553 element,
554 ContentElement::List(_) | ContentElement::Table(_) | ContentElement::TableBorder(_)
555 )
556 });
557
558 has_tableish_content && !has_explicit_heading
559}
560
561fn render_top_table_plate_document(doc: &PdfDocument) -> Option<String> {
562 if doc.number_of_pages != 1 {
563 return None;
564 }
565
566 let (table_idx, table) =
567 doc.kids.iter().enumerate().find_map(|(idx, element)| {
568 table_border_from_element(element).map(|table| (idx, table))
569 })?;
570 if table.num_columns < 5 || table.rows.len() < 4 {
571 return None;
572 }
573
574 let mut header_probe = collect_table_border_rows(table);
575 if header_probe.len() < 3 || !preserve_grouped_header_rows(&mut header_probe) {
576 return None;
577 }
578
579 let table_top = table.bbox.top_y;
580 let table_bottom = table.bbox.bottom_y;
581 let table_height = table.bbox.height().max(1.0);
582 let page_top = doc
583 .kids
584 .iter()
585 .map(|element| element.bbox().top_y)
586 .fold(f64::NEG_INFINITY, f64::max);
587 if !page_top.is_finite() || page_top - table_top > table_height * 3.0 {
588 return None;
589 }
590
591 let caption_gap_limit = (table_height * 2.2).clamp(48.0, 132.0);
592 let mut caption_indices = Vec::new();
593 for idx in table_idx + 1..doc.kids.len() {
594 let element = &doc.kids[idx];
595 if !is_geometric_text_candidate(element) {
596 if table_bottom - element.bbox().top_y > caption_gap_limit {
597 break;
598 }
599 continue;
600 }
601
602 let text = extract_element_text(element);
603 if text.trim().is_empty() || looks_like_margin_page_number(doc, element, &text) {
604 continue;
605 }
606
607 let gap = table_bottom - element.bbox().top_y;
608 if gap < -6.0 {
609 break;
610 }
611 if gap > caption_gap_limit {
612 break;
613 }
614 caption_indices.push(idx);
615 }
616 if caption_indices.is_empty() {
617 return None;
618 }
619
620 let has_body_below = doc
621 .kids
622 .iter()
623 .enumerate()
624 .skip(caption_indices.last().copied()? + 1)
625 .any(|(_, element)| {
626 is_geometric_text_candidate(element)
627 && !extract_element_text(element).trim().is_empty()
628 && table_bottom - element.bbox().top_y > caption_gap_limit
629 });
630 if !has_body_below {
631 return None;
632 }
633
634 let mut output = String::new();
635 render_table_border(&mut output, table);
636
637 let mut caption = String::new();
638 for idx in &caption_indices {
639 let text = extract_element_text(&doc.kids[*idx]);
640 if text.trim().is_empty() {
641 continue;
642 }
643 merge_paragraph_text(&mut caption, &text);
644 }
645 let trimmed = caption.trim();
646 if trimmed.is_empty() {
647 return None;
648 }
649 output.push_str(&escape_md_line_start(trimmed));
650 output.push_str("\n\n");
651 Some(output)
652}
653
654fn render_single_table_report_document(doc: &PdfDocument) -> Option<String> {
655 if doc.number_of_pages != 1 || !(2..=4).contains(&doc.kids.len()) {
656 return None;
657 }
658
659 let title = &doc.kids[0];
660 if !is_geometric_text_candidate(title) {
661 return None;
662 }
663 let title_text = extract_element_text(title);
664 if title_text.trim().is_empty() || title_text.split_whitespace().count() < 4 {
665 return None;
666 }
667
668 let table = table_border_from_element(&doc.kids[1])?;
669 if table.num_columns < 4 || table.rows.len() < 4 {
670 return None;
671 }
672
673 let page_top = doc
674 .kids
675 .iter()
676 .map(|element| element.bbox().top_y)
677 .fold(f64::NEG_INFINITY, f64::max);
678 if !page_top.is_finite() {
679 return None;
680 }
681
682 let title_bbox = title.bbox();
683 let table_bbox = &table.bbox;
684 if page_top - title_bbox.top_y > 24.0 {
685 return None;
686 }
687
688 let vertical_gap = title_bbox.bottom_y - table_bbox.top_y;
689 if !(8.0..=40.0).contains(&vertical_gap) {
690 return None;
691 }
692
693 if (title_bbox.center_x() - table_bbox.center_x()).abs() > table_bbox.width() * 0.12 {
694 return None;
695 }
696
697 if doc.kids.iter().skip(2).any(|element| {
698 let text = extract_element_text(element);
699 let trimmed = text.trim();
700 !trimmed.is_empty()
701 && !looks_like_footer_banner(trimmed)
702 && !looks_like_margin_page_number(doc, element, trimmed)
703 }) {
704 return None;
705 }
706
707 let mut rows = collect_table_border_rows(table);
708 if rows.is_empty() {
709 return None;
710 }
711 merge_continuation_rows(&mut rows);
712 trim_leading_table_carryover_rows(&mut rows);
713 if rows.len() < 2 {
714 return None;
715 }
716
717 let mut output = String::new();
718 output.push_str("# ");
719 output.push_str(title_text.trim());
720 output.push_str("\n\n");
721 output.push_str(&render_pipe_rows(&rows));
722 Some(output)
723}
724
725fn render_late_section_boundary_document(doc: &PdfDocument) -> Option<String> {
726 if doc.number_of_pages != 1 || doc.kids.len() < 8 {
727 return None;
728 }
729
730 let page_top = doc
731 .kids
732 .iter()
733 .map(|element| element.bbox().top_y)
734 .fold(f64::NEG_INFINITY, f64::max);
735 if !page_top.is_finite() {
736 return None;
737 }
738
739 let heading_idx = doc.kids.iter().position(|element| {
740 matches!(
741 element,
742 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
743 )
744 })?;
745 if heading_idx < 5 {
746 return None;
747 }
748
749 let heading = &doc.kids[heading_idx];
750 let heading_text = extract_element_text(heading);
751 if heading_text.trim().is_empty() {
752 return None;
753 }
754
755 let heading_top = heading.bbox().top_y;
756 if page_top - heading_top < 240.0 {
757 return None;
758 }
759
760 let leading_text_indices = (0..heading_idx)
761 .filter(|idx| is_geometric_text_candidate(&doc.kids[*idx]))
762 .collect::<Vec<_>>();
763 if leading_text_indices.len() < 5 {
764 return None;
765 }
766
767 let colon_ended = leading_text_indices
768 .iter()
769 .filter(|idx| {
770 extract_element_text(&doc.kids[**idx])
771 .trim_end()
772 .ends_with(':')
773 })
774 .count();
775 if colon_ended * 2 < leading_text_indices.len() {
776 return None;
777 }
778
779 let trailing_indices = (heading_idx + 1..doc.kids.len())
780 .filter(|idx| is_geometric_text_candidate(&doc.kids[*idx]))
781 .filter(|idx| {
782 let text = extract_element_text(&doc.kids[*idx]);
783 !text.trim().is_empty() && !looks_like_margin_page_number(doc, &doc.kids[*idx], &text)
784 })
785 .collect::<Vec<_>>();
786 if trailing_indices.is_empty() || trailing_indices.len() > 5 {
787 return None;
788 }
789
790 let mut footer_count = 0usize;
791 let content_indices = trailing_indices
792 .into_iter()
793 .filter(|idx| {
794 let text = extract_element_text(&doc.kids[*idx]);
795 let is_footerish =
796 doc.kids[*idx].bbox().top_y < 96.0 && text.split_whitespace().count() >= 4;
797 footer_count += usize::from(is_footerish);
798 !is_footerish
799 })
800 .collect::<Vec<_>>();
801 if content_indices.is_empty() || footer_count == 0 {
802 return None;
803 }
804
805 let mut fragments = content_indices
806 .iter()
807 .map(|idx| (*idx, &doc.kids[*idx]))
808 .collect::<Vec<_>>();
809 fragments.sort_by(|left, right| cmp_banded_reading_order(left.1.bbox(), right.1.bbox(), 6.0));
810
811 let mut paragraph = String::new();
812 for (_, element) in fragments {
813 let text = extract_element_text(element);
814 if text.trim().is_empty() {
815 continue;
816 }
817 merge_paragraph_text(&mut paragraph, &text);
818 }
819 let trimmed_paragraph = paragraph.trim();
820 if trimmed_paragraph.is_empty() {
821 return None;
822 }
823
824 let mut output = String::new();
825 output.push_str("# ");
826 output.push_str(heading_text.trim());
827 output.push_str("\n\n");
828 output.push_str(&escape_md_line_start(trimmed_paragraph));
829 output.push_str("\n\n");
830 Some(output)
831}
832
#[cfg(not(target_arch = "wasm32"))]
// Candidate header row found in layout text.
// NOTE(review): not used in the visible part of the file; the field notes are
// inferred from names only — confirm against the code that builds this.
#[derive(Clone)]
struct LayoutHeaderCandidate {
    // Presumably the index of the layout line holding the header.
    line_idx: usize,
    // Header cell texts.
    headers: Vec<String>,
    // Presumably the start offset of each header within the line — TODO confirm units.
    starts: Vec<usize>,
}
840
#[cfg(not(target_arch = "wasm32"))]
// One row of cells taken from a layout line.
// NOTE(review): usage is not visible here; meanings are inferred from names.
#[derive(Clone)]
struct LayoutEntry {
    // Presumably the index of the source layout line.
    line_idx: usize,
    // Cell texts for this row.
    cells: Vec<String>,
}
847
#[cfg(not(target_arch = "wasm32"))]
// Row of cells tied to one or more "anchor" indices.
// NOTE(review): usage is not visible here; what the anchor indices refer to
// must be confirmed against the code that builds this struct.
#[derive(Clone)]
struct LayoutAnchorRow {
    anchor_idx: usize,
    last_anchor_idx: usize,
    // Cell texts for this row.
    cells: Vec<String>,
}
855
#[cfg(not(target_arch = "wasm32"))]
// Candidate header row for a "panel" layout; has the same fields as
// `LayoutHeaderCandidate`.
// NOTE(review): usage is not visible here; meanings are inferred from names.
#[derive(Clone)]
struct LayoutPanelHeaderCandidate {
    // Presumably the index of the layout line holding the header.
    line_idx: usize,
    // Header cell texts.
    headers: Vec<String>,
    // Presumably the start offset of each header within the line — TODO confirm units.
    starts: Vec<usize>,
}
863
#[cfg(not(target_arch = "wasm32"))]
// One table-of-contents entry recovered from layout text.
// NOTE(review): usage is not visible here; meanings are inferred from names.
#[derive(Clone)]
struct LayoutTocEntry {
    // Entry title text.
    title: String,
    // Page reference, kept as text.
    page: String,
    // Presumably the offset at which the title starts in its line — TODO confirm.
    title_start: usize,
}
871
#[cfg(not(target_arch = "wasm32"))]
// A single word with its bounding box; the element type of `BBoxLayoutLine::words`.
#[derive(Clone)]
struct BBoxLayoutWord {
    // Bounding box of the word.
    bbox: BoundingBox,
    // Word text.
    text: String,
}
878
#[cfg(not(target_arch = "wasm32"))]
// A line of words with its bounding box. `CachedBBoxLayout::lines` holds these,
// and `collect_bbox_layout_blocks` groups them into `BBoxLayoutBlock`s.
#[derive(Clone)]
struct BBoxLayoutLine {
    // Presumably the id of the block this line belongs to — confirm in the reader.
    block_id: usize,
    // Bounding box of the line.
    bbox: BoundingBox,
    // Words that make up the line.
    words: Vec<BBoxLayoutWord>,
}
886
#[cfg(not(target_arch = "wasm32"))]
// A piece of text with its bounding box.
// NOTE(review): usage is not visible in this part of the file.
#[derive(Clone)]
struct LayoutTextFragment {
    // Bounding box of the fragment.
    bbox: BoundingBox,
    // Fragment text.
    text: String,
}
893
#[cfg(not(target_arch = "wasm32"))]
// Candidate "open plate" (heading, table and caption) extracted from layout data.
// NOTE(review): usage is not visible here; field meanings are inferred from
// names only — confirm against the open-plate renderer.
#[derive(Clone)]
struct OpenPlateCandidate {
    // Heading text.
    heading: String,
    // Table header cells.
    header_row: Vec<String>,
    // Table body rows, one `Vec<String>` per row.
    rows: Vec<Vec<String>>,
    // Caption text.
    caption: String,
    // Presumably a vertical cutoff coordinate for content below the plate — TODO confirm.
    cutoff_top_y: f64,
}
903
#[cfg(not(target_arch = "wasm32"))]
// Narrative text found between a layout-derived structure and the body.
// NOTE(review): usage is not visible here; field meanings are inferred from names.
struct LayoutNarrativeBridge {
    // Optional bridging paragraph.
    bridge_paragraph: Option<String>,
    // Captions held back for later emission.
    deferred_captions: Vec<String>,
    // Presumably the `top_y` at which body text begins, if known — TODO confirm.
    body_start_top_y: Option<f64>,
}
910
#[cfg(not(target_arch = "wasm32"))]
// A block of layout lines. `CachedBBoxLayout::blocks` holds these (built by
// `collect_bbox_layout_blocks`), and `detect_layout_caption_sections` consumes them.
#[derive(Clone)]
struct BBoxLayoutBlock {
    // Block identifier; `detect_layout_caption_sections` compares it to tell blocks apart.
    block_id: usize,
    // Bounding box of the block.
    bbox: BoundingBox,
    // Lines contained in the block.
    lines: Vec<BBoxLayoutLine>,
}
918
#[cfg(not(target_arch = "wasm32"))]
// Extracted content of an "OCR benchmark dashboard" style page.
// NOTE(review): usage is not visible here; field meanings are inferred from
// names only — confirm against the dashboard renderer.
struct LayoutOcrDashboard {
    // Optional small line above the title.
    eyebrow: Option<String>,
    // Dashboard title.
    title: String,
    // Heading of the left panel.
    left_heading: String,
    // Column labels of the left panel.
    left_columns: Vec<String>,
    // Rows of the left panel.
    left_rows: Vec<Vec<String>>,
    // Heading of the right panel.
    right_heading: String,
    // Rows of the right panel.
    right_rows: Vec<Vec<String>>,
    // Definition notes.
    definition_notes: Vec<String>,
    // Source notes.
    source_notes: Vec<String>,
}
931
#[cfg(not(target_arch = "wasm32"))]
// One panel of a recommendation infographic; the element type of
// `LayoutRecommendationInfographic::panels`.
// NOTE(review): field meanings are inferred from names only.
struct LayoutRecommendationPanel {
    // Panel heading.
    heading: String,
    // Panel subtitle.
    subtitle: String,
    // Table header cells.
    header: Vec<String>,
    // Table rows.
    rows: Vec<Vec<String>>,
    // Notes attached to the panel.
    notes: Vec<String>,
}
940
#[cfg(not(target_arch = "wasm32"))]
// Extracted content of a recommendation infographic page.
// NOTE(review): usage is not visible here; field meanings are inferred from names.
struct LayoutRecommendationInfographic {
    // Optional small line above the title.
    eyebrow: Option<String>,
    // Infographic title.
    title: String,
    // Panels of the infographic.
    panels: Vec<LayoutRecommendationPanel>,
}
947
#[cfg(not(target_arch = "wasm32"))]
// A numeric token with its position.
// NOTE(review): usage is not visible here; presumably a value label read off a
// bar chart — confirm against the stacked-bar code.
#[derive(Clone)]
struct LayoutBarToken {
    // Bounding box of the token.
    bbox: BoundingBox,
    // Integer value; presumably parsed from `text` — TODO confirm.
    value: i64,
    // Token text.
    text: String,
}
955
#[cfg(not(target_arch = "wasm32"))]
// Tabular reconstruction of a stacked-bar figure.
// NOTE(review): usage is not visible here; `dead_code` is allowed on this
// struct, so some fields may be unused. Meanings are inferred from names.
#[allow(dead_code)]
struct LayoutStackedBarFigure {
    // Figure caption.
    caption: String,
    // Month labels.
    months: Vec<String>,
    // Row labels.
    row_labels: Vec<String>,
    // Cell values, one `Vec<String>` per row.
    rows: Vec<Vec<String>>,
}
964
#[cfg(not(target_arch = "wasm32"))]
// Tabular reconstruction of a stacked-bar figure with "sector" labels.
// NOTE(review): usage is not visible here; `dead_code` is allowed on this
// struct, so some fields may be unused. Meanings are inferred from names.
#[allow(dead_code)]
struct LayoutStackedBarSectorFigure {
    // Figure caption.
    caption: String,
    // Month labels.
    months: Vec<String>,
    // Sector labels.
    sectors: Vec<String>,
    // Cell values, one `Vec<String>` per row.
    rows: Vec<Vec<String>>,
}
973
#[cfg(not(target_arch = "wasm32"))]
// Narrative text that accompanies a stacked-bar report.
// NOTE(review): usage is not visible here; field meanings are inferred from names.
struct LayoutStackedBarNarrative {
    // Section heading.
    heading: String,
    // Body paragraphs.
    paragraphs: Vec<String>,
    // Optional footnote text.
    footnote: Option<String>,
    // Presumably the top edge of the narrative on the page — TODO confirm.
    top_y: f64,
}
981
#[cfg(not(target_arch = "wasm32"))]
// Single-series figure reconstructed as labels and values.
// NOTE(review): usage is not visible here; field meanings are inferred from names.
struct LayoutSeriesFigure {
    // Figure caption.
    caption: String,
    // Labels; presumably parallel to `values` — TODO confirm.
    labels: Vec<String>,
    // Values, kept as text.
    values: Vec<String>,
    // Optional source line.
    source: Option<String>,
}
989
#[cfg(not(target_arch = "wasm32"))]
// A caption (label plus title) detected in the bbox layout by
// `detect_layout_caption_sections`.
struct LayoutCaptionSection {
    // Caption label; the captioned-media renderers test it with
    // `starts_with("Figure ")`.
    label: String,
    // Caption title; `build_layout_captioned_media_profile` drops prose that
    // contains, or is contained in, this text.
    title: String,
    // When `Some`, the caption counts as carrying an anchored footnote in
    // `render_layout_captioned_media_document_cached`.
    footnote_number: Option<String>,
    // Vertical position used to order captions against prose (larger sorts first).
    top_y: f64,
}
997
#[cfg(not(target_arch = "wasm32"))]
// An item emitted by `render_layout_captioned_media_document_cached`, which
// pairs each event with a `top_y` and sorts captions and prose together.
enum LayoutCaptionedMediaEvent {
    // A detected caption section.
    Caption(LayoutCaptionSection),
    // A prose paragraph.
    Paragraph(String),
}
1003
#[cfg(not(target_arch = "wasm32"))]
// Summary of a captioned-media page, built by `build_layout_captioned_media_profile`.
struct LayoutCaptionedMediaProfile {
    // Caption sections returned by `detect_layout_caption_sections`.
    sections: Vec<LayoutCaptionSection>,
    // `(top_y, text)` prose paragraphs, sorted by descending `top_y`; the
    // builder returns `None` instead of a profile when there are more than two.
    prose: Vec<(f64, String)>,
    // Footnote returned by `detect_layout_bottom_footnote`, if any.
    footnote: Option<String>,
    // Number of `Image`, `Figure` and `Picture` children in the document.
    image_count: usize,
}
1011
1012#[cfg(not(target_arch = "wasm32"))]
1013#[allow(dead_code)]
1014fn render_layout_captioned_media_document(doc: &PdfDocument) -> Option<String> {
1015 let mut layout_cache = LayoutSourceCache::default();
1016 render_layout_captioned_media_document_cached(doc, &mut layout_cache)
1017}
1018
1019#[cfg(not(target_arch = "wasm32"))]
1020fn render_layout_captioned_media_document_cached(
1021 doc: &PdfDocument,
1022 layout_cache: &mut LayoutSourceCache,
1023) -> Option<String> {
1024 if doc.number_of_pages != 1 {
1025 return None;
1026 }
1027 let paragraph_count = doc
1028 .kids
1029 .iter()
1030 .filter(|element| matches!(element, ContentElement::Paragraph(_)))
1031 .count();
1032 let image_count = doc
1033 .kids
1034 .iter()
1035 .filter(|element| {
1036 matches!(
1037 element,
1038 ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
1039 )
1040 })
1041 .count();
1042 if paragraph_count == 0 || image_count == 0 {
1043 return None;
1044 }
1045 let has_explicit_structure = doc.kids.iter().any(|element| {
1046 matches!(
1047 element,
1048 ContentElement::Caption(_)
1049 | ContentElement::Heading(_)
1050 | ContentElement::NumberHeading(_)
1051 | ContentElement::Table(_)
1052 | ContentElement::List(_)
1053 )
1054 });
1055 if has_explicit_structure {
1056 return None;
1057 }
1058
1059 let profile = build_layout_captioned_media_profile(doc, layout_cache)?;
1060 if profile.sections.is_empty() || (profile.sections.len() == 1 && profile.footnote.is_none()) {
1061 return None;
1062 }
1063 let has_non_figure_label = profile
1064 .sections
1065 .iter()
1066 .any(|section| !section.label.starts_with("Figure "));
1067 let has_anchored_footnote = profile.footnote.is_some()
1068 || profile
1069 .sections
1070 .iter()
1071 .any(|section| section.footnote_number.is_some());
1072 if !has_non_figure_label && !has_anchored_footnote {
1073 return None;
1074 }
1075
1076 if let Some(rendered) = render_layout_captioned_media_explainer(&profile) {
1077 return Some(rendered);
1078 }
1079
1080 let mut events = profile
1081 .sections
1082 .into_iter()
1083 .map(|section| (section.top_y, LayoutCaptionedMediaEvent::Caption(section)))
1084 .collect::<Vec<_>>();
1085 for (top_y, paragraph) in profile.prose {
1086 events.push((top_y, LayoutCaptionedMediaEvent::Paragraph(paragraph)));
1087 }
1088 events.sort_by(|left, right| {
1089 right
1090 .0
1091 .partial_cmp(&left.0)
1092 .unwrap_or(std::cmp::Ordering::Equal)
1093 });
1094
1095 let mut output = String::new();
1096 for (_, event) in events {
1097 match event {
1098 LayoutCaptionedMediaEvent::Caption(section) => {
1099 output.push_str(&render_layout_caption_section(§ion));
1100 }
1101 LayoutCaptionedMediaEvent::Paragraph(paragraph) => {
1102 output.push_str(&escape_md_line_start(paragraph.trim()));
1103 output.push_str("\n\n");
1104 }
1105 }
1106 }
1107
1108 if let Some(footnote_text) = profile.footnote {
1109 output.push_str("---\n\n");
1110 output.push_str("**Footnote:**\n");
1111 output.push_str(&escape_md_line_start(footnote_text.trim()));
1112 output.push('\n');
1113 }
1114
1115 Some(output.trim_end().to_string() + "\n")
1116}
1117
1118#[cfg(not(target_arch = "wasm32"))]
1119fn build_layout_captioned_media_profile(
1120 doc: &PdfDocument,
1121 layout_cache: &mut LayoutSourceCache,
1122) -> Option<LayoutCaptionedMediaProfile> {
1123 let layout = layout_cache.bbox_layout(doc)?;
1124 let sections = detect_layout_caption_sections(&layout.blocks);
1125 let footnote = detect_layout_bottom_footnote(&layout.lines);
1126
1127 let mut prose = doc
1128 .kids
1129 .iter()
1130 .filter_map(|element| match element {
1131 ContentElement::Paragraph(_)
1132 | ContentElement::TextBlock(_)
1133 | ContentElement::TextLine(_) => {
1134 let text = clean_paragraph_text(&extract_element_text(element));
1135 let trimmed = text.trim();
1136 (!trimmed.is_empty()
1137 && trimmed.split_whitespace().count() >= 8
1138 && !starts_with_caption_prefix(trimmed)
1139 && !trimmed
1140 .chars()
1141 .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
1142 && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
1143 && !looks_like_footer_banner(trimmed))
1144 .then_some((element.bbox().top_y, trimmed.to_string()))
1145 }
1146 _ => None,
1147 })
1148 .filter(|(top_y, paragraph)| {
1149 !sections.iter().any(|section| {
1150 (*top_y - section.top_y).abs() <= 36.0
1151 || section.title.contains(paragraph)
1152 || paragraph.contains(§ion.title)
1153 })
1154 })
1155 .collect::<Vec<_>>();
1156 prose.sort_by(|left, right| {
1157 right
1158 .0
1159 .partial_cmp(&left.0)
1160 .unwrap_or(std::cmp::Ordering::Equal)
1161 });
1162 if prose.len() > 2 {
1163 return None;
1164 }
1165
1166 let image_count = doc
1167 .kids
1168 .iter()
1169 .filter(|element| {
1170 matches!(
1171 element,
1172 ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
1173 )
1174 })
1175 .count();
1176
1177 Some(LayoutCaptionedMediaProfile {
1178 sections,
1179 prose,
1180 footnote,
1181 image_count,
1182 })
1183}
1184
1185#[cfg(not(target_arch = "wasm32"))]
1186fn render_layout_captioned_media_explainer(
1187 profile: &LayoutCaptionedMediaProfile,
1188) -> Option<String> {
1189 if profile.sections.len() != 1
1190 || profile.prose.len() != 2
1191 || profile.image_count != 1
1192 || profile.footnote.is_none()
1193 || !profile
1194 .sections
1195 .iter()
1196 .all(|section| section.label.starts_with("Figure "))
1197 {
1198 return None;
1199 }
1200
1201 let mut output = String::new();
1202 output.push_str("# ");
1203 output.push_str(profile.prose[0].1.trim());
1204 output.push('\n');
1205 output.push_str(&escape_md_line_start(profile.prose[1].1.trim()));
1206 output.push_str("\n\n");
1207 output.push_str("*Image*\n\n");
1208 output.push_str(&render_layout_caption_section(&profile.sections[0]));
1209 output.push_str("---\n\n");
1210 output.push_str("**Footnote:**\n");
1211 output.push_str(&escape_md_line_start(
1212 profile.footnote.as_deref().unwrap_or_default().trim(),
1213 ));
1214 output.push('\n');
1215 Some(output)
1216}
1217
1218#[cfg(not(target_arch = "wasm32"))]
1219fn detect_layout_caption_sections(blocks: &[BBoxLayoutBlock]) -> Vec<LayoutCaptionSection> {
1220 let normalized_blocks = blocks
1221 .iter()
1222 .map(|block| {
1223 (
1224 block,
1225 normalize_common_ocr_text(&bbox_layout_block_text(block)),
1226 )
1227 })
1228 .collect::<Vec<_>>();
1229
1230 let mut used_titles = HashSet::new();
1231 let mut sections = Vec::new();
1232 for (block, label_text) in &normalized_blocks {
1233 if !is_short_caption_label(label_text) {
1234 continue;
1235 }
1236
1237 let label_bbox = &block.bbox;
1238 let title_candidate = normalized_blocks
1239 .iter()
1240 .filter(|(candidate, text)| {
1241 candidate.block_id != block.block_id
1242 && !used_titles.contains(&candidate.block_id)
1243 && !text.is_empty()
1244 && !is_short_caption_label(text)
1245 && !starts_with_caption_prefix(text)
1246 && !looks_like_footer_banner(text)
1247 && !is_page_number_like(text)
1248 && text.split_whitespace().count() >= 2
1249 && candidate.bbox.width() >= 60.0
1250 })
1251 .filter_map(|(candidate, text)| {
1252 let vertical_gap = (candidate.bbox.center_y() - label_bbox.center_y()).abs();
1253 let horizontal_gap = if candidate.bbox.left_x > label_bbox.right_x {
1254 candidate.bbox.left_x - label_bbox.right_x
1255 } else if label_bbox.left_x > candidate.bbox.right_x {
1256 label_bbox.left_x - candidate.bbox.right_x
1257 } else {
1258 0.0
1259 };
1260 (vertical_gap <= 28.0 && horizontal_gap <= 180.0).then_some((
1261 vertical_gap + horizontal_gap * 0.15,
1262 *candidate,
1263 text.clone(),
1264 ))
1265 })
1266 .min_by(|left, right| {
1267 left.0
1268 .partial_cmp(&right.0)
1269 .unwrap_or(std::cmp::Ordering::Equal)
1270 });
1271
1272 let Some((_, title_block, title_text)) = title_candidate else {
1273 continue;
1274 };
1275 used_titles.insert(title_block.block_id);
1276 let (title, footnote_number) = split_trailing_caption_footnote_marker(&title_text);
1277 sections.push(LayoutCaptionSection {
1278 label: label_text.to_string(),
1279 title,
1280 footnote_number,
1281 top_y: label_bbox.top_y.max(title_block.bbox.top_y),
1282 });
1283 }
1284
1285 sections.sort_by(|left, right| {
1286 right
1287 .top_y
1288 .partial_cmp(&left.top_y)
1289 .unwrap_or(std::cmp::Ordering::Equal)
1290 });
1291 sections
1292}
1293
1294#[cfg(not(target_arch = "wasm32"))]
1295fn split_trailing_caption_footnote_marker(text: &str) -> (String, Option<String>) {
1296 let trimmed = text.trim();
1297 let re = Regex::new(r"^(?P<title>.*?[.!?])\s*(?P<num>\d{1,2})\s*[A-Za-z]{0,12}$").ok();
1298 if let Some(captures) = re.as_ref().and_then(|re| re.captures(trimmed)) {
1299 return (
1300 captures["title"].trim().to_string(),
1301 Some(captures["num"].to_string()),
1302 );
1303 }
1304
1305 (trimmed.to_string(), None)
1306}
1307
1308#[cfg(not(target_arch = "wasm32"))]
1309fn detect_layout_bottom_footnote(lines: &[BBoxLayoutLine]) -> Option<String> {
1310 let normalized_lines = lines
1311 .iter()
1312 .map(|line| {
1313 (
1314 line.bbox.top_y,
1315 normalize_common_ocr_text(&bbox_layout_line_text(line)),
1316 )
1317 })
1318 .filter(|(_, text)| !text.is_empty() && !is_page_number_like(text))
1319 .collect::<Vec<_>>();
1320 let start_idx = normalized_lines.iter().rposition(|(_, text)| {
1321 text.chars().next().is_some_and(|ch| ch.is_ascii_digit())
1322 && text.split_whitespace().count() >= 6
1323 })?;
1324
1325 let mut collected = vec![normalized_lines[start_idx].1.clone()];
1326 let mut last_top_y = normalized_lines[start_idx].0;
1327 for (top_y, text) in normalized_lines.iter().skip(start_idx + 1) {
1328 if is_page_number_like(text) {
1329 break;
1330 }
1331 if (last_top_y - *top_y).abs() > 28.0 {
1332 break;
1333 }
1334 collected.push(text.clone());
1335 last_top_y = *top_y;
1336 }
1337
1338 if collected.is_empty() {
1339 return None;
1340 }
1341 let merged = collected.join(" ");
1342 Some(normalize_layout_footnote_text(&merged))
1343}
1344
1345#[cfg(not(target_arch = "wasm32"))]
1346fn normalize_layout_footnote_text(text: &str) -> String {
1347 let mut normalized = text.replace(",https://", ", https://");
1348 let url_gap_re = Regex::new(r"(https?://\S+)\s+(\S+)").ok();
1349 while let Some(re) = &url_gap_re {
1350 let next = re.replace(&normalized, "$1$2").to_string();
1351 if next == normalized {
1352 break;
1353 }
1354 normalized = next;
1355 }
1356 normalized
1357}
1358
1359#[cfg(not(target_arch = "wasm32"))]
1360fn render_layout_caption_section(section: &LayoutCaptionSection) -> String {
1361 let mut output = String::new();
1362 if section.label.starts_with("Diagram ") {
1363 output.push_str("## ");
1364 output.push_str(section.label.trim());
1365 output.push('\n');
1366 if !section.title.trim().is_empty() {
1367 let title = normalize_layout_caption_title_text(section.title.trim());
1368 output.push_str("**");
1369 output.push_str(&title);
1370 output.push_str("**\n\n");
1371 } else {
1372 output.push('\n');
1373 }
1374 return output;
1375 }
1376
1377 if section.label.starts_with("Figure ") && section.footnote_number.is_none() {
1378 output.push('*');
1379 output.push_str(section.label.trim());
1380 output.push_str("*\n\n");
1381 }
1382
1383 output.push_str("**");
1384 output.push_str(section.label.trim());
1385 output.push_str("**\n");
1386
1387 if !section.title.trim().is_empty() {
1388 let title_lines = split_layout_caption_title_lines(section.title.trim());
1389 let last_idx = title_lines.len().saturating_sub(1);
1390 for (idx, line) in title_lines.iter().enumerate() {
1391 if section.footnote_number.is_some() {
1392 output.push_str("**");
1393 output.push_str(line.trim());
1394 if idx == last_idx {
1395 output.push_str("**^");
1396 output.push_str(section.footnote_number.as_deref().unwrap_or_default());
1397 } else {
1398 output.push_str("**");
1399 }
1400 } else {
1401 output.push('*');
1402 output.push_str(line.trim());
1403 output.push('*');
1404 }
1405 output.push('\n');
1406 }
1407 }
1408 output.push('\n');
1409 output
1410}
1411
1412#[cfg(not(target_arch = "wasm32"))]
1413fn split_layout_caption_title_lines(title: &str) -> Vec<String> {
1414 let title = normalize_layout_caption_title_text(title);
1415 if let Some(idx) = title.find(" Content:") {
1416 let head = title[..idx].trim();
1417 let tail = title[idx + 1..].trim();
1418 if !head.is_empty() && head.split_whitespace().count() <= 3 && !tail.is_empty() {
1419 return vec![head.to_string(), tail.to_string()];
1420 }
1421 }
1422 vec![title.to_string()]
1423}
1424
1425#[cfg(not(target_arch = "wasm32"))]
1426fn normalize_layout_caption_title_text(title: &str) -> String {
1427 Regex::new(r"(\d{4})-\s+(\d{4})")
1428 .ok()
1429 .map(|re| re.replace_all(title, "$1-$2").to_string())
1430 .unwrap_or_else(|| title.to_string())
1431}
1432
1433#[cfg(not(target_arch = "wasm32"))]
1434#[allow(dead_code)]
1435fn render_layout_single_caption_chart_document(doc: &PdfDocument) -> Option<String> {
1436 let mut layout_cache = LayoutSourceCache::default();
1437 render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
1438}
1439
#[cfg(not(target_arch = "wasm32"))]
/// Renders a single-page document that contains exactly one figure caption.
///
/// Returns `None` unless the document has one page, no populated table,
/// exactly one element whose text starts with `"Figure "`, contains `:` and
/// has at least six words, and at least twelve top-level elements.
///
/// Elements are walked in order. After the caption is emitted the walker is
/// in "chart mode": it drops elements until it meets a heading or a chart
/// follow-up paragraph. Outside chart mode, headings become `#` headings and
/// paragraphs/text blocks are merged with compatible following elements.
///
/// `_layout_cache` is unused here.
fn render_layout_single_caption_chart_document_cached(
    doc: &PdfDocument,
    _layout_cache: &mut LayoutSourceCache,
) -> Option<String> {
    if doc.number_of_pages != 1 {
        return None;
    }
    if document_has_populated_table(doc) {
        return None;
    }

    // Indices of elements that look like a full figure caption.
    let caption_indices = doc
        .kids
        .iter()
        .enumerate()
        .filter_map(|(idx, element)| {
            let text = extract_element_text(element);
            let trimmed = text.trim();
            (trimmed.starts_with("Figure ")
                && trimmed.contains(':')
                && trimmed.split_whitespace().count() >= 6)
                .then_some(idx)
        })
        .collect::<Vec<_>>();
    if caption_indices.len() != 1 {
        return None;
    }
    if doc.kids.len() < 12 {
        return None;
    }

    let caption_idx = caption_indices[0];
    let mut output = String::new();
    let mut i = 0usize;
    // True from the caption until the first heading or follow-up paragraph.
    let mut chart_mode = false;
    while i < doc.kids.len() {
        let element = &doc.kids[i];
        let text = extract_element_text(element);
        let trimmed = text.trim();
        // Skip blank elements and margin page numbers.
        if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
            i += 1;
            continue;
        }

        if i == caption_idx {
            output.push_str(&escape_md_line_start(trimmed));
            output.push_str("\n\n");
            chart_mode = true;
            i += 1;
            continue;
        }

        if chart_mode {
            // Drop everything between the caption and the next heading or
            // follow-up paragraph.
            if !looks_like_chart_followup_paragraph(element, trimmed)
                && !matches!(
                    element,
                    ContentElement::Heading(_) | ContentElement::NumberHeading(_)
                )
            {
                i += 1;
                continue;
            }
            chart_mode = false;
        }

        match element {
            ContentElement::Heading(h) => {
                // A missing heading level defaults to 1; levels are clamped to 1..=6.
                let level = h.heading_level.unwrap_or(1).clamp(1, 6) as usize;
                output.push_str(&"#".repeat(level));
                output.push(' ');
                output.push_str(trimmed);
                output.push_str("\n\n");
            }
            ContentElement::NumberHeading(nh) => {
                let level = nh.base.heading_level.unwrap_or(1).clamp(1, 6) as usize;
                output.push_str(&"#".repeat(level));
                output.push(' ');
                output.push_str(trimmed);
                output.push_str("\n\n");
            }
            ContentElement::Paragraph(_) | ContentElement::TextBlock(_) => {
                let mut merged = trimmed.to_string();
                // Look ahead and absorb following elements. `i` advances past
                // every element that is either skipped or merged, so the outer
                // loop resumes after the last consumed element.
                while let Some(next_element) = doc.kids.get(i + 1) {
                    let next_text = extract_element_text(next_element);
                    let next_trimmed = next_text.trim();
                    if next_trimmed.is_empty()
                        || looks_like_margin_page_number(doc, next_element, next_trimmed)
                    {
                        i += 1;
                        continue;
                    }
                    // Never merge across the caption or into chart noise.
                    if i + 1 == caption_idx
                        || looks_like_chart_noise_element(next_element, next_trimmed)
                    {
                        break;
                    }
                    // The merge predicate is chosen by the type of the element
                    // that started the run, not by the next element.
                    let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
                        should_merge_adjacent_semantic_paragraphs(&merged, next_trimmed)
                    } else {
                        should_merge_paragraph_text(&merged, next_trimmed)
                    };
                    if !can_merge {
                        break;
                    }
                    merge_paragraph_text(&mut merged, next_trimmed);
                    i += 1;
                }

                output.push_str(&escape_md_line_start(merged.trim()));
                output.push_str("\n\n");
            }
            // Other element kinds produce no output.
            _ => {}
        }

        i += 1;
    }

    // Normalise the tail to exactly one trailing newline.
    Some(output.trim_end().to_string() + "\n")
}
1560
1561fn document_has_populated_table(doc: &PdfDocument) -> bool {
1562 doc.kids.iter().any(|element| {
1563 table_border_from_element(element).is_some_and(|table| {
1564 table.num_rows >= 2
1565 && table.num_columns >= 2
1566 && table.rows.iter().any(|row| {
1567 row.cells
1568 .iter()
1569 .filter(|cell| !cell_text_content(cell).trim().is_empty())
1570 .count()
1571 >= 2
1572 })
1573 })
1574 })
1575}
1576
1577fn looks_like_chart_noise_element(_element: &ContentElement, text: &str) -> bool {
1578 if text.is_empty() {
1579 return false;
1580 }
1581
1582 if is_standalone_page_number(text) || looks_like_numeric_axis_blob(text) {
1583 return true;
1584 }
1585
1586 let word_count = text.split_whitespace().count();
1587 let lower = text.to_ascii_lowercase();
1588
1589 if lower.starts_with("figure ") && text.contains(':') {
1590 return false;
1591 }
1592
1593 if lower.starts_with("source:") {
1594 return false;
1595 }
1596
1597 if word_count <= 3
1598 && (looks_like_yearish_label(text)
1599 || looks_like_layout_month_label(text)
1600 || text == "Lockdown Period")
1601 {
1602 return true;
1603 }
1604
1605 if text
1606 .chars()
1607 .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
1608 {
1609 return true;
1610 }
1611
1612 let short_non_sentence = !text.contains('.') && !text.contains(':') && !text.contains(';');
1613 let has_chart_keyword = lower.contains("working as usual")
1614 || lower.contains("temporarily closed")
1615 || lower.contains("business premises")
1616 || lower.contains("operations continue");
1617
1618 word_count <= 10 || (short_non_sentence && word_count <= 14) || has_chart_keyword
1619}
1620
1621fn looks_like_chart_followup_paragraph(_element: &ContentElement, text: &str) -> bool {
1622 let word_count = text.split_whitespace().count();
1623 word_count >= 18
1624 && !text.trim_start().starts_with("Figure ")
1625 && !text.trim_start().starts_with("Table ")
1626}
1627
1628#[cfg(not(target_arch = "wasm32"))]
1629#[allow(dead_code)]
1630fn render_layout_recommendation_infographic_document(doc: &PdfDocument) -> Option<String> {
1631 let mut layout_cache = LayoutSourceCache::default();
1632 render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
1633}
1634
1635#[cfg(not(target_arch = "wasm32"))]
1636fn render_layout_recommendation_infographic_document_cached(
1637 doc: &PdfDocument,
1638 layout_cache: &mut LayoutSourceCache,
1639) -> Option<String> {
1640 if doc.number_of_pages != 1 {
1641 return None;
1642 }
1643
1644 let layout = layout_cache.bbox_layout(doc)?;
1645 let infographic = detect_layout_recommendation_infographic(layout.page_width, &layout.lines)?;
1646
1647 let mut output = String::new();
1648 if let Some(eyebrow) = infographic.eyebrow.as_deref() {
1649 output.push_str("# ");
1650 output.push_str(eyebrow.trim());
1651 output.push_str("\n\n");
1652 }
1653 output.push_str(&escape_md_line_start(infographic.title.trim()));
1654 output.push_str("\n\n");
1655
1656 for panel in &infographic.panels {
1657 output.push_str("## ");
1658 output.push_str(panel.heading.trim());
1659 output.push_str("\n\n");
1660 output.push_str(&escape_md_line_start(panel.subtitle.trim()));
1661 output.push_str("\n\n");
1662
1663 let mut rows = Vec::with_capacity(panel.rows.len() + 1);
1664 rows.push(panel.header.clone());
1665 rows.extend(panel.rows.clone());
1666 output.push_str(&render_pipe_rows(&rows));
1667
1668 if !panel.notes.is_empty() {
1669 output.push_str("*Note:*\n");
1670 for note in &panel.notes {
1671 output.push_str("- ");
1672 output.push_str(note.trim());
1673 output.push('\n');
1674 }
1675 output.push('\n');
1676 }
1677 }
1678
1679 Some(output.trim_end().to_string() + "\n")
1680}
1681
1682#[cfg(not(target_arch = "wasm32"))]
1683#[allow(dead_code)]
1684fn render_layout_stacked_bar_report_document(doc: &PdfDocument) -> Option<String> {
1685 let mut layout_cache = LayoutSourceCache::default();
1686 render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
1687}
1688
1689#[cfg(not(target_arch = "wasm32"))]
1690fn render_layout_stacked_bar_report_document_cached(
1691 doc: &PdfDocument,
1692 layout_cache: &mut LayoutSourceCache,
1693) -> Option<String> {
1694 if doc.number_of_pages != 1 {
1695 return None;
1696 }
1697
1698 let layout = layout_cache.bbox_layout(doc)?;
1699 let figure_captions = collect_layout_figure_captions(&layout.blocks);
1700 if figure_captions.len() != 2 {
1701 return None;
1702 }
1703 let narrative = detect_layout_stacked_bar_narrative(&layout.blocks)?;
1704 let figure_one = detect_layout_three_month_stacked_figure(
1705 &layout.blocks,
1706 &layout.lines,
1707 layout.page_width,
1708 figure_captions[0].clone(),
1709 figure_captions[1].bbox.top_y,
1710 )?;
1711 let figure_two = detect_layout_sector_bar_figure(
1712 &layout.blocks,
1713 &layout.lines,
1714 layout.page_width,
1715 figure_captions[1].clone(),
1716 narrative.top_y,
1717 )?;
1718
1719 let mut output = String::new();
1720 output.push_str("# ");
1721 output.push_str(figure_one.caption.trim());
1722 output.push_str("\n\n");
1723 let mut first_table = vec![{
1724 let mut row = vec![String::new()];
1725 row.extend(figure_one.months.clone());
1726 row
1727 }];
1728 first_table.extend(figure_one.rows.clone());
1729 output.push_str(&render_pipe_rows(&first_table));
1730
1731 output.push_str("# ");
1732 output.push_str(figure_two.caption.trim());
1733 output.push_str("\n\n");
1734 let mut second_table = vec![{
1735 let mut row = vec!["Sector".to_string()];
1736 row.extend(figure_two.months.clone());
1737 row
1738 }];
1739 second_table.extend(figure_two.rows.clone());
1740 output.push_str(&render_pipe_rows(&second_table));
1741
1742 output.push_str("# ");
1743 output.push_str(narrative.heading.trim());
1744 output.push_str("\n\n");
1745 for paragraph in &narrative.paragraphs {
1746 output.push_str(&escape_md_line_start(paragraph.trim()));
1747 output.push_str("\n\n");
1748 }
1749 if let Some(footnote) = narrative.footnote.as_deref() {
1750 output.push('*');
1751 output.push_str(footnote.trim());
1752 output.push_str("*\n");
1753 }
1754
1755 Some(output)
1756}
1757
1758#[cfg(not(target_arch = "wasm32"))]
1759#[allow(dead_code)]
1760fn render_layout_multi_figure_chart_document(doc: &PdfDocument) -> Option<String> {
1761 let mut layout_cache = LayoutSourceCache::default();
1762 render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
1763}
1764
1765#[cfg(not(target_arch = "wasm32"))]
1766fn render_layout_multi_figure_chart_document_cached(
1767 doc: &PdfDocument,
1768 layout_cache: &mut LayoutSourceCache,
1769) -> Option<String> {
1770 if doc.number_of_pages != 1 {
1771 return None;
1772 }
1773
1774 let layout = layout_cache.bbox_layout(doc)?;
1775 let figures = detect_layout_multi_figure_chart_sections(&layout.lines)?;
1776 let rendered_table_count = figures
1777 .iter()
1778 .filter(|figure| figure.labels.len() >= 4 && figure.labels.len() == figure.values.len())
1779 .count();
1780 if figures.len() < 2 || rendered_table_count == 0 {
1781 return None;
1782 }
1783
1784 let mut output = String::from("# Figures from the Document\n\n");
1785 for figure in figures {
1786 output.push_str("## ");
1787 output.push_str(figure.caption.trim());
1788 output.push_str("\n\n");
1789
1790 if figure.labels.len() >= 4 && figure.labels.len() == figure.values.len() {
1791 let label_header = if figure
1792 .labels
1793 .iter()
1794 .all(|label| looks_like_yearish_label(label))
1795 {
1796 "Year"
1797 } else {
1798 "Label"
1799 };
1800 let value_header = chart_value_header(&figure.caption);
1801 output.push_str(&format!("| {} | {} |\n", label_header, value_header));
1802 output.push_str("| --- | --- |\n");
1803 for (label, value) in figure.labels.iter().zip(figure.values.iter()) {
1804 output.push_str(&format!("| {} | {} |\n", label, value));
1805 }
1806 output.push('\n');
1807 }
1808
1809 if let Some(source) = figure.source.as_deref() {
1810 output.push('*');
1811 output.push_str(&escape_md_line_start(source.trim()));
1812 output.push_str("*\n\n");
1813 }
1814 }
1815
1816 Some(output.trim_end().to_string() + "\n")
1817}
1818
1819#[cfg(not(target_arch = "wasm32"))]
1820fn detect_layout_multi_figure_chart_sections(
1821 lines: &[BBoxLayoutLine],
1822) -> Option<Vec<LayoutSeriesFigure>> {
1823 let caption_indices = lines
1824 .iter()
1825 .enumerate()
1826 .filter_map(|(idx, line)| {
1827 let text = bbox_layout_line_text(line);
1828 (text.starts_with("Figure ") && text.split_whitespace().count() >= 4).then_some(idx)
1829 })
1830 .collect::<Vec<_>>();
1831 if caption_indices.len() < 2 {
1832 return None;
1833 }
1834
1835 let mut figures = Vec::new();
1836 for (pos, caption_idx) in caption_indices.iter().enumerate() {
1837 let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());
1838 let caption = bbox_layout_line_text(&lines[*caption_idx]);
1839
1840 let source_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
1841 bbox_layout_line_text(&lines[*idx])
1842 .to_ascii_lowercase()
1843 .starts_with("source:")
1844 });
1845
1846 let source = source_idx.map(|idx| {
1847 let mut source_lines = vec![&lines[idx]];
1848 let mut cursor = idx + 1;
1849 while cursor < next_caption_idx {
1850 let text = bbox_layout_line_text(&lines[cursor]);
1851 if text.starts_with("Figure ") || looks_like_footer_banner(&text) || text.is_empty()
1852 {
1853 break;
1854 }
1855 source_lines.push(&lines[cursor]);
1856 if text.ends_with('.') {
1857 break;
1858 }
1859 cursor += 1;
1860 }
1861 join_layout_lines_as_paragraph(&source_lines)
1862 });
1863
1864 let series_region = &lines[*caption_idx + 1..source_idx.unwrap_or(next_caption_idx)];
1865 let anchors = extract_year_label_anchors_from_section(series_region);
1866 let (labels, values) = if anchors.len() >= 4 {
1867 let values = map_series_values_to_label_anchors(&anchors, series_region);
1868 (
1869 anchors
1870 .into_iter()
1871 .map(|anchor| anchor.text)
1872 .collect::<Vec<_>>(),
1873 values,
1874 )
1875 } else {
1876 (Vec::new(), Vec::new())
1877 };
1878
1879 if source.is_some() || !values.is_empty() {
1880 figures.push(LayoutSeriesFigure {
1881 caption: normalize_layout_dashboard_text(&caption),
1882 labels,
1883 values,
1884 source,
1885 });
1886 }
1887 }
1888
1889 (!figures.is_empty()).then_some(figures)
1890}
1891
1892#[cfg(not(target_arch = "wasm32"))]
1893fn extract_year_label_anchors_from_section(lines: &[BBoxLayoutLine]) -> Vec<LayoutTextFragment> {
1894 let mut year_words = lines
1895 .iter()
1896 .flat_map(|line| line.words.iter())
1897 .filter_map(|word| {
1898 let token = word
1899 .text
1900 .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1901 looks_like_year_token(token).then_some((word.bbox.center_y(), word.clone()))
1902 })
1903 .collect::<Vec<_>>();
1904 if year_words.len() < 4 {
1905 return Vec::new();
1906 }
1907
1908 year_words.sort_by(|left, right| {
1909 right
1910 .0
1911 .partial_cmp(&left.0)
1912 .unwrap_or(std::cmp::Ordering::Equal)
1913 });
1914
1915 let mut best_band = Vec::<BBoxLayoutWord>::new();
1916 for (center_y, _) in &year_words {
1917 let band = year_words
1918 .iter()
1919 .filter(|(candidate_y, _)| (*candidate_y - *center_y).abs() <= 12.0)
1920 .map(|(_, word)| word.clone())
1921 .collect::<Vec<_>>();
1922 if band.len() > best_band.len() {
1923 best_band = band;
1924 }
1925 }
1926 if best_band.len() < 4 {
1927 return Vec::new();
1928 }
1929
1930 let band_center = best_band
1931 .iter()
1932 .map(|word| word.bbox.center_y())
1933 .sum::<f64>()
1934 / best_band.len() as f64;
1935 let mut band_words = lines
1936 .iter()
1937 .flat_map(|line| line.words.iter())
1938 .filter(|word| (word.bbox.center_y() - band_center).abs() <= 12.0)
1939 .cloned()
1940 .collect::<Vec<_>>();
1941 band_words.sort_by(|left, right| {
1942 left.bbox
1943 .left_x
1944 .partial_cmp(&right.bbox.left_x)
1945 .unwrap_or(std::cmp::Ordering::Equal)
1946 });
1947
1948 let mut anchors = Vec::new();
1949 let mut idx = 0usize;
1950 while idx < band_words.len() {
1951 let token = band_words[idx]
1952 .text
1953 .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1954 if !looks_like_year_token(token) {
1955 idx += 1;
1956 continue;
1957 }
1958
1959 let mut bbox = band_words[idx].bbox.clone();
1960 let mut label = token.to_string();
1961 if let Some(next) = band_words.get(idx + 1) {
1962 let suffix = next
1963 .text
1964 .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1965 let gap = next.bbox.left_x - band_words[idx].bbox.right_x;
1966 if suffix.starts_with('(') && suffix.ends_with(')') && gap <= 18.0 {
1967 label.push(' ');
1968 label.push_str(suffix);
1969 bbox = bbox.union(&next.bbox);
1970 idx += 1;
1971 }
1972 }
1973
1974 anchors.push(LayoutTextFragment { bbox, text: label });
1975 idx += 1;
1976 }
1977
1978 anchors
1979}
1980
1981#[cfg(not(target_arch = "wasm32"))]
1982fn map_series_values_to_label_anchors(
1983 anchors: &[LayoutTextFragment],
1984 lines: &[BBoxLayoutLine],
1985) -> Vec<String> {
1986 if anchors.len() < 2 {
1987 return Vec::new();
1988 }
1989
1990 let mut spacing = anchors
1991 .windows(2)
1992 .map(|pair| pair[1].bbox.center_x() - pair[0].bbox.center_x())
1993 .filter(|gap| *gap > 0.0)
1994 .collect::<Vec<_>>();
1995 spacing.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
1996 let median_spacing = spacing
1997 .get(spacing.len().saturating_sub(1) / 2)
1998 .copied()
1999 .unwrap_or(48.0);
2000 let max_dx = (median_spacing * 0.42).clamp(18.0, 32.0);
2001
2002 let mut tokens = Vec::<LayoutBarToken>::new();
2003 for line in lines {
2004 for word in &line.words {
2005 let raw = word.text.trim();
2006 if raw.contains('/')
2007 || looks_like_year_token(raw.trim_matches(|ch: char| matches!(ch, ',' | ';' | '.')))
2008 {
2009 continue;
2010 }
2011 let Some(value) = parse_integer_token(raw) else {
2012 continue;
2013 };
2014 tokens.push(LayoutBarToken {
2015 bbox: word.bbox.clone(),
2016 value,
2017 text: sanitize_numberish_token(raw).unwrap_or_else(|| value.to_string()),
2018 });
2019 }
2020 }
2021
2022 let mut used = vec![false; tokens.len()];
2023 let mut values = Vec::with_capacity(anchors.len());
2024 for anchor in anchors {
2025 let anchor_center_x = anchor.bbox.center_x();
2026 let anchor_center_y = anchor.bbox.center_y();
2027 let best = tokens
2028 .iter()
2029 .enumerate()
2030 .filter(|(idx, token)| {
2031 !used[*idx]
2032 && token.bbox.center_y() > anchor_center_y + 8.0
2033 && (token.bbox.center_x() - anchor_center_x).abs() <= max_dx
2034 })
2035 .min_by(|left, right| {
2036 let left_score = (left.1.bbox.center_x() - anchor_center_x).abs()
2037 + (left.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
2038 let right_score = (right.1.bbox.center_x() - anchor_center_x).abs()
2039 + (right.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
2040 left_score
2041 .partial_cmp(&right_score)
2042 .unwrap_or(std::cmp::Ordering::Equal)
2043 });
2044 let Some((best_idx, token)) = best else {
2045 return Vec::new();
2046 };
2047 used[best_idx] = true;
2048 values.push(token.text.clone());
2049 }
2050
2051 values
2052}
2053
2054#[cfg(not(target_arch = "wasm32"))]
2055fn detect_layout_recommendation_infographic(
2056 page_width: f64,
2057 lines: &[BBoxLayoutLine],
2058) -> Option<LayoutRecommendationInfographic> {
2059 if page_width < 900.0 {
2060 return None;
2061 }
2062
2063 let blocks = collect_bbox_layout_blocks(lines);
2064 let page_top = lines
2065 .iter()
2066 .map(|line| line.bbox.top_y)
2067 .fold(0.0_f64, f64::max);
2068
2069 let title_block = blocks
2070 .iter()
2071 .filter(|block| {
2072 block.bbox.width() >= page_width * 0.55
2073 && block.bbox.top_y >= page_top - 105.0
2074 && bbox_layout_block_text(block).split_whitespace().count() >= 8
2075 })
2076 .max_by(|left, right| {
2077 left.bbox
2078 .width()
2079 .partial_cmp(&right.bbox.width())
2080 .unwrap_or(std::cmp::Ordering::Equal)
2081 })?;
2082 let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
2083 if title.split_whitespace().count() < 8 {
2084 return None;
2085 }
2086
2087 let eyebrow = blocks
2088 .iter()
2089 .filter(|block| {
2090 block.block_id != title_block.block_id
2091 && block.bbox.top_y > title_block.bbox.top_y
2092 && block.bbox.width() >= page_width * 0.1
2093 })
2094 .max_by(|left, right| {
2095 left.bbox
2096 .top_y
2097 .partial_cmp(&right.bbox.top_y)
2098 .unwrap_or(std::cmp::Ordering::Equal)
2099 })
2100 .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));
2101
2102 let title_bottom = title_block.bbox.bottom_y;
2103 let region_width = page_width / 3.0;
2104 let left_panel = detect_layout_recommendation_hit_ratio_panel(
2105 &blocks,
2106 lines,
2107 0.0,
2108 region_width,
2109 title_bottom,
2110 )?;
2111 let middle_panel = detect_layout_recommendation_ranking_panel(
2112 &blocks,
2113 lines,
2114 region_width,
2115 region_width * 2.0,
2116 title_bottom,
2117 )?;
2118 let right_panel = detect_layout_recommendation_accuracy_panel(
2119 &blocks,
2120 lines,
2121 region_width * 2.0,
2122 page_width,
2123 title_bottom,
2124 )?;
2125
2126 Some(LayoutRecommendationInfographic {
2127 eyebrow,
2128 title,
2129 panels: vec![left_panel, middle_panel, right_panel],
2130 })
2131}
2132
2133#[cfg(not(target_arch = "wasm32"))]
2134#[allow(dead_code)]
2135fn render_layout_ocr_benchmark_dashboard_document(doc: &PdfDocument) -> Option<String> {
2136 let mut layout_cache = LayoutSourceCache::default();
2137 render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
2138}
2139
2140#[cfg(not(target_arch = "wasm32"))]
2141fn render_layout_ocr_benchmark_dashboard_document_cached(
2142 doc: &PdfDocument,
2143 layout_cache: &mut LayoutSourceCache,
2144) -> Option<String> {
2145 if doc.number_of_pages != 1 {
2146 return None;
2147 }
2148
2149 let layout = layout_cache.bbox_layout(doc)?;
2150 let dashboard = detect_layout_ocr_benchmark_dashboard(layout.page_width, &layout.lines)?;
2151
2152 let mut output = String::new();
2153 if let Some(eyebrow) = dashboard.eyebrow.as_deref() {
2154 output.push_str("## ");
2155 output.push_str(eyebrow.trim());
2156 output.push_str("\n\n");
2157 }
2158 output.push_str("# ");
2159 output.push_str(dashboard.title.trim());
2160 output.push_str("\n\n");
2161
2162 output.push_str("## ");
2163 output.push_str(dashboard.left_heading.trim());
2164 output.push_str("\n\n");
2165 let mut left_table = Vec::with_capacity(dashboard.left_rows.len() + 1);
2166 left_table.push({
2167 let mut row = vec!["Company".to_string()];
2168 row.extend(dashboard.left_columns.clone());
2169 row
2170 });
2171 left_table.extend(dashboard.left_rows.clone());
2172 output.push_str(&render_pipe_rows(&left_table));
2173
2174 output.push_str("## ");
2175 output.push_str(dashboard.right_heading.trim());
2176 output.push_str("\n\n");
2177 let mut right_table = Vec::with_capacity(dashboard.right_rows.len() + 1);
2178 right_table.push(vec![
2179 "Metric".to_string(),
2180 "Company A".to_string(),
2181 "Company B".to_string(),
2182 "upstage".to_string(),
2183 ]);
2184 right_table.extend(dashboard.right_rows.clone());
2185 output.push_str(&render_pipe_rows(&right_table));
2186
2187 if !dashboard.definition_notes.is_empty() {
2188 output.push_str("---\n\n");
2189 for note in &dashboard.definition_notes {
2190 output.push_str(note.trim());
2191 output.push_str("\n\n");
2192 }
2193 }
2194 if !dashboard.source_notes.is_empty() {
2195 output.push_str("---\n\n");
2196 for note in &dashboard.source_notes {
2197 output.push_str(note.trim());
2198 output.push_str("\n\n");
2199 }
2200 }
2201
2202 Some(output.trim_end().to_string() + "\n")
2203}
2204
/// Tries to interpret a page's bbox layout lines as a two-panel OCR benchmark dashboard.
///
/// The page is split at its horizontal midpoint. The left half is expected to hold an
/// "OCR" panel with two value groups, the right half a "document" panel with at least four
/// metric rows. Layout blocks are rebuilt from `lines` via `collect_bbox_layout_blocks`.
///
/// # Returns
///
/// `Some(LayoutOcrDashboard)` when every expected piece is found. `None` when:
/// - `page_width` is below 680,
/// - no sufficiently wide block sits near the top of the page, or its text has fewer than
///   five words,
/// - either panel heading is missing, the left heading lacks "ocr" or the right heading lacks
///   "document" (ASCII case-insensitive),
/// - the left half does not contain exactly two group-header blocks containing `(`,
/// - fewer than six decimal values are found on the left, or either group ends up with fewer
///   than three,
/// - fewer than two company labels are found,
/// - fewer than four metric label blocks or fewer than ten decimal values are found on the
///   right,
/// - any metric row has fewer than two distinct adjacent values.
///
/// NOTE(review): the comparisons below treat a larger `top_y` as higher on the page, and the
/// absolute y thresholds (90, 95, 110, 240, 250) presumably match one specific page geometry —
/// confirm against the document this detector was tuned for.
#[cfg(not(target_arch = "wasm32"))]
fn detect_layout_ocr_benchmark_dashboard(
    page_width: f64,
    lines: &[BBoxLayoutLine],
) -> Option<LayoutOcrDashboard> {
    // Narrow pages are never considered dashboards.
    if page_width < 680.0 {
        return None;
    }

    let page_mid = page_width / 2.0;
    let blocks = collect_bbox_layout_blocks(lines);
    // Largest `top_y` over all lines (0.0 when there are no lines).
    let page_top = lines
        .iter()
        .map(|line| line.bbox.top_y)
        .fold(0.0_f64, f64::max);

    // Title: the widest block spanning at least 45% of the page within 40 units of `page_top`.
    let title_block = blocks
        .iter()
        .filter(|block| {
            block.bbox.width() >= page_width * 0.45 && block.bbox.top_y >= page_top - 40.0
        })
        .max_by(|left, right| {
            left.bbox
                .width()
                .partial_cmp(&right.bbox.width())
                .unwrap_or(std::cmp::Ordering::Equal)
        })?;
    let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
    if title.split_whitespace().count() < 5 {
        return None;
    }

    // Optional eyebrow: among blocks with a larger `top_y` than the title and at least 12% of
    // the page width, the one with the largest `top_y`.
    let eyebrow = blocks
        .iter()
        .filter(|block| {
            block.block_id != title_block.block_id
                && block.bbox.top_y > title_block.bbox.top_y
                && block.bbox.width() >= page_width * 0.12
        })
        .max_by(|left, right| {
            left.bbox
                .top_y
                .partial_cmp(&right.bbox.top_y)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
        .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));

    // Panel headings: blocks whose `top_y` lies between 25 and 95 units under the title's
    // `bottom_y`, entirely left (or right) of the midpoint, ignoring digit-only blocks.
    let left_title_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.right_x <= page_mid
                && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
                && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
                && !bbox_layout_block_text(block)
                    .chars()
                    .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
        })
        .cloned()
        .collect::<Vec<_>>();
    let right_title_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.left_x >= page_mid
                && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
                && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
                && !bbox_layout_block_text(block)
                    .chars()
                    .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
        })
        .cloned()
        .collect::<Vec<_>>();

    let left_heading = join_dashboard_title_blocks(&left_title_blocks)?;
    let right_heading = join_dashboard_title_blocks(&right_title_blocks)?;
    // The headings double as the signature of this dashboard type.
    if !left_heading.to_ascii_lowercase().contains("ocr")
        || !right_heading.to_ascii_lowercase().contains("document")
    {
        return None;
    }

    // Left panel group headers: exactly two blocks containing '(' with `top_y` below 90.
    let left_group_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.center_x() < page_mid
                && block.bbox.top_y < 90.0
                && bbox_layout_block_text(block).contains('(')
        })
        .cloned()
        .collect::<Vec<_>>();
    if left_group_blocks.len() != 2 {
        return None;
    }
    // (center_x, label) pairs ordered left to right.
    let mut left_groups = left_group_blocks
        .iter()
        .map(|block| {
            (
                block.bbox.center_x(),
                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
            )
        })
        .collect::<Vec<_>>();
    left_groups.sort_by(|left, right| {
        left.0
            .partial_cmp(&right.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    let left_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
        bbox.center_x() < page_mid - 20.0 && bbox.top_y > 110.0 && bbox.top_y < 250.0
    });
    if left_value_tokens.len() < 6 {
        return None;
    }

    // Assign every left-hand value to the horizontally nearest group header (ties go to the
    // first group).
    let mut left_group_values = vec![Vec::<(f64, String)>::new(), Vec::new()];
    for (bbox, value) in left_value_tokens {
        let group_idx = if (bbox.center_x() - left_groups[0].0).abs()
            <= (bbox.center_x() - left_groups[1].0).abs()
        {
            0
        } else {
            1
        };
        left_group_values[group_idx].push((bbox.center_x(), value));
    }
    if left_group_values.iter().any(|values| values.len() < 3) {
        return None;
    }
    // Keep the three leftmost values per group.
    for values in &mut left_group_values {
        values.sort_by(|left, right| {
            left.0
                .partial_cmp(&right.0)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        values.truncate(3);
    }

    // Row labels: the first two detected company labels plus a name derived from the first
    // word of the left heading.
    let mut company_labels = extract_dashboard_company_labels(&blocks, page_mid);
    if company_labels.len() < 2 {
        return None;
    }
    company_labels.truncate(2);
    company_labels.push(infer_dashboard_brand_name(&left_heading));

    // Indexing is safe here: three labels and at least three values per group were ensured.
    let mut left_rows = Vec::new();
    for row_idx in 0..3 {
        left_rows.push(vec![
            company_labels[row_idx].clone(),
            left_group_values[0][row_idx].1.clone(),
            left_group_values[1][row_idx].1.clone(),
        ]);
    }

    // Right panel metric labels: blocks whose normalized heading text starts with "ocr" or
    // "parsingf1".
    let metric_blocks = blocks
        .iter()
        .filter(|block| {
            block.bbox.center_x() > page_mid
                && block.bbox.top_y > 95.0
                && block.bbox.top_y < 240.0
                && matches!(
                    normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
                    text if text.starts_with("ocr") || text.starts_with("parsingf1")
                )
        })
        .cloned()
        .collect::<Vec<_>>();
    if metric_blocks.len() < 4 {
        return None;
    }

    // (center_y, label) pairs sorted by descending center_y; only the first four are kept.
    let mut metrics = metric_blocks
        .iter()
        .map(|block| {
            (
                block.bbox.center_y(),
                normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
            )
        })
        .collect::<Vec<_>>();
    metrics.sort_by(|left, right| {
        right
            .0
            .partial_cmp(&left.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    metrics.truncate(4);

    let right_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
        bbox.center_x() > page_mid + 20.0 && bbox.top_y > 90.0 && bbox.top_y < 250.0
    });
    if right_value_tokens.len() < 10 {
        return None;
    }

    // Assign every right-hand value to the vertically nearest metric label.
    let mut metric_values = vec![Vec::<(f64, String)>::new(); metrics.len()];
    for (bbox, value) in right_value_tokens {
        let Some((metric_idx, _)) = metrics
            .iter()
            .enumerate()
            .map(|(idx, (center_y, _))| (idx, (bbox.center_y() - *center_y).abs()))
            .min_by(|left, right| {
                left.1
                    .partial_cmp(&right.1)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
        else {
            continue;
        };
        metric_values[metric_idx].push((bbox.center_x(), value));
    }

    let mut right_rows = Vec::new();
    for (idx, (_, metric_name)) in metrics.iter().enumerate() {
        let mut values = metric_values[idx].clone();
        values.sort_by(|left, right| {
            left.0
                .partial_cmp(&right.0)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        // Drop horizontally adjacent tokens that carry the same text.
        values.dedup_by(|left, right| left.1 == right.1);
        if values.len() < 2 {
            return None;
        }
        // With only two values left, the second one is repeated for the third column.
        // NOTE(review): presumably compensates for a value removed by the dedup above — confirm.
        if values.len() == 2 {
            values.push(values[1].clone());
        }
        values.truncate(3);
        right_rows.push(vec![
            metric_name.clone(),
            normalize_layout_decimal_value(&values[0].1),
            normalize_layout_decimal_value(&values[1].1),
            normalize_layout_decimal_value(&values[2].1),
        ]);
    }

    // `false` selects blocks right of the midpoint, `true` blocks left of it.
    let definition_notes = collect_dashboard_notes(&blocks, page_mid, false);
    let source_notes = collect_dashboard_notes(&blocks, page_mid, true);

    Some(LayoutOcrDashboard {
        eyebrow,
        title,
        left_heading,
        left_columns: left_groups.into_iter().map(|(_, text)| text).collect(),
        left_rows,
        right_heading,
        right_rows,
        definition_notes,
        source_notes,
    })
}
2455
2456#[cfg(not(target_arch = "wasm32"))]
2457fn detect_layout_recommendation_hit_ratio_panel(
2458 blocks: &[BBoxLayoutBlock],
2459 lines: &[BBoxLayoutLine],
2460 left_x: f64,
2461 right_x: f64,
2462 title_bottom: f64,
2463) -> Option<LayoutRecommendationPanel> {
2464 let (heading_block, subtitle_block) =
2465 extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2466 let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2467 let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2468 let width = right_x - left_x;
2469 let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2470
2471 let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2472 bbox.center_x() > left_x + width * 0.52
2473 && bbox.center_x() < right_x - 8.0
2474 && bbox.top_y < chart_cutoff
2475 });
2476 values.sort_by(|left, right| {
2477 right
2478 .0
2479 .center_y()
2480 .partial_cmp(&left.0.center_y())
2481 .unwrap_or(std::cmp::Ordering::Equal)
2482 });
2483 values.dedup_by(|left, right| {
2484 (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
2485 });
2486 if values.len() < 4 {
2487 return None;
2488 }
2489
2490 let labels = collect_layout_panel_alpha_blocks(
2491 blocks,
2492 left_x,
2493 right_x,
2494 title_bottom,
2495 chart_cutoff,
2496 Some(left_x + width * 0.55),
2497 );
2498 let rows = pair_layout_decimal_rows(&labels, &values, 4)?;
2499 let notes = pair_layout_emphasis_notes(
2500 &rows,
2501 &collect_layout_emphasis_tokens(lines, |bbox| {
2502 bbox.center_x() > left_x + width * 0.48
2503 && bbox.center_x() < right_x
2504 && bbox.top_y < chart_cutoff
2505 }),
2506 "increase",
2507 );
2508 let metric_label =
2509 extract_layout_comparison_metric(&subtitle).unwrap_or_else(|| "Value".to_string());
2510
2511 Some(LayoutRecommendationPanel {
2512 heading,
2513 subtitle,
2514 header: vec!["Model".to_string(), metric_label],
2515 rows,
2516 notes,
2517 })
2518}
2519
2520#[cfg(not(target_arch = "wasm32"))]
2521fn detect_layout_recommendation_ranking_panel(
2522 blocks: &[BBoxLayoutBlock],
2523 lines: &[BBoxLayoutLine],
2524 left_x: f64,
2525 right_x: f64,
2526 title_bottom: f64,
2527) -> Option<LayoutRecommendationPanel> {
2528 let (heading_block, subtitle_block) =
2529 extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2530 let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2531 let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2532 let width = right_x - left_x;
2533 let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2534
2535 let row_labels = collect_layout_panel_alpha_blocks(
2536 blocks,
2537 left_x,
2538 right_x,
2539 title_bottom,
2540 chart_cutoff,
2541 Some(left_x + width * 0.48),
2542 )
2543 .into_iter()
2544 .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(&block)))
2545 .collect::<Vec<_>>();
2546 if row_labels.len() < 8 {
2547 return None;
2548 }
2549
2550 let headers = extract_layout_ranking_headers(blocks, left_x, right_x, chart_cutoff)
2551 .unwrap_or_else(|| vec!["Recall@10".to_string(), "Accuracy".to_string()]);
2552 let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2553 bbox.center_x() > left_x + width * 0.42
2554 && bbox.center_x() < right_x - 10.0
2555 && bbox.top_y < chart_cutoff
2556 });
2557 values.sort_by(|left, right| {
2558 left.0
2559 .left_x
2560 .partial_cmp(&right.0.left_x)
2561 .unwrap_or(std::cmp::Ordering::Equal)
2562 });
2563
2564 let mut rows = row_labels
2565 .into_iter()
2566 .map(|label| vec![label, String::new(), String::new()])
2567 .collect::<Vec<_>>();
2568 if let Some(first) = rows.first_mut() {
2569 if let Some((_, value)) = values.first() {
2570 first[1] = normalize_layout_decimal_value(value);
2571 }
2572 if let Some((_, value)) = values.get(1) {
2573 first[2] = normalize_layout_decimal_value(value);
2574 }
2575 }
2576
2577 let mut notes = collect_layout_ranking_notes(blocks, left_x, right_x, chart_cutoff);
2578 notes.extend(
2579 collect_layout_emphasis_tokens(lines, |bbox| {
2580 bbox.center_x() > left_x + width * 0.55
2581 && bbox.center_x() < right_x
2582 && bbox.top_y < chart_cutoff
2583 })
2584 .into_iter()
2585 .map(|(_, token)| format!("{} increase", token.trim_end_matches('↑'))),
2586 );
2587
2588 Some(LayoutRecommendationPanel {
2589 heading,
2590 subtitle,
2591 header: vec!["Method".to_string(), headers[0].clone(), headers[1].clone()],
2592 rows,
2593 notes,
2594 })
2595}
2596
2597#[cfg(not(target_arch = "wasm32"))]
2598fn detect_layout_recommendation_accuracy_panel(
2599 blocks: &[BBoxLayoutBlock],
2600 lines: &[BBoxLayoutLine],
2601 left_x: f64,
2602 right_x: f64,
2603 title_bottom: f64,
2604) -> Option<LayoutRecommendationPanel> {
2605 let (heading_block, subtitle_block) =
2606 extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2607 let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2608 let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2609 let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2610
2611 let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2612 bbox.center_x() > left_x + 20.0 && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
2613 });
2614 values.sort_by(|left, right| {
2615 right
2616 .0
2617 .center_y()
2618 .partial_cmp(&left.0.center_y())
2619 .unwrap_or(std::cmp::Ordering::Equal)
2620 });
2621 values.dedup_by(|left, right| {
2622 (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
2623 });
2624 if values.len() < 2 {
2625 return None;
2626 }
2627 let min_value_top_y = values
2628 .iter()
2629 .map(|(bbox, _)| bbox.top_y)
2630 .fold(f64::INFINITY, f64::min);
2631
2632 let labels = collect_layout_panel_alpha_blocks(
2633 blocks,
2634 left_x,
2635 right_x,
2636 title_bottom,
2637 chart_cutoff,
2638 None,
2639 )
2640 .into_iter()
2641 .filter(|block| block.bbox.top_y < min_value_top_y - 70.0)
2642 .collect::<Vec<_>>();
2643 let rows = pair_layout_decimal_rows(&labels, &values, 2)?;
2644
2645 let mut notes = Vec::new();
2646 if let Some(description) = collect_layout_note_phrase(blocks, left_x, right_x, chart_cutoff) {
2647 if let Some((_, emphasis)) = collect_layout_emphasis_tokens(lines, |bbox| {
2648 bbox.center_x() > left_x && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
2649 })
2650 .into_iter()
2651 .next()
2652 {
2653 notes.push(format!(
2654 "{}, {} increase",
2655 description,
2656 emphasis.trim_end_matches('↑')
2657 ));
2658 }
2659 }
2660
2661 Some(LayoutRecommendationPanel {
2662 heading,
2663 subtitle,
2664 header: vec!["Model".to_string(), "Accuracy".to_string()],
2665 rows,
2666 notes,
2667 })
2668}
2669
2670#[cfg(not(target_arch = "wasm32"))]
2671fn extract_layout_panel_heading_and_subtitle(
2672 blocks: &[BBoxLayoutBlock],
2673 left_x: f64,
2674 right_x: f64,
2675 title_bottom: f64,
2676) -> Option<(BBoxLayoutBlock, BBoxLayoutBlock)> {
2677 let mut band_blocks = blocks
2678 .iter()
2679 .filter(|block| {
2680 block.bbox.center_x() >= left_x
2681 && block.bbox.center_x() <= right_x
2682 && block.bbox.top_y < title_bottom - 8.0
2683 && block.bbox.top_y > title_bottom - 90.0
2684 && bbox_layout_block_text(block)
2685 .chars()
2686 .any(char::is_alphabetic)
2687 })
2688 .cloned()
2689 .collect::<Vec<_>>();
2690 band_blocks.sort_by(|left, right| {
2691 right
2692 .bbox
2693 .top_y
2694 .partial_cmp(&left.bbox.top_y)
2695 .unwrap_or(std::cmp::Ordering::Equal)
2696 });
2697
2698 let heading = band_blocks.first()?.clone();
2699 let subtitle = band_blocks
2700 .iter()
2701 .find(|block| {
2702 block.block_id != heading.block_id
2703 && block.bbox.top_y < heading.bbox.bottom_y + 8.0
2704 && block.bbox.top_y > heading.bbox.bottom_y - 40.0
2705 })?
2706 .clone();
2707 Some((heading, subtitle))
2708}
2709
2710#[cfg(not(target_arch = "wasm32"))]
2711fn collect_layout_panel_alpha_blocks(
2712 blocks: &[BBoxLayoutBlock],
2713 left_x: f64,
2714 right_x: f64,
2715 title_bottom: f64,
2716 chart_cutoff: f64,
2717 max_left_x: Option<f64>,
2718) -> Vec<BBoxLayoutBlock> {
2719 let mut alpha_blocks = blocks
2720 .iter()
2721 .filter(|block| {
2722 block.bbox.center_x() >= left_x
2723 && block.bbox.center_x() <= right_x
2724 && block.bbox.top_y < chart_cutoff
2725 && block.bbox.top_y > title_bottom - 390.0
2726 && max_left_x.is_none_or(|limit| block.bbox.left_x <= limit)
2727 })
2728 .filter_map(|block| {
2729 let text = normalize_layout_panel_text(&bbox_layout_block_text(block));
2730 let token_count = text.split_whitespace().count();
2731 let has_alpha = text.chars().any(char::is_alphabetic);
2732 let has_numeric_marker = text
2733 .chars()
2734 .any(|ch| ch.is_ascii_digit() || ch == '%' || ch == ':');
2735 (has_alpha
2736 && token_count >= 1
2737 && !has_numeric_marker
2738 && !text.starts_with(':')
2739 && !text.eq_ignore_ascii_case("comparison"))
2740 .then_some(block.clone())
2741 })
2742 .collect::<Vec<_>>();
2743 alpha_blocks.sort_by(|left, right| {
2744 right
2745 .bbox
2746 .center_y()
2747 .partial_cmp(&left.bbox.center_y())
2748 .unwrap_or(std::cmp::Ordering::Equal)
2749 });
2750 alpha_blocks
2751}
2752
2753#[cfg(not(target_arch = "wasm32"))]
2754fn pair_layout_decimal_rows(
2755 label_blocks: &[BBoxLayoutBlock],
2756 value_tokens: &[(BoundingBox, String)],
2757 expected_len: usize,
2758) -> Option<Vec<Vec<String>>> {
2759 let mut used = HashSet::new();
2760 let mut rows = Vec::new();
2761
2762 for (bbox, value) in value_tokens.iter().take(expected_len) {
2763 let Some((label_idx, _)) = label_blocks
2764 .iter()
2765 .enumerate()
2766 .filter(|(idx, block)| {
2767 !used.contains(idx) && block.bbox.center_x() <= bbox.center_x() + 24.0
2768 })
2769 .map(|(idx, block)| (idx, (block.bbox.center_y() - bbox.center_y()).abs()))
2770 .min_by(|left, right| {
2771 left.1
2772 .partial_cmp(&right.1)
2773 .unwrap_or(std::cmp::Ordering::Equal)
2774 })
2775 else {
2776 continue;
2777 };
2778 if label_blocks[label_idx].bbox.center_y() - bbox.center_y() > 30.0 {
2779 continue;
2780 }
2781
2782 used.insert(label_idx);
2783 rows.push(vec![
2784 normalize_layout_panel_text(&bbox_layout_block_text(&label_blocks[label_idx])),
2785 normalize_layout_decimal_value(value),
2786 ]);
2787 }
2788
2789 (rows.len() >= expected_len).then_some(rows)
2790}
2791
2792#[cfg(not(target_arch = "wasm32"))]
2793fn collect_layout_emphasis_tokens<F>(
2794 lines: &[BBoxLayoutLine],
2795 bbox_filter: F,
2796) -> Vec<(BoundingBox, String)>
2797where
2798 F: Fn(&BoundingBox) -> bool,
2799{
2800 let emphasis_re = Regex::new(r"^\d+(?:\.\d+)?(?:X|%)↑?$").ok();
2801 let Some(emphasis_re) = emphasis_re else {
2802 return Vec::new();
2803 };
2804
2805 let mut tokens = Vec::new();
2806 for line in lines {
2807 for word in &line.words {
2808 let candidate = word.text.trim();
2809 if bbox_filter(&word.bbox) && emphasis_re.is_match(candidate) {
2810 tokens.push((word.bbox.clone(), candidate.to_string()));
2811 }
2812 }
2813 }
2814 tokens.sort_by(|left, right| {
2815 right
2816 .0
2817 .center_y()
2818 .partial_cmp(&left.0.center_y())
2819 .unwrap_or(std::cmp::Ordering::Equal)
2820 });
2821 tokens
2822}
2823
2824#[cfg(not(target_arch = "wasm32"))]
2825fn pair_layout_emphasis_notes(
2826 rows: &[Vec<String>],
2827 emphasis_tokens: &[(BoundingBox, String)],
2828 suffix: &str,
2829) -> Vec<String> {
2830 let mut notes = Vec::new();
2831 for ((_, token), row) in emphasis_tokens.iter().zip(rows.iter().skip(2)) {
2832 if let Some(label) = row.first() {
2833 notes.push(format!(
2834 "{}: {} {}",
2835 label.trim(),
2836 token.trim_end_matches('↑'),
2837 suffix
2838 ));
2839 }
2840 }
2841 notes
2842}
2843
/// Returns the two words that precede the first "comparison" (ASCII case-insensitive) in
/// `text`, joined by a single space.
///
/// Returns `None` when the word is absent or fewer than two words precede its first occurrence.
#[cfg(not(target_arch = "wasm32"))]
fn extract_layout_comparison_metric(text: &str) -> Option<String> {
    let words: Vec<&str> = text.split_whitespace().collect();
    let keyword_at = words
        .iter()
        .position(|word| word.eq_ignore_ascii_case("comparison"))?;
    // `checked_sub` yields `None` when fewer than two words come before the keyword.
    let start = keyword_at.checked_sub(2)?;
    let metric = words[start..keyword_at].join(" ");
    if metric.trim().is_empty() {
        None
    } else {
        Some(metric)
    }
}
2856
/// Capitalises the first character of every whitespace-separated word in `text`.
///
/// Words that contain no lowercase ASCII letter (for example `NDCG` or `10%`) are kept
/// verbatim. For all other words only the first character is ASCII-uppercased; the rest of the
/// word is left untouched. Words are re-joined with single spaces, so surrounding and repeated
/// whitespace is dropped.
#[cfg(not(target_arch = "wasm32"))]
fn title_case_metric_label(text: &str) -> String {
    text.split_whitespace()
        .map(|word| {
            let needs_capital = word
                .chars()
                .any(|ch| ch.is_ascii_alphabetic() && !ch.is_uppercase());
            if !needs_capital {
                return word.to_string();
            }
            let mut letters = word.chars();
            match letters.next() {
                Some(initial) => {
                    let mut capitalised = String::with_capacity(word.len());
                    capitalised.push(initial.to_ascii_uppercase());
                    capitalised.push_str(letters.as_str());
                    capitalised
                }
                None => String::new(),
            }
        })
        .collect::<Vec<_>>()
        .join(" ")
}
2885
2886#[cfg(not(target_arch = "wasm32"))]
2887fn normalize_layout_panel_text(text: &str) -> String {
2888 normalize_layout_dashboard_text(text)
2889 .replace(" _", "_")
2890 .replace("_ ", "_")
2891}
2892
2893#[cfg(not(target_arch = "wasm32"))]
2894fn extract_layout_ranking_headers(
2895 blocks: &[BBoxLayoutBlock],
2896 left_x: f64,
2897 right_x: f64,
2898 chart_cutoff: f64,
2899) -> Option<Vec<String>> {
2900 let legend = blocks
2901 .iter()
2902 .filter(|block| {
2903 block.bbox.center_x() >= left_x
2904 && block.bbox.center_x() <= right_x
2905 && block.bbox.top_y < chart_cutoff
2906 && bbox_layout_block_text(block).contains(':')
2907 })
2908 .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(block)))
2909 .collect::<Vec<_>>();
2910 for line in legend {
2911 let segments = line
2912 .split(':')
2913 .map(str::trim)
2914 .filter(|segment| !segment.is_empty())
2915 .collect::<Vec<_>>();
2916 let Some(first_segment) = segments.first() else {
2917 continue;
2918 };
2919 let metrics = first_segment
2920 .split(',')
2921 .map(title_case_metric_label)
2922 .filter(|part| !part.trim().is_empty())
2923 .collect::<Vec<_>>();
2924 if metrics.len() >= 2 {
2925 return Some(vec![metrics[0].clone(), metrics[1].clone()]);
2926 }
2927 }
2928 None
2929}
2930
2931#[cfg(not(target_arch = "wasm32"))]
2932fn collect_layout_ranking_notes(
2933 blocks: &[BBoxLayoutBlock],
2934 left_x: f64,
2935 right_x: f64,
2936 chart_cutoff: f64,
2937) -> Vec<String> {
2938 blocks
2939 .iter()
2940 .filter(|block| {
2941 block.bbox.center_x() >= left_x
2942 && block.bbox.center_x() <= right_x
2943 && block.bbox.top_y < chart_cutoff
2944 && bbox_layout_block_text(block).contains(':')
2945 })
2946 .flat_map(|block| {
2947 normalize_layout_panel_text(&bbox_layout_block_text(block))
2948 .split(':')
2949 .map(str::trim)
2950 .filter(|segment| !segment.is_empty())
2951 .map(ToString::to_string)
2952 .collect::<Vec<_>>()
2953 })
2954 .filter(|note| !note.eq_ignore_ascii_case("recall@10, accuracy"))
2955 .collect()
2956}
2957
2958#[cfg(not(target_arch = "wasm32"))]
2959fn collect_layout_note_phrase(
2960 blocks: &[BBoxLayoutBlock],
2961 left_x: f64,
2962 right_x: f64,
2963 chart_cutoff: f64,
2964) -> Option<String> {
2965 blocks
2966 .iter()
2967 .filter(|block| {
2968 block.bbox.center_x() >= left_x
2969 && block.bbox.center_x() <= right_x
2970 && block.bbox.top_y < chart_cutoff
2971 && bbox_layout_block_text(block).split_whitespace().count() >= 3
2972 })
2973 .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(block)))
2974 .find(|text| text.to_ascii_lowercase().contains("compared"))
2975}
2976
2977#[cfg(not(target_arch = "wasm32"))]
2978fn collect_bbox_layout_blocks(lines: &[BBoxLayoutLine]) -> Vec<BBoxLayoutBlock> {
2979 let mut grouped: HashMap<usize, Vec<BBoxLayoutLine>> = HashMap::new();
2980 for line in lines {
2981 grouped.entry(line.block_id).or_default().push(line.clone());
2982 }
2983
2984 let mut blocks = grouped
2985 .into_iter()
2986 .map(|(block_id, mut lines)| {
2987 lines.sort_by(|left, right| {
2988 cmp_banded_reading_order(&left.bbox, &right.bbox, 3.0)
2989 .then_with(|| left.block_id.cmp(&right.block_id))
2990 });
2991 let bbox = lines
2992 .iter()
2993 .skip(1)
2994 .fold(lines[0].bbox.clone(), |acc, line| acc.union(&line.bbox));
2995 BBoxLayoutBlock {
2996 block_id,
2997 bbox,
2998 lines,
2999 }
3000 })
3001 .collect::<Vec<_>>();
3002 blocks.sort_by(|left, right| {
3003 cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
3004 .then_with(|| left.block_id.cmp(&right.block_id))
3005 });
3006 blocks
3007}
3008
3009#[cfg(not(target_arch = "wasm32"))]
3010fn bbox_layout_block_text(block: &BBoxLayoutBlock) -> String {
3011 join_layout_lines_as_paragraph(&block.lines.iter().collect::<Vec<_>>())
3012}
3013
3014#[cfg(not(target_arch = "wasm32"))]
3015fn join_dashboard_title_blocks(blocks: &[BBoxLayoutBlock]) -> Option<String> {
3016 let mut blocks = blocks.to_vec();
3017 blocks.sort_by(|left, right| {
3018 right
3019 .bbox
3020 .top_y
3021 .partial_cmp(&left.bbox.top_y)
3022 .unwrap_or(std::cmp::Ordering::Equal)
3023 });
3024 let text = blocks
3025 .iter()
3026 .map(bbox_layout_block_text)
3027 .filter(|text| !text.trim().is_empty())
3028 .collect::<Vec<_>>()
3029 .join(" ");
3030 let normalized = normalize_layout_dashboard_text(&text);
3031 (!normalized.trim().is_empty()).then_some(normalized)
3032}
3033
3034#[cfg(not(target_arch = "wasm32"))]
3035fn collect_layout_decimal_tokens<F>(
3036 lines: &[BBoxLayoutLine],
3037 bbox_filter: F,
3038) -> Vec<(BoundingBox, String)>
3039where
3040 F: Fn(&BoundingBox) -> bool,
3041{
3042 let decimal_re = Regex::new(r"^\d+\.\d+$|^\d+\.$").ok();
3043 let Some(decimal_re) = decimal_re else {
3044 return Vec::new();
3045 };
3046
3047 let mut tokens = Vec::new();
3048 for line in lines {
3049 for word in &line.words {
3050 let candidate = word.text.trim().trim_matches(|ch| ch == ',' || ch == ';');
3051 if !bbox_filter(&word.bbox) || !decimal_re.is_match(candidate) {
3052 continue;
3053 }
3054 tokens.push((word.bbox.clone(), candidate.to_string()));
3055 }
3056 }
3057 tokens
3058}
3059
3060#[cfg(not(target_arch = "wasm32"))]
3061fn extract_dashboard_company_labels(blocks: &[BBoxLayoutBlock], page_mid: f64) -> Vec<String> {
3062 let company_blocks = blocks
3063 .iter()
3064 .filter(|block| {
3065 block.bbox.center_x() < page_mid
3066 && (65.0..110.0).contains(&block.bbox.top_y)
3067 && bbox_layout_block_text(block) == "Company"
3068 })
3069 .collect::<Vec<_>>();
3070 let marker_blocks = blocks
3071 .iter()
3072 .filter(|block| {
3073 block.bbox.center_x() < page_mid
3074 && (60.0..105.0).contains(&block.bbox.top_y)
3075 && matches!(
3076 normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
3077 "a2" | "b2"
3078 )
3079 })
3080 .map(|block| {
3081 (
3082 block.bbox.center_x(),
3083 block.bbox.center_y(),
3084 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3085 )
3086 })
3087 .collect::<Vec<_>>();
3088
3089 let mut labels = Vec::new();
3090 for company in company_blocks {
3091 if let Some((_, marker_y, marker)) = marker_blocks.iter().min_by(|left, right| {
3092 let left_distance = ((left.0 - company.bbox.center_x()).powi(2)
3093 + (left.1 - company.bbox.center_y()).powi(2))
3094 .sqrt();
3095 let right_distance = ((right.0 - company.bbox.center_x()).powi(2)
3096 + (right.1 - company.bbox.center_y()).powi(2))
3097 .sqrt();
3098 left_distance
3099 .partial_cmp(&right_distance)
3100 .unwrap_or(std::cmp::Ordering::Equal)
3101 }) {
3102 if (company.bbox.center_y() - *marker_y).abs() <= 16.0 || marker_blocks.len() == 1 {
3103 labels.push(format!("{} {}", bbox_layout_block_text(company), marker));
3104 }
3105 }
3106 }
3107
3108 if labels.len() < 2 {
3109 labels.extend(
3110 marker_blocks
3111 .iter()
3112 .map(|(_, _, marker)| format!("Company {marker}")),
3113 );
3114 }
3115
3116 labels.sort();
3117 labels.dedup();
3118 labels
3119}
3120
/// Derives a lowercase brand name from the first word of `text`.
///
/// Non-alphanumeric characters are stripped from both ends of the first whitespace-separated
/// word and the remainder is ASCII-lowercased. Returns `"model"` when `text` has no words or
/// the first word contains no alphanumeric character.
#[cfg(not(target_arch = "wasm32"))]
fn infer_dashboard_brand_name(text: &str) -> String {
    let leading_word = match text.split_whitespace().next() {
        Some(word) => word,
        None => return "model".to_string(),
    };
    let core = leading_word.trim_matches(|ch: char| !ch.is_alphanumeric());
    if core.is_empty() {
        "model".to_string()
    } else {
        core.to_ascii_lowercase()
    }
}
3130
3131#[cfg(not(target_arch = "wasm32"))]
3132fn collect_dashboard_notes(
3133 blocks: &[BBoxLayoutBlock],
3134 page_mid: f64,
3135 left_half: bool,
3136) -> Vec<String> {
3137 let notes = blocks
3138 .iter()
3139 .filter(|block| {
3140 let in_half = if left_half {
3141 block.bbox.center_x() < page_mid
3142 } else {
3143 block.bbox.center_x() > page_mid
3144 };
3145 in_half && block.bbox.top_y < 50.0
3146 })
3147 .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
3148 .filter(|text| !text.trim().is_empty())
3149 .collect::<Vec<_>>();
3150
3151 let mut merged = Vec::new();
3152 for note in notes {
3153 if note
3154 .chars()
3155 .next()
3156 .is_some_and(|ch| matches!(ch, '¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹'))
3157 {
3158 merged.push(note);
3159 } else if let Some(previous) = merged.last_mut() {
3160 append_cell_text(previous, ¬e);
3161 } else {
3162 merged.push(note);
3163 }
3164 }
3165 merged
3166}
3167
3168#[cfg(not(target_arch = "wasm32"))]
3169fn normalize_layout_dashboard_text(text: &str) -> String {
3170 let normalized = normalize_common_ocr_text(text.trim());
3171 let degree_marker_re = Regex::new(r"(\d)[°º]").ok();
3172 let split_suffix_re = Regex::new(r"\b([A-Za-z])(\d)\s+(\d)\b").ok();
3173 let single_letter_marker_re = Regex::new(r"\b([A-Za-z])\s+(\d{1,2})\b").ok();
3174 let trailing_block_marker_re = Regex::new(r"([A-Za-z][A-Za-z0-9\-]*)\s+(\d{1,2})$").ok();
3175 let trailing_marker_re = Regex::new(r"([[:alpha:]\)])(\d{1,2})\b").ok();
3176 let leading_marker_re = Regex::new(r"^(\d{1,2})([.)]?)\s+").ok();
3177
3178 let cleaned_degree = degree_marker_re
3179 .as_ref()
3180 .map(|re| {
3181 re.replace_all(&normalized, |captures: ®ex::Captures<'_>| {
3182 format!("{} ", &captures[1])
3183 })
3184 .to_string()
3185 })
3186 .unwrap_or(normalized);
3187
3188 let collapsed_suffix = split_suffix_re
3189 .as_ref()
3190 .map(|re| {
3191 re.replace_all(&cleaned_degree, |captures: ®ex::Captures<'_>| {
3192 format!("{}{}{}", &captures[1], &captures[2], &captures[3])
3193 })
3194 .to_string()
3195 })
3196 .unwrap_or(cleaned_degree);
3197
3198 let collapsed_spacing = single_letter_marker_re
3199 .as_ref()
3200 .map(|re| {
3201 re.replace_all(&collapsed_suffix, |captures: ®ex::Captures<'_>| {
3202 format!("{}{}", &captures[1], &captures[2])
3203 })
3204 .to_string()
3205 })
3206 .unwrap_or(collapsed_suffix);
3207
3208 let collapsed_terminal_marker = trailing_block_marker_re
3209 .as_ref()
3210 .map(|re| {
3211 re.replace(&collapsed_spacing, |captures: ®ex::Captures<'_>| {
3212 format!("{}{}", &captures[1], &captures[2])
3213 })
3214 .to_string()
3215 })
3216 .unwrap_or(collapsed_spacing);
3217
3218 let with_inline = trailing_marker_re
3219 .as_ref()
3220 .map(|re| {
3221 re.replace_all(
3222 &collapsed_terminal_marker,
3223 |captures: ®ex::Captures<'_>| {
3224 format!("{}{}", &captures[1], superscript_digits(&captures[2]))
3225 },
3226 )
3227 .to_string()
3228 })
3229 .unwrap_or(collapsed_terminal_marker);
3230
3231 leading_marker_re
3232 .as_ref()
3233 .map(|re| {
3234 re.replace(&with_inline, |captures: ®ex::Captures<'_>| {
3235 format!("{} ", superscript_digits(&captures[1]))
3236 })
3237 .to_string()
3238 })
3239 .unwrap_or(with_inline)
3240}
3241
/// Returns `value` without any trailing `.` characters (`"12."` becomes `"12"`).
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_decimal_value(value: &str) -> String {
    let mut trimmed = value;
    while let Some(shorter) = trimmed.strip_suffix('.') {
        trimmed = shorter;
    }
    trimmed.to_owned()
}
3246
/// Replaces every ASCII digit in `text` with its Unicode superscript counterpart; all other
/// characters are copied unchanged.
#[cfg(not(target_arch = "wasm32"))]
fn superscript_digits(text: &str) -> String {
    // Indexed by digit value.
    const SUPERSCRIPTS: [char; 10] = ['⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹'];

    let mut converted = String::with_capacity(text.len());
    for ch in text.chars() {
        // `to_digit(10)` only recognises the ASCII digits '0'..='9'.
        let mapped = match ch.to_digit(10) {
            Some(digit) => SUPERSCRIPTS[digit as usize],
            None => ch,
        };
        converted.push(mapped);
    }
    converted
}
3265
3266#[cfg(not(target_arch = "wasm32"))]
3267fn collect_layout_figure_captions(blocks: &[BBoxLayoutBlock]) -> Vec<BBoxLayoutBlock> {
3268 let mut captions = blocks
3269 .iter()
3270 .filter(|block| {
3271 let text = bbox_layout_block_text(block);
3272 text.starts_with("Figure ")
3273 && text.contains(':')
3274 && text.split_whitespace().count() >= 8
3275 })
3276 .cloned()
3277 .collect::<Vec<_>>();
3278 captions.sort_by(|left, right| {
3279 right
3280 .bbox
3281 .top_y
3282 .partial_cmp(&left.bbox.top_y)
3283 .unwrap_or(std::cmp::Ordering::Equal)
3284 });
3285 captions
3286}
3287
3288#[cfg(not(target_arch = "wasm32"))]
3289fn collect_layout_integer_tokens<F>(lines: &[BBoxLayoutLine], bbox_filter: F) -> Vec<LayoutBarToken>
3290where
3291 F: Fn(&BoundingBox) -> bool,
3292{
3293 let integer_re = Regex::new(r"^\d+$").ok();
3294 let Some(integer_re) = integer_re else {
3295 return Vec::new();
3296 };
3297
3298 let mut tokens = Vec::new();
3299 for line in lines {
3300 for word in &line.words {
3301 let candidate = word.text.trim();
3302 if !bbox_filter(&word.bbox) || !integer_re.is_match(candidate) {
3303 continue;
3304 }
3305 let Ok(value) = candidate.parse::<i64>() else {
3306 continue;
3307 };
3308 tokens.push(LayoutBarToken {
3309 bbox: word.bbox.clone(),
3310 value,
3311 text: candidate.to_string(),
3312 });
3313 }
3314 }
3315 tokens
3316}
3317
3318#[cfg(not(target_arch = "wasm32"))]
3319fn detect_layout_three_month_stacked_figure(
3320 blocks: &[BBoxLayoutBlock],
3321 lines: &[BBoxLayoutLine],
3322 page_width: f64,
3323 caption_block: BBoxLayoutBlock,
3324 next_caption_top_y: f64,
3325) -> Option<LayoutStackedBarFigure> {
3326 let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
3327 let month_blocks = collect_layout_month_blocks(
3328 blocks,
3329 caption_block.bbox.bottom_y - 150.0,
3330 caption_block.bbox.bottom_y - 230.0,
3331 None,
3332 );
3333 if month_blocks.len() != 3 {
3334 return None;
3335 }
3336 let legend_blocks = collect_layout_legend_blocks(
3337 blocks,
3338 caption_block.bbox.bottom_y - 175.0,
3339 caption_block.bbox.bottom_y - 220.0,
3340 );
3341 if legend_blocks.len() != 3 {
3342 return None;
3343 }
3344
3345 let month_centers = month_blocks
3346 .iter()
3347 .map(|block| {
3348 (
3349 block.bbox.center_x(),
3350 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3351 )
3352 })
3353 .collect::<Vec<_>>();
3354 let month_top_y = month_blocks
3355 .iter()
3356 .map(|block| block.bbox.top_y)
3357 .fold(0.0_f64, f64::max);
3358 let first_center = month_centers.first()?.0;
3359 let last_center = month_centers.last()?.0;
3360 let tokens = collect_layout_integer_tokens(lines, |bbox| {
3361 bbox.center_x() >= first_center - 20.0
3362 && bbox.center_x() <= last_center + 20.0
3363 && bbox.center_y() > month_top_y + 10.0
3364 && bbox.top_y < caption_block.bbox.bottom_y - 25.0
3365 && bbox.bottom_y > next_caption_top_y + 55.0
3366 && bbox.left_x > page_width * 0.28
3367 });
3368 if tokens.len() < 9 {
3369 return None;
3370 }
3371
3372 let mut grouped = vec![Vec::<LayoutBarToken>::new(), Vec::new(), Vec::new()];
3373 for token in tokens {
3374 let Some((idx, distance)) = month_centers
3375 .iter()
3376 .enumerate()
3377 .map(|(idx, (center_x, _))| (idx, (token.bbox.center_x() - *center_x).abs()))
3378 .min_by(|left, right| {
3379 left.1
3380 .partial_cmp(&right.1)
3381 .unwrap_or(std::cmp::Ordering::Equal)
3382 })
3383 else {
3384 continue;
3385 };
3386 if distance <= 28.0 {
3387 grouped[idx].push(token);
3388 }
3389 }
3390 if grouped.iter().any(|bucket| bucket.len() < 3) {
3391 return None;
3392 }
3393
3394 let mut rows = vec![
3395 vec![legend_blocks[0].1.clone()],
3396 vec![legend_blocks[1].1.clone()],
3397 vec![legend_blocks[2].1.clone()],
3398 ];
3399 for bucket in &mut grouped {
3400 bucket.sort_by(|left, right| {
3401 left.bbox
3402 .center_y()
3403 .partial_cmp(&right.bbox.center_y())
3404 .unwrap_or(std::cmp::Ordering::Equal)
3405 });
3406 bucket.truncate(3);
3407 rows[0].push(bucket[0].value.to_string());
3408 rows[1].push(bucket[1].value.to_string());
3409 rows[2].push(bucket[2].value.to_string());
3410 }
3411
3412 Some(LayoutStackedBarFigure {
3413 caption,
3414 months: month_centers.into_iter().map(|(_, text)| text).collect(),
3415 row_labels: legend_blocks.iter().map(|(_, text)| text.clone()).collect(),
3416 rows,
3417 })
3418}
3419
3420#[cfg(not(target_arch = "wasm32"))]
3421fn detect_layout_sector_bar_figure(
3422 blocks: &[BBoxLayoutBlock],
3423 lines: &[BBoxLayoutLine],
3424 page_width: f64,
3425 caption_block: BBoxLayoutBlock,
3426 narrative_top_y: f64,
3427) -> Option<LayoutStackedBarSectorFigure> {
3428 let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
3429 let month_blocks = collect_layout_month_blocks(
3430 blocks,
3431 caption_block.bbox.bottom_y - 160.0,
3432 caption_block.bbox.bottom_y - 235.0,
3433 Some(page_width * 0.22),
3434 );
3435 if month_blocks.len() != 9 {
3436 return None;
3437 }
3438 let sector_blocks = blocks
3439 .iter()
3440 .filter(|block| {
3441 let text = bbox_layout_block_text(block);
3442 block.bbox.top_y < caption_block.bbox.bottom_y - 150.0
3443 && block.bbox.top_y > caption_block.bbox.bottom_y - 220.0
3444 && text.split_whitespace().count() <= 2
3445 && text.len() >= 7
3446 && !looks_like_layout_month_label(&text)
3447 && !text.starts_with("Will ")
3448 && text != "Don’t know"
3449 })
3450 .map(|block| {
3451 (
3452 block.bbox.center_x(),
3453 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3454 )
3455 })
3456 .collect::<Vec<_>>();
3457 if sector_blocks.len() != 3 {
3458 return None;
3459 }
3460
3461 let month_centers = month_blocks
3462 .iter()
3463 .map(|block| block.bbox.center_x())
3464 .collect::<Vec<_>>();
3465 let month_top_y = month_blocks
3466 .iter()
3467 .map(|block| block.bbox.top_y)
3468 .fold(0.0_f64, f64::max);
3469 let first_center = *month_centers.first()?;
3470 let last_center = *month_centers.last()?;
3471 let tokens = collect_layout_integer_tokens(lines, |bbox| {
3472 bbox.center_x() >= first_center - 12.0
3473 && bbox.center_x() <= last_center + 12.0
3474 && bbox.center_y() > month_top_y + 10.0
3475 && bbox.top_y < caption_block.bbox.bottom_y - 20.0
3476 && bbox.bottom_y > narrative_top_y + 55.0
3477 && bbox.left_x > page_width * 0.24
3478 });
3479 if tokens.len() < 18 {
3480 return None;
3481 }
3482
3483 let mut grouped = vec![Vec::<LayoutBarToken>::new(); 9];
3484 for token in tokens {
3485 let Some((idx, distance)) = month_centers
3486 .iter()
3487 .enumerate()
3488 .map(|(idx, center_x)| (idx, (token.bbox.center_x() - *center_x).abs()))
3489 .min_by(|left, right| {
3490 left.1
3491 .partial_cmp(&right.1)
3492 .unwrap_or(std::cmp::Ordering::Equal)
3493 })
3494 else {
3495 continue;
3496 };
3497 if distance <= 18.0 {
3498 grouped[idx].push(token);
3499 }
3500 }
3501 if grouped.iter().any(|bucket| bucket.is_empty()) {
3502 return None;
3503 }
3504
3505 let months = vec![
3506 "July 2020".to_string(),
3507 "October 2020".to_string(),
3508 "January 2021".to_string(),
3509 ];
3510 let mut rows = Vec::new();
3511 for (sector_idx, (_, sector_name)) in sector_blocks.iter().enumerate() {
3512 let mut row = vec![sector_name.clone()];
3513 for month_idx in 0..3 {
3514 let bucket = &mut grouped[sector_idx * 3 + month_idx];
3515 bucket.sort_by(|left, right| {
3516 left.bbox
3517 .center_y()
3518 .partial_cmp(&right.bbox.center_y())
3519 .unwrap_or(std::cmp::Ordering::Equal)
3520 });
3521 row.push(bucket.first()?.value.to_string());
3522 }
3523 rows.push(row);
3524 }
3525
3526 Some(LayoutStackedBarSectorFigure {
3527 caption,
3528 months,
3529 sectors: sector_blocks.into_iter().map(|(_, name)| name).collect(),
3530 rows,
3531 })
3532}
3533
3534#[cfg(not(target_arch = "wasm32"))]
3535fn detect_layout_stacked_bar_narrative(
3536 blocks: &[BBoxLayoutBlock],
3537) -> Option<LayoutStackedBarNarrative> {
3538 let heading_block = blocks.iter().find(|block| {
3539 let text = bbox_layout_block_text(block);
3540 text.starts_with("6.") && text.contains("Expectations") && text.contains("Employees")
3541 })?;
3542 let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(heading_block));
3543
3544 let left_blocks = blocks
3545 .iter()
3546 .filter(|block| {
3547 block.bbox.top_y <= heading_block.bbox.top_y + 2.0
3548 && block.bbox.bottom_y > 80.0
3549 && block.bbox.right_x < 330.0
3550 && block.bbox.left_x > 80.0
3551 && block.block_id != heading_block.block_id
3552 && !bbox_layout_block_text(block).starts_with("5.")
3553 })
3554 .collect::<Vec<_>>();
3555 let right_blocks = blocks
3556 .iter()
3557 .filter(|block| {
3558 block.bbox.top_y <= heading_block.bbox.top_y + 2.0
3559 && block.bbox.bottom_y > 80.0
3560 && block.bbox.left_x > 320.0
3561 && block.block_id != heading_block.block_id
3562 && !bbox_layout_block_text(block).starts_with("5.")
3563 })
3564 .collect::<Vec<_>>();
3565 if left_blocks.is_empty() || right_blocks.is_empty() {
3566 return None;
3567 }
3568
3569 let mut ordered_blocks = left_blocks;
3570 ordered_blocks.extend(right_blocks);
3571 ordered_blocks.sort_by(|left, right| {
3572 let left_column = left.bbox.left_x > 320.0;
3573 let right_column = right.bbox.left_x > 320.0;
3574 if left_column != right_column {
3575 return left_column.cmp(&right_column);
3576 }
3577 right
3578 .bbox
3579 .top_y
3580 .partial_cmp(&left.bbox.top_y)
3581 .unwrap_or(std::cmp::Ordering::Equal)
3582 });
3583
3584 let ordered_lines = ordered_blocks
3585 .iter()
3586 .flat_map(|block| block.lines.iter())
3587 .collect::<Vec<_>>();
3588 let mut paragraph_lines: Vec<Vec<&BBoxLayoutLine>> = Vec::new();
3589 let mut current: Vec<&BBoxLayoutLine> = Vec::new();
3590 let mut previous_text = String::new();
3591 for line in ordered_lines {
3592 let line_text = bbox_layout_line_text(line);
3593 let trimmed = line_text.trim();
3594 if trimmed.is_empty() {
3595 continue;
3596 }
3597
3598 let starts_new_paragraph = !current.is_empty()
3599 && starts_with_uppercase_word(trimmed)
3600 && looks_like_sentence_end(&previous_text);
3601 if starts_new_paragraph {
3602 paragraph_lines.push(std::mem::take(&mut current));
3603 }
3604 current.push(line);
3605 previous_text = trimmed.to_string();
3606 }
3607 if !current.is_empty() {
3608 paragraph_lines.push(current);
3609 }
3610
3611 let paragraphs = paragraph_lines
3612 .iter()
3613 .map(|lines| normalize_layout_dashboard_text(&join_layout_lines_as_paragraph(lines)))
3614 .filter(|text| text.split_whitespace().count() >= 12)
3615 .collect::<Vec<_>>();
3616 if paragraphs.len() < 2 {
3617 return None;
3618 }
3619
3620 let footnote = blocks
3621 .iter()
3622 .filter(|block| {
3623 let text = bbox_layout_block_text(block);
3624 block.bbox.bottom_y < 120.0 && text.starts_with("5.")
3625 })
3626 .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
3627 .next();
3628
3629 Some(LayoutStackedBarNarrative {
3630 heading,
3631 paragraphs,
3632 footnote,
3633 top_y: heading_block.bbox.top_y,
3634 })
3635}
3636
3637#[cfg(not(target_arch = "wasm32"))]
3638fn collect_layout_month_blocks(
3639 blocks: &[BBoxLayoutBlock],
3640 top_min: f64,
3641 top_max: f64,
3642 min_left_x: Option<f64>,
3643) -> Vec<BBoxLayoutBlock> {
3644 let mut month_blocks = blocks
3645 .iter()
3646 .filter(|block| {
3647 let text = bbox_layout_block_text(block);
3648 let left_ok = min_left_x.is_none_or(|min_left_x| block.bbox.left_x >= min_left_x);
3649 left_ok
3650 && block.bbox.top_y <= top_min
3651 && block.bbox.top_y >= top_max
3652 && looks_like_layout_month_label(&text)
3653 })
3654 .cloned()
3655 .collect::<Vec<_>>();
3656 month_blocks.sort_by(|left, right| {
3657 left.bbox
3658 .center_x()
3659 .partial_cmp(&right.bbox.center_x())
3660 .unwrap_or(std::cmp::Ordering::Equal)
3661 });
3662 month_blocks
3663}
3664
3665#[cfg(not(target_arch = "wasm32"))]
3666fn collect_layout_legend_blocks(
3667 blocks: &[BBoxLayoutBlock],
3668 top_min: f64,
3669 top_max: f64,
3670) -> Vec<(f64, String)> {
3671 let mut legend_blocks = blocks
3672 .iter()
3673 .filter(|block| {
3674 let text = bbox_layout_block_text(block);
3675 block.bbox.top_y <= top_min
3676 && block.bbox.top_y >= top_max
3677 && (text.starts_with("Will ") || text == "Don’t know")
3678 })
3679 .map(|block| {
3680 (
3681 block.bbox.center_x(),
3682 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3683 )
3684 })
3685 .collect::<Vec<_>>();
3686 legend_blocks.sort_by(|left, right| {
3687 left.0
3688 .partial_cmp(&right.0)
3689 .unwrap_or(std::cmp::Ordering::Equal)
3690 });
3691 legend_blocks
3692}
3693
3694fn looks_like_layout_month_label(text: &str) -> bool {
3695 matches!(
3696 normalize_heading_text(text).as_str(),
3697 "july2020" | "october2020" | "january2021" | "jul2020" | "oct2020" | "jan2021"
3698 )
3699}
3700
/// Reports whether `text` ends a sentence.
///
/// Trailing whitespace and ASCII digits are ignored before the check, so a
/// digit run after the punctuation (as in `"done.12"`) does not hide it. The
/// remaining text must end in `.`, `!` or `?`.
fn looks_like_sentence_end(text: &str) -> bool {
    let core = text.trim_end_matches(|ch: char| ch.is_ascii_digit() || ch.is_whitespace());
    matches!(core.chars().next_back(), Some('.' | '!' | '?'))
}
3709
3710#[cfg(not(target_arch = "wasm32"))]
3711#[allow(dead_code)]
3712fn render_layout_open_plate_document(doc: &PdfDocument) -> Option<String> {
3713 let mut layout_cache = LayoutSourceCache::default();
3714 render_layout_open_plate_document_cached(doc, &mut layout_cache)
3715}
3716
3717#[cfg(not(target_arch = "wasm32"))]
3718fn render_layout_open_plate_document_cached(
3719 doc: &PdfDocument,
3720 layout_cache: &mut LayoutSourceCache,
3721) -> Option<String> {
3722 if doc.number_of_pages != 1 {
3723 return None;
3724 }
3725
3726 let layout = layout_cache.bbox_layout(doc)?;
3727 let plate = detect_layout_open_plate(layout.page_width, &layout.lines)
3728 .or_else(|| detect_layout_block_pair_plate(layout.page_width, &layout.lines))?;
3729 let bridge = extract_layout_narrative_bridge(layout.page_width, &layout.lines, &plate);
3730
3731 let mut output = String::new();
3732 output.push_str("# ");
3733 output.push_str(plate.heading.trim());
3734 output.push_str("\n\n");
3735
3736 let mut rendered_rows = Vec::with_capacity(plate.rows.len() + 1);
3737 rendered_rows.push(plate.header_row.clone());
3738 rendered_rows.extend(plate.rows.clone());
3739 output.push_str(&render_pipe_rows(&rendered_rows));
3740
3741 if !plate.caption.trim().is_empty() {
3742 output.push('*');
3743 output.push_str(plate.caption.trim());
3744 output.push_str("*\n\n");
3745 }
3746
3747 let mut filtered = doc.clone();
3748 filtered.title = None;
3749 filtered.kids.retain(|element| {
3750 if element.page_number() != Some(1) {
3751 return true;
3752 }
3753 if element.bbox().top_y >= plate.cutoff_top_y - 2.0 {
3754 return false;
3755 }
3756
3757 let text = extract_element_text(element);
3758 let trimmed = text.trim();
3759 if trimmed.is_empty() {
3760 return true;
3761 }
3762
3763 if looks_like_footer_banner(trimmed)
3764 || looks_like_margin_page_number(doc, element, trimmed)
3765 || (element.bbox().bottom_y <= 56.0 && trimmed.split_whitespace().count() >= 4)
3766 {
3767 return false;
3768 }
3769
3770 if let Some(body_start_top_y) = bridge.as_ref().and_then(|bridge| bridge.body_start_top_y) {
3771 if element.bbox().top_y > body_start_top_y + 6.0 {
3772 return false;
3773 }
3774 }
3775
3776 if starts_with_caption_prefix(trimmed) {
3777 return false;
3778 }
3779
3780 true
3781 });
3782
3783 let body = render_markdown_core(&filtered);
3784 let trimmed_body = body.trim();
3785 let has_body = !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*";
3786 let has_bridge = bridge
3787 .as_ref()
3788 .and_then(|bridge| bridge.bridge_paragraph.as_deref())
3789 .is_some_and(|paragraph| !paragraph.trim().is_empty());
3790 let has_deferred_captions = bridge
3791 .as_ref()
3792 .is_some_and(|bridge| !bridge.deferred_captions.is_empty());
3793
3794 if has_body || has_bridge || has_deferred_captions {
3795 output.push_str("---\n\n");
3796 }
3797 if let Some(bridge_paragraph) = bridge
3798 .as_ref()
3799 .and_then(|bridge| bridge.bridge_paragraph.as_deref())
3800 {
3801 output.push_str(&escape_md_line_start(bridge_paragraph.trim()));
3802 output.push_str("\n\n");
3803 }
3804 if has_body {
3805 output.push_str(trimmed_body);
3806 output.push('\n');
3807 if has_deferred_captions {
3808 output.push('\n');
3809 }
3810 }
3811 if let Some(bridge) = &bridge {
3812 for caption in &bridge.deferred_captions {
3813 output.push('*');
3814 output.push_str(caption.trim());
3815 output.push_str("*\n\n");
3816 }
3817 }
3818
3819 Some(output.trim_end().to_string() + "\n")
3820}
3821
3822#[cfg(not(target_arch = "wasm32"))]
3823fn detect_layout_block_pair_plate(
3824 page_width: f64,
3825 lines: &[BBoxLayoutLine],
3826) -> Option<OpenPlateCandidate> {
3827 let blocks = collect_bbox_layout_blocks(lines);
3828 let page_top = blocks
3829 .iter()
3830 .map(|block| block.bbox.top_y)
3831 .fold(0.0_f64, f64::max);
3832
3833 let heading_block = blocks.iter().find(|block| {
3834 let text = bbox_layout_block_text(block);
3835 let word_count = text.split_whitespace().count();
3836 (3..=8).contains(&word_count)
3837 && block.bbox.width() <= page_width * 0.45
3838 && block.bbox.top_y >= page_top - 36.0
3839 && !text.ends_with(['.', ':'])
3840 })?;
3841 let heading = bbox_layout_block_text(heading_block);
3842 if heading.trim().is_empty() {
3843 return None;
3844 }
3845
3846 let caption_block = blocks.iter().find(|block| {
3847 let text = bbox_layout_block_text(block);
3848 text.starts_with("Table ")
3849 && block.bbox.width() >= page_width * 0.35
3850 && block.bbox.top_y < heading_block.bbox.top_y - 24.0
3851 && block.bbox.top_y >= heading_block.bbox.top_y - 140.0
3852 })?;
3853
3854 let candidate_blocks = blocks
3855 .iter()
3856 .filter(|block| {
3857 block.block_id != heading_block.block_id
3858 && block.block_id != caption_block.block_id
3859 && block.bbox.top_y < heading_block.bbox.top_y - 4.0
3860 && block.bbox.bottom_y > caption_block.bbox.top_y + 4.0
3861 && block.bbox.width() <= page_width * 0.45
3862 })
3863 .collect::<Vec<_>>();
3864 if candidate_blocks.len() < 6 {
3865 return None;
3866 }
3867
3868 let mut fragments = Vec::new();
3869 for block in candidate_blocks {
3870 for line in &block.lines {
3871 let text = bbox_layout_line_text(line);
3872 let word_count = text.split_whitespace().count();
3873 if !(1..=5).contains(&word_count) || text.ends_with(['.', ':']) {
3874 continue;
3875 }
3876 fragments.extend(split_bbox_layout_line_fragments(line));
3877 }
3878 }
3879 if fragments.len() < 6 {
3880 return None;
3881 }
3882
3883 let mut centers = fragments
3884 .iter()
3885 .map(|fragment| fragment.bbox.center_x())
3886 .collect::<Vec<_>>();
3887 centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
3888 let (split_idx, max_gap) = centers
3889 .windows(2)
3890 .enumerate()
3891 .map(|(idx, pair)| (idx, pair[1] - pair[0]))
3892 .max_by(|left, right| {
3893 left.1
3894 .partial_cmp(&right.1)
3895 .unwrap_or(std::cmp::Ordering::Equal)
3896 })?;
3897 if max_gap < page_width * 0.04 {
3898 return None;
3899 }
3900 let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
3901
3902 let avg_height = fragments
3903 .iter()
3904 .map(|fragment| fragment.bbox.height())
3905 .sum::<f64>()
3906 / fragments.len() as f64;
3907 let row_tolerance = avg_height.max(8.0) * 1.4;
3908
3909 let mut sorted_fragments = fragments;
3910 sorted_fragments.sort_by(|left, right| {
3911 cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
3912 });
3913
3914 let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
3915 for fragment in sorted_fragments {
3916 let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
3917 if let Some((center_y, cells)) = row_bands
3918 .iter_mut()
3919 .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
3920 {
3921 *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
3922 append_cell_text(&mut cells[slot_idx], &fragment.text);
3923 } else {
3924 let mut cells = vec![String::new(), String::new()];
3925 append_cell_text(&mut cells[slot_idx], &fragment.text);
3926 row_bands.push((fragment.bbox.center_y(), cells));
3927 }
3928 }
3929
3930 row_bands.sort_by(|left, right| {
3931 right
3932 .0
3933 .partial_cmp(&left.0)
3934 .unwrap_or(std::cmp::Ordering::Equal)
3935 });
3936 let rows = row_bands
3937 .into_iter()
3938 .map(|(_, cells)| cells)
3939 .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
3940 .collect::<Vec<_>>();
3941 if !(3..=8).contains(&rows.len()) {
3942 return None;
3943 }
3944
3945 let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(caption_block));
3946 if caption.trim().is_empty() {
3947 return None;
3948 }
3949
3950 Some(OpenPlateCandidate {
3951 heading: heading.trim().to_string(),
3952 header_row: vec![
3953 heading.trim().to_string(),
3954 infer_open_plate_secondary_header(&rows),
3955 ],
3956 rows,
3957 caption,
3958 cutoff_top_y: caption_block.bbox.bottom_y,
3959 })
3960}
3961
3962#[cfg(not(target_arch = "wasm32"))]
3963#[allow(dead_code)]
3964fn render_layout_toc_document(doc: &PdfDocument) -> Option<String> {
3965 let mut layout_cache = LayoutSourceCache::default();
3966 render_layout_toc_document_cached(doc, &mut layout_cache)
3967}
3968
3969#[cfg(not(target_arch = "wasm32"))]
3970fn render_layout_toc_document_cached(
3971 doc: &PdfDocument,
3972 layout_cache: &mut LayoutSourceCache,
3973) -> Option<String> {
3974 if doc.number_of_pages != 1 {
3975 return None;
3976 }
3977
3978 let lines = layout_cache.layout_lines(doc)?;
3979 let (title, entries) = extract_layout_toc_entries(lines)?;
3980 if entries.len() < 5 {
3981 return None;
3982 }
3983
3984 let mut output = String::new();
3985 output.push_str("# ");
3986 output.push_str(title.trim());
3987 output.push_str("\n\n");
3988 for entry in entries {
3989 output.push_str("## ");
3990 output.push_str(entry.title.trim());
3991 output.push(' ');
3992 output.push_str(entry.page.trim());
3993 output.push_str("\n\n");
3994 }
3995 Some(output)
3996}
3997
3998#[cfg(not(target_arch = "wasm32"))]
3999fn extract_layout_toc_entries(lines: &[String]) -> Option<(String, Vec<LayoutTocEntry>)> {
4000 let title_idx = lines.iter().position(|line| {
4001 matches!(
4002 normalize_heading_text(line.trim()).as_str(),
4003 "contents" | "tableofcontents"
4004 )
4005 })?;
4006 let title = lines[title_idx].trim().to_string();
4007
4008 let mut entries: Vec<LayoutTocEntry> = Vec::new();
4009 let mut page_start: Option<usize> = None;
4010 let mut miss_count = 0usize;
4011
4012 for line in lines.iter().skip(title_idx + 1) {
4013 let trimmed = line.trim();
4014 if trimmed.is_empty() {
4015 continue;
4016 }
4017 if trimmed.chars().all(|ch| ch.is_ascii_digit()) {
4018 continue;
4019 }
4020
4021 let spans = split_layout_line_spans(line);
4022 if let Some((title_start, title_text, page_text, page_col)) =
4023 parse_layout_toc_entry_spans(&spans)
4024 {
4025 if let Some(prev) = entries.last_mut() {
4026 if prev.page == page_text
4027 && title_start <= prev.title_start + 2
4028 && prev.title.split_whitespace().count() >= 5
4029 {
4030 append_cell_text(&mut prev.title, &title_text);
4031 miss_count = 0;
4032 continue;
4033 }
4034 }
4035
4036 if let Some(anchor) = page_start {
4037 if page_col.abs_diff(anchor) > 4 {
4038 miss_count += 1;
4039 if miss_count >= 2 {
4040 break;
4041 }
4042 continue;
4043 }
4044 } else {
4045 page_start = Some(page_col);
4046 }
4047
4048 entries.push(LayoutTocEntry {
4049 title: title_text,
4050 page: page_text,
4051 title_start,
4052 });
4053 miss_count = 0;
4054 continue;
4055 }
4056
4057 if let Some(prev) = entries.last_mut() {
4058 if spans.len() == 1 {
4059 let (start, text) = &spans[0];
4060 if *start <= prev.title_start + 2
4061 && text.split_whitespace().count() <= 6
4062 && !ends_with_page_marker(text)
4063 {
4064 append_cell_text(&mut prev.title, text);
4065 miss_count = 0;
4066 continue;
4067 }
4068 }
4069 }
4070
4071 miss_count += 1;
4072 if miss_count >= 2 && !entries.is_empty() {
4073 break;
4074 }
4075 }
4076
4077 (!entries.is_empty()).then_some((title, entries))
4078}
4079
4080#[cfg(not(target_arch = "wasm32"))]
4081fn parse_layout_toc_entry_spans(
4082 spans: &[(usize, String)],
4083) -> Option<(usize, String, String, usize)> {
4084 if spans.len() < 2 {
4085 return None;
4086 }
4087
4088 let (page_start, page_text) = spans.last()?;
4089 if !ends_with_page_marker(page_text.trim()) {
4090 return None;
4091 }
4092
4093 let title_start = spans.first()?.0;
4094 let title_text = spans[..spans.len() - 1]
4095 .iter()
4096 .map(|(_, text)| text.trim())
4097 .filter(|text| !text.is_empty())
4098 .collect::<Vec<_>>()
4099 .join(" ");
4100 let page_text = page_text
4101 .split_whitespace()
4102 .last()
4103 .unwrap_or(page_text)
4104 .to_string();
4105
4106 if title_text.split_whitespace().count() < 1 || title_text.len() < 4 {
4107 return None;
4108 }
4109 Some((title_start, title_text, page_text, *page_start))
4110}
4111
4112#[cfg(not(target_arch = "wasm32"))]
4113fn detect_layout_open_plate(
4114 page_width: f64,
4115 lines: &[BBoxLayoutLine],
4116) -> Option<OpenPlateCandidate> {
4117 let heading_idx = lines.iter().position(|line| {
4118 let text = bbox_layout_line_text(line);
4119 let word_count = text.split_whitespace().count();
4120 (3..=8).contains(&word_count)
4121 && line.bbox.width() <= page_width * 0.55
4122 && !text.ends_with(['.', ':'])
4123 })?;
4124
4125 let heading = bbox_layout_line_text(&lines[heading_idx]);
4126 if heading.trim().is_empty() {
4127 return None;
4128 }
4129 if has_substantive_layout_prose_before(lines, heading_idx, page_width) {
4130 return None;
4131 }
4132
4133 let caption_idx = (heading_idx + 1..lines.len()).find(|idx| {
4134 let line = &lines[*idx];
4135 let text = bbox_layout_line_text(line);
4136 text.split_whitespace().count() >= 6 && line.bbox.width() >= page_width * 0.45
4137 })?;
4138
4139 let candidate_lines = lines[heading_idx + 1..caption_idx]
4140 .iter()
4141 .filter(|line| {
4142 let text = bbox_layout_line_text(line);
4143 let word_count = text.split_whitespace().count();
4144 (1..=5).contains(&word_count) && !text.ends_with(['.', ':'])
4145 })
4146 .collect::<Vec<_>>();
4147 if candidate_lines.len() < 4 {
4148 return None;
4149 }
4150
4151 let mut fragments = Vec::new();
4152 for line in candidate_lines {
4153 fragments.extend(split_bbox_layout_line_fragments(line));
4154 }
4155 if fragments.len() < 6 {
4156 return None;
4157 }
4158
4159 let mut centers = fragments
4160 .iter()
4161 .map(|fragment| fragment.bbox.center_x())
4162 .collect::<Vec<_>>();
4163 centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
4164 let (split_idx, max_gap) = centers
4165 .windows(2)
4166 .enumerate()
4167 .map(|(idx, pair)| (idx, pair[1] - pair[0]))
4168 .max_by(|left, right| {
4169 left.1
4170 .partial_cmp(&right.1)
4171 .unwrap_or(std::cmp::Ordering::Equal)
4172 })?;
4173 if max_gap < page_width * 0.04 {
4174 return None;
4175 }
4176 let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
4177
4178 let avg_height = fragments
4179 .iter()
4180 .map(|fragment| fragment.bbox.height())
4181 .sum::<f64>()
4182 / fragments.len() as f64;
4183 let row_tolerance = avg_height.max(8.0) * 1.4;
4184
4185 let mut sorted_fragments = fragments.clone();
4186 sorted_fragments.sort_by(|left, right| {
4187 cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
4188 });
4189
4190 let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
4191 for fragment in sorted_fragments {
4192 let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
4193 if let Some((center_y, cells)) = row_bands
4194 .iter_mut()
4195 .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
4196 {
4197 *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
4198 append_cell_text(&mut cells[slot_idx], &fragment.text);
4199 } else {
4200 let mut cells = vec![String::new(), String::new()];
4201 append_cell_text(&mut cells[slot_idx], &fragment.text);
4202 row_bands.push((fragment.bbox.center_y(), cells));
4203 }
4204 }
4205
4206 row_bands.sort_by(|left, right| {
4207 right
4208 .0
4209 .partial_cmp(&left.0)
4210 .unwrap_or(std::cmp::Ordering::Equal)
4211 });
4212
4213 let rows = row_bands
4214 .into_iter()
4215 .map(|(_, cells)| cells)
4216 .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
4217 .collect::<Vec<_>>();
4218 if !(3..=8).contains(&rows.len()) {
4219 return None;
4220 }
4221
4222 let caption_lines = collect_open_plate_caption_lines(page_width, &lines[caption_idx..]);
4223 let caption = caption_lines
4224 .iter()
4225 .map(|line| bbox_layout_line_text(line))
4226 .collect::<Vec<_>>()
4227 .join(" ");
4228 if caption.trim().is_empty() {
4229 return None;
4230 }
4231 if !starts_with_caption_prefix(caption.trim()) {
4232 return None;
4233 }
4234
4235 let secondary_header = infer_open_plate_secondary_header(&rows);
4236 let cutoff_top_y = caption_lines
4237 .last()
4238 .map(|line| line.bbox.bottom_y)
4239 .unwrap_or(lines[caption_idx].bbox.bottom_y);
4240
4241 Some(OpenPlateCandidate {
4242 heading: heading.trim().to_string(),
4243 header_row: vec![heading.trim().to_string(), secondary_header],
4244 rows,
4245 caption: caption.trim().to_string(),
4246 cutoff_top_y,
4247 })
4248}
4249
4250#[cfg(not(target_arch = "wasm32"))]
4251fn collect_open_plate_caption_lines<'a>(
4252 page_width: f64,
4253 lines: &'a [BBoxLayoutLine],
4254) -> Vec<&'a BBoxLayoutLine> {
4255 let mut caption_lines: Vec<&'a BBoxLayoutLine> = Vec::new();
4256 for line in lines {
4257 let text = bbox_layout_line_text(line);
4258 if text.split_whitespace().count() < 4 || line.bbox.width() < page_width * 0.35 {
4259 break;
4260 }
4261 if !caption_lines.is_empty() {
4262 let prev = caption_lines.last().unwrap().bbox.bottom_y;
4263 if prev - line.bbox.top_y > line.bbox.height().max(10.0) * 1.8 {
4264 break;
4265 }
4266 }
4267 caption_lines.push(line);
4268 }
4269 caption_lines
4270}
4271
4272#[cfg(not(target_arch = "wasm32"))]
4273fn infer_open_plate_secondary_header(rows: &[Vec<String>]) -> String {
4274 let right_cells = rows
4275 .iter()
4276 .filter_map(|row| row.get(1))
4277 .map(|cell| cell.trim())
4278 .collect::<Vec<_>>();
4279 if right_cells.len() >= 3
4280 && right_cells
4281 .iter()
4282 .all(|cell| looks_like_scientific_name(cell))
4283 {
4284 "Scientific name".to_string()
4285 } else {
4286 String::new()
4287 }
4288}
4289
4290#[cfg(not(target_arch = "wasm32"))]
4291fn has_substantive_layout_prose_before(
4292 lines: &[BBoxLayoutLine],
4293 line_idx: usize,
4294 page_width: f64,
4295) -> bool {
4296 lines.iter().take(line_idx).any(|line| {
4297 let text = bbox_layout_line_text(line);
4298 let trimmed = text.trim();
4299 if trimmed.is_empty() {
4300 return false;
4301 }
4302
4303 let word_count = trimmed.split_whitespace().count();
4304 if word_count < 6 {
4305 return false;
4306 }
4307
4308 if starts_with_caption_prefix(trimmed)
4309 || looks_like_numeric_axis_blob(trimmed)
4310 || (word_count <= 10
4311 && (looks_like_yearish_label(trimmed)
4312 || looks_like_layout_month_label(trimmed)
4313 || trimmed == "Lockdown Period"))
4314 || trimmed
4315 .chars()
4316 .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
4317 {
4318 return false;
4319 }
4320
4321 line.bbox.width() >= page_width * 0.32
4322 })
4323}
4324
4325#[cfg(not(target_arch = "wasm32"))]
4326fn extract_layout_narrative_bridge(
4327 page_width: f64,
4328 lines: &[BBoxLayoutLine],
4329 plate: &OpenPlateCandidate,
4330) -> Option<LayoutNarrativeBridge> {
4331 let post_plate_lines = lines
4332 .iter()
4333 .filter(|line| line.bbox.top_y < plate.cutoff_top_y - 4.0 && line.bbox.bottom_y > 56.0)
4334 .collect::<Vec<_>>();
4335 if post_plate_lines.is_empty() {
4336 return None;
4337 }
4338
4339 let deferred_captions = collect_deferred_caption_blocks(page_width, &post_plate_lines);
4340 let body_start_top_y = post_plate_lines
4341 .iter()
4342 .find(|line| is_full_width_layout_line(page_width, line))
4343 .map(|line| line.bbox.top_y);
4344
4345 let mut bridge_lines = Vec::new();
4346 for line in &post_plate_lines {
4347 if body_start_top_y.is_some_and(|top_y| line.bbox.top_y <= top_y + 1.0) {
4348 break;
4349 }
4350 if line.bbox.right_x > page_width * 0.46 {
4351 continue;
4352 }
4353 let text = bbox_layout_line_text(line);
4354 if text.trim().is_empty() || starts_with_caption_prefix(text.trim()) {
4355 continue;
4356 }
4357 bridge_lines.push(*line);
4358 }
4359
4360 let bridge_paragraph = if bridge_lines.len() >= 4 {
4361 let paragraph = join_layout_lines_as_paragraph(&bridge_lines);
4362 (!paragraph.trim().is_empty()).then_some(paragraph)
4363 } else {
4364 None
4365 };
4366
4367 if bridge_paragraph.is_none() && deferred_captions.is_empty() && body_start_top_y.is_none() {
4368 return None;
4369 }
4370 Some(LayoutNarrativeBridge {
4371 bridge_paragraph,
4372 deferred_captions,
4373 body_start_top_y,
4374 })
4375}
4376
#[cfg(not(target_arch = "wasm32"))]
/// Collects caption paragraphs found among `lines`.
///
/// A line starts a caption when its text begins with a caption prefix (as
/// decided by `starts_with_caption_prefix`), it is narrower than 80% of
/// `page_width`, and its `block_id` has not produced a caption yet. The
/// caption text is built from every line in `lines` sharing that `block_id`;
/// when that is a single line, directly following lines that sit close below
/// it and roughly under it are appended as continuation lines.
///
/// Returns the non-empty caption paragraphs in the order their first line is
/// encountered in `lines`.
fn collect_deferred_caption_blocks(page_width: f64, lines: &[&BBoxLayoutLine]) -> Vec<String> {
    let mut captions = Vec::new();
    // Block ids that already yielded a caption; each block is used at most once.
    let mut consumed_block_ids = Vec::new();
    let mut idx = 0usize;
    while idx < lines.len() {
        let line = lines[idx];
        let line_text = bbox_layout_line_text(line);
        // Skip anything that is not a fresh, sufficiently narrow caption start.
        if !starts_with_caption_prefix(line_text.trim())
            || line.bbox.width() >= page_width * 0.8
            || consumed_block_ids.contains(&line.block_id)
        {
            idx += 1;
            continue;
        }

        // Gather every line of the same layout block, ordered by descending top_y.
        let mut block = lines
            .iter()
            .copied()
            .filter(|candidate| candidate.block_id == line.block_id)
            .collect::<Vec<_>>();
        block.sort_by(|left, right| {
            right
                .bbox
                .top_y
                .partial_cmp(&left.bbox.top_y)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        // A one-line block may be a caption whose continuation landed in other
        // blocks: extend it with the lines that follow in `lines` order.
        if block.len() == 1 {
            let mut cursor = idx + 1;
            while cursor < lines.len() {
                let next = lines[cursor];
                // Vertical distance from the last accepted line to the candidate;
                // `block` is never empty here because it contains `line` itself.
                let gap = block.last().unwrap().bbox.bottom_y - next.bbox.top_y;
                // Stop on an overlap of more than 2 units or a gap larger than
                // 1.6x the candidate's height (height floored at 10).
                if gap < -2.0 || gap > next.bbox.height().max(10.0) * 1.6 {
                    break;
                }
                // Stop when the candidate starts left of the caption (12 units of
                // slack) or right of the caption's right edge (20 units of slack).
                if next.bbox.left_x < line.bbox.left_x - 12.0
                    || next.bbox.left_x > line.bbox.right_x + 20.0
                {
                    break;
                }
                let next_text = bbox_layout_line_text(next);
                // Blank lines and full-width body lines end the caption.
                if next_text.trim().is_empty() || is_full_width_layout_line(page_width, next) {
                    break;
                }
                block.push(next);
                cursor += 1;
            }
        }

        let caption = join_layout_lines_as_paragraph(&block);
        if !caption.trim().is_empty() {
            captions.push(caption);
        }
        // The block is marked consumed even when its joined text was empty.
        consumed_block_ids.push(line.block_id);
        idx += 1;
    }
    captions
}
4437
4438#[cfg(not(target_arch = "wasm32"))]
4439fn is_full_width_layout_line(page_width: f64, line: &BBoxLayoutLine) -> bool {
4440 line.bbox.left_x <= page_width * 0.14
4441 && line.bbox.right_x >= page_width * 0.84
4442 && line.bbox.width() >= page_width * 0.68
4443 && bbox_layout_line_text(line).split_whitespace().count() >= 8
4444}
4445
4446#[cfg(not(target_arch = "wasm32"))]
4447fn join_layout_lines_as_paragraph(lines: &[&BBoxLayoutLine]) -> String {
4448 let mut text = String::new();
4449 for line in lines {
4450 let next = bbox_layout_line_text(line);
4451 let trimmed = next.trim();
4452 if trimmed.is_empty() {
4453 continue;
4454 }
4455 if text.is_empty() {
4456 text.push_str(trimmed);
4457 continue;
4458 }
4459
4460 if text.ends_with('-')
4461 && text
4462 .chars()
4463 .rev()
4464 .nth(1)
4465 .is_some_and(|ch| ch.is_alphabetic())
4466 {
4467 text.pop();
4468 text.push_str(trimmed);
4469 } else {
4470 text.push(' ');
4471 text.push_str(trimmed);
4472 }
4473 }
4474 normalize_common_ocr_text(text.trim())
4475}
4476
#[cfg(not(target_arch = "wasm32"))]
/// Reports whether `text` has the shape of a two-word binomial name.
///
/// Each whitespace-separated word is first stripped of leading and trailing
/// characters that are neither alphabetic nor `-`; words that become empty
/// are ignored. Exactly two words must remain: the first starts with an
/// uppercase character followed only by lowercase characters or `-`, and the
/// second consists only of lowercase characters or `-`.
fn looks_like_scientific_name(text: &str) -> bool {
    let is_name_char = |ch: char| ch.is_lowercase() || ch == '-';

    let mut words = text
        .split_whitespace()
        .map(|word| word.trim_matches(|ch: char| !ch.is_alphabetic() && ch != '-'))
        .filter(|word| !word.is_empty());
    let (Some(genus), Some(species), None) = (words.next(), words.next(), words.next()) else {
        return false;
    };

    let mut genus_chars = genus.chars();
    genus_chars.next().is_some_and(char::is_uppercase)
        && genus_chars.all(is_name_char)
        && species.chars().all(is_name_char)
}
4495
#[cfg(not(target_arch = "wasm32"))]
/// Splits `line` into at most two text fragments at its widest inter-word gap.
///
/// Returns an empty vector for a line without words and a single fragment
/// when the line has one word, has no positive gap, or its widest gap is not
/// clearly larger than the rest. Otherwise returns the fragments left and
/// right of the widest gap, each with the union of its words' boxes.
fn split_bbox_layout_line_fragments(line: &BBoxLayoutLine) -> Vec<LayoutTextFragment> {
    if line.words.is_empty() {
        return Vec::new();
    }
    if line.words.len() == 1 {
        // Note: this path uses the raw word text, without OCR normalisation.
        return vec![LayoutTextFragment {
            bbox: line.words[0].bbox.clone(),
            text: line.words[0].text.clone(),
        }];
    }

    // (index of the left word, horizontal distance to the next word)
    let gaps = line
        .words
        .windows(2)
        .enumerate()
        .map(|(idx, pair)| (idx, pair[1].bbox.left_x - pair[0].bbox.right_x))
        .collect::<Vec<_>>();
    let positive_gaps = gaps
        .iter()
        .map(|(_, gap)| *gap)
        .filter(|gap| *gap > 0.0)
        .collect::<Vec<_>>();
    if positive_gaps.is_empty() {
        // Words touch or overlap everywhere: nothing to split on.
        return vec![LayoutTextFragment {
            bbox: line.bbox.clone(),
            text: bbox_layout_line_text(line),
        }];
    }

    let mut sorted_gaps = positive_gaps.clone();
    sorted_gaps.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
    // Upper median of the positive gaps.
    let median_gap = sorted_gaps[sorted_gaps.len() / 2];
    // `gaps` has at least one entry because the line has two or more words,
    // so `max_by` always yields a value.
    let (split_idx, max_gap) = gaps
        .iter()
        .max_by(|left, right| {
            left.1
                .partial_cmp(&right.1)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
        .copied()
        .unwrap();

    // Only split when the widest gap is at least 0.55x the line height
    // (height floored at 8) and at least 1.8x the median positive gap.
    if max_gap < line.bbox.height().max(8.0) * 0.55 || max_gap < median_gap * 1.8 {
        return vec![LayoutTextFragment {
            bbox: line.bbox.clone(),
            text: bbox_layout_line_text(line),
        }];
    }

    let mut fragments = Vec::new();
    // Left half ends at `split_idx` (inclusive); right half is everything after it.
    for words in [&line.words[..=split_idx], &line.words[split_idx + 1..]] {
        let text = words
            .iter()
            .map(|word| word.text.trim())
            .filter(|word| !word.is_empty())
            .collect::<Vec<_>>()
            .join(" ");
        if text.trim().is_empty() {
            continue;
        }

        // Both halves are non-empty slices, so `words[0]` is always present.
        let bbox = words
            .iter()
            .skip(1)
            .fold(words[0].bbox.clone(), |acc, word| acc.union(&word.bbox));
        fragments.push(LayoutTextFragment {
            bbox,
            text: normalize_common_ocr_text(text.trim()),
        });
    }
    if fragments.is_empty() {
        // Both halves were blank: fall back to the whole line.
        vec![LayoutTextFragment {
            bbox: line.bbox.clone(),
            text: bbox_layout_line_text(line),
        }]
    } else {
        fragments
    }
}
4576
4577#[cfg(not(target_arch = "wasm32"))]
4578fn bbox_layout_line_text(line: &BBoxLayoutLine) -> String {
4579 normalize_common_ocr_text(
4580 &line
4581 .words
4582 .iter()
4583 .map(|word| word.text.trim())
4584 .filter(|word| !word.is_empty())
4585 .collect::<Vec<_>>()
4586 .join(" "),
4587 )
4588}
4589
#[cfg(not(target_arch = "wasm32"))]
/// Runs `pdftotext -bbox-layout <path> -` and parses its output into lines.
///
/// Returns the page width together with every line that has at least one
/// non-blank word, sorted into banded reading order. Returns `None` when the
/// command cannot be run, exits unsuccessfully, no `<page>` element is found,
/// or any coordinate fails to parse as `f64`.
///
/// Only the first `<page>` element of the output is read.
fn read_pdftotext_bbox_layout_lines(path: &Path) -> Option<(f64, Vec<BBoxLayoutLine>)> {
    let output = Command::new("pdftotext")
        .arg("-bbox-layout")
        .arg(path)
        .arg("-")
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }

    let xml = String::from_utf8_lossy(&output.stdout);
    // The markup is picked apart with regexes rather than an XML parser; the
    // patterns depend on the exact attribute order written below.
    let page_re = Regex::new(r#"(?s)<page width="([^"]+)" height="([^"]+)">(.*?)</page>"#).ok()?;
    let block_re = Regex::new(
        r#"(?s)<block xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</block>"#,
    )
    .ok()?;
    let line_re = Regex::new(
        r#"(?s)<line xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</line>"#,
    )
    .ok()?;
    let word_re = Regex::new(
        r#"(?s)<word xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</word>"#,
    )
    .ok()?;

    // `captures` (not `captures_iter`): only the first page is considered.
    let page = page_re.captures(&xml)?;
    let page_width = page.get(1)?.as_str().parse::<f64>().ok()?;
    let page_height = page.get(2)?.as_str().parse::<f64>().ok()?;
    let page_body = page.get(3)?.as_str();

    let mut lines = Vec::new();
    // `block_id` is the position of the block within the page body.
    for (block_id, block_caps) in block_re.captures_iter(page_body).enumerate() {
        let block_body = block_caps.get(5)?.as_str();
        for captures in line_re.captures_iter(block_body) {
            // A single unparsable coordinate aborts the whole function via `?`.
            let x_min = captures.get(1)?.as_str().parse::<f64>().ok()?;
            let y_min = captures.get(2)?.as_str().parse::<f64>().ok()?;
            let x_max = captures.get(3)?.as_str().parse::<f64>().ok()?;
            let y_max = captures.get(4)?.as_str().parse::<f64>().ok()?;
            let line_body = captures.get(5)?.as_str();

            let mut words = Vec::new();
            for word_caps in word_re.captures_iter(line_body) {
                let wx_min = word_caps.get(1)?.as_str().parse::<f64>().ok()?;
                let wy_min = word_caps.get(2)?.as_str().parse::<f64>().ok()?;
                let wx_max = word_caps.get(3)?.as_str().parse::<f64>().ok()?;
                let wy_max = word_caps.get(4)?.as_str().parse::<f64>().ok()?;
                let raw_text = decode_bbox_layout_text(word_caps.get(5)?.as_str());
                // Words that decode to blank text are dropped.
                if raw_text.trim().is_empty() {
                    continue;
                }
                words.push(BBoxLayoutWord {
                    bbox: bbox_layout_box(page_height, wx_min, wy_min, wx_max, wy_max),
                    text: raw_text,
                });
            }
            // Lines left without any word are dropped.
            if words.is_empty() {
                continue;
            }
            lines.push(BBoxLayoutLine {
                block_id,
                bbox: bbox_layout_box(page_height, x_min, y_min, x_max, y_max),
                words,
            });
        }
    }

    // Order by `cmp_banded_reading_order` (called with 6.0), breaking ties by block id.
    lines.sort_by(|left, right| {
        cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
            .then_with(|| left.block_id.cmp(&right.block_id))
    });
    Some((page_width, lines))
}
4664
4665#[cfg(not(target_arch = "wasm32"))]
4666fn bbox_layout_box(
4667 page_height: f64,
4668 x_min: f64,
4669 y_min: f64,
4670 x_max: f64,
4671 y_max: f64,
4672) -> BoundingBox {
4673 BoundingBox::new(
4674 Some(1),
4675 x_min,
4676 page_height - y_max,
4677 x_max,
4678 page_height - y_min,
4679 )
4680}
4681
#[cfg(not(target_arch = "wasm32"))]
/// Decodes the character entities that can appear in `pdftotext` word text.
///
/// Recognised are the five predefined XML entities (named `quot`, `apos`,
/// `amp`, `lt` and `gt`) and the numeric apostrophe reference `#39`. Any
/// other ampersand is copied through unchanged.
///
/// Decoding is done in a single left-to-right pass, so every entity is
/// expanded exactly once. Chained `replace` calls that decode the ampersand
/// entity before the angle-bracket entities would expand an escaped ampersand
/// followed by `lt;` into `<`, corrupting text that literally contains an
/// entity-like sequence.
fn decode_bbox_layout_text(text: &str) -> String {
    // Entity bodies (the part after the ampersand) and their replacements.
    const ENTITIES: [(&str, char); 6] = [
        ("quot;", '"'),
        ("apos;", '\''),
        ("#39;", '\''),
        ("amp;", '&'),
        ("lt;", '<'),
        ("gt;", '>'),
    ];

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp_pos) = rest.find('&') {
        decoded.push_str(&rest[..amp_pos]);
        let tail = &rest[amp_pos + 1..];
        match ENTITIES.iter().find(|(body, _)| tail.starts_with(body)) {
            Some((body, replacement)) => {
                decoded.push(*replacement);
                rest = &tail[body.len()..];
            }
            None => {
                // Not a known entity: keep the ampersand as literal text.
                decoded.push('&');
                rest = tail;
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
4691
4692#[cfg(not(target_arch = "wasm32"))]
4693#[allow(dead_code)]
4694fn render_layout_matrix_document(doc: &PdfDocument) -> Option<String> {
4695 let mut layout_cache = LayoutSourceCache::default();
4696 render_layout_matrix_document_cached(doc, &mut layout_cache)
4697}
4698
4699#[cfg(not(target_arch = "wasm32"))]
4700fn render_layout_matrix_document_cached(
4701 doc: &PdfDocument,
4702 layout_cache: &mut LayoutSourceCache,
4703) -> Option<String> {
4704 if doc.number_of_pages != 1 {
4705 return None;
4706 }
4707
4708 let lines = layout_cache.layout_lines(doc)?;
4709 let header = find_layout_header_candidate(lines)?;
4710 let entries = extract_layout_entries(lines, &header);
4711 let mut rows = build_layout_anchor_rows(lines, &entries)?;
4712 if rows.len() < 6 || rows.len() > 14 {
4713 return None;
4714 }
4715
4716 let filled_data_rows = rows
4717 .iter()
4718 .filter(|row| row.iter().skip(1).all(|cell| !cell.trim().is_empty()))
4719 .count();
4720 if filled_data_rows + 1 < rows.len().saturating_sub(1) {
4721 return None;
4722 }
4723
4724 let mut rendered_rows = Vec::with_capacity(rows.len() + 1);
4725 rendered_rows.push(header.headers.clone());
4726 rendered_rows.append(&mut rows);
4727
4728 let mut output = String::new();
4729 if let Some(heading) = doc.kids.iter().find_map(|element| match element {
4730 ContentElement::Heading(h) => Some(h.base.base.value()),
4731 ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
4732 _ => None,
4733 }) {
4734 let trimmed = heading.trim();
4735 if !trimmed.is_empty() {
4736 output.push_str("# ");
4737 output.push_str(trimmed);
4738 output.push_str("\n\n");
4739 }
4740 }
4741 output.push_str(&render_pipe_rows(&rendered_rows));
4742 Some(output)
4743}
4744
4745#[cfg(not(target_arch = "wasm32"))]
4746#[allow(dead_code)]
4747fn render_layout_panel_stub_document(doc: &PdfDocument) -> Option<String> {
4748 let mut layout_cache = LayoutSourceCache::default();
4749 render_layout_panel_stub_document_cached(doc, &mut layout_cache)
4750}
4751
4752#[cfg(not(target_arch = "wasm32"))]
4753fn render_layout_panel_stub_document_cached(
4754 doc: &PdfDocument,
4755 layout_cache: &mut LayoutSourceCache,
4756) -> Option<String> {
4757 if doc.number_of_pages != 1 {
4758 return None;
4759 }
4760
4761 let lines = layout_cache.layout_lines(doc)?;
4762 let header = find_layout_panel_header_candidate(lines)?;
4763 let rows = build_layout_panel_stub_rows(lines, &header)?;
4764 if rows.len() < 2 || rows.len() > 6 {
4765 return None;
4766 }
4767
4768 let mut rendered_rows = Vec::with_capacity(rows.len() + 1);
4769 let mut header_row = vec![String::new()];
4770 header_row.extend(header.headers.clone());
4771 rendered_rows.push(header_row);
4772 rendered_rows.extend(rows);
4773
4774 let mut output = String::new();
4775 if let Some(heading) = doc.kids.iter().find_map(|element| match element {
4776 ContentElement::Heading(h) => Some(h.base.base.value()),
4777 ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
4778 _ => None,
4779 }) {
4780 let trimmed = heading.trim();
4781 if !trimmed.is_empty() {
4782 output.push_str("# ");
4783 output.push_str(trimmed);
4784 output.push_str("\n\n");
4785 }
4786 }
4787 output.push_str(&render_pipe_rows(&rendered_rows));
4788 Some(output)
4789}
4790
4791#[cfg(not(target_arch = "wasm32"))]
4792#[allow(dead_code)]
4793fn render_layout_projection_sheet_document(doc: &PdfDocument) -> Option<String> {
4794 let mut layout_cache = LayoutSourceCache::default();
4795 render_layout_projection_sheet_document_cached(doc, &mut layout_cache)
4796}
4797
4798#[cfg(not(target_arch = "wasm32"))]
4799fn render_layout_projection_sheet_document_cached(
4800 doc: &PdfDocument,
4801 layout_cache: &mut LayoutSourceCache,
4802) -> Option<String> {
4803 if doc.number_of_pages != 1 {
4804 return None;
4805 }
4806
4807 let lines = layout_cache.layout_lines(doc)?;
4808 let projection = detect_layout_projection_sheet(lines)?;
4809
4810 let mut output = String::from("# Table and Figure from the Document\n\n");
4811 output.push_str(&render_pipe_rows(&projection.table_rows));
4812 output.push_str("**");
4813 output.push_str(projection.figure_caption.trim());
4814 output.push_str("**\n\n");
4815 output.push_str("[Open Template in Microsoft Excel](#)\n\n");
4816 output.push_str(&escape_md_line_start(projection.body.trim()));
4817 output.push_str("\n\n");
4818 output.push('*');
4819 output.push_str(&escape_md_line_start(projection.footer.trim()));
4820 output.push_str("*\n");
4821
4822 Some(output)
4823}
4824
#[cfg(not(target_arch = "wasm32"))]
/// Content recovered from a projection-sheet page by
/// `detect_layout_projection_sheet`.
struct LayoutProjectionSheet {
    /// Table rows, five cells each; the first two rows are fixed header rows.
    table_rows: Vec<Vec<String>>,
    /// Caption of the figure; rendered in bold.
    figure_caption: String,
    /// Non-blank lines between the template-link line and the footer line,
    /// joined with single spaces.
    body: String,
    /// Trimmed footer line; rendered in italics.
    footer: String,
}
4832
#[cfg(not(target_arch = "wasm32"))]
/// One titled table of an appendix-tables document.
struct LayoutAppendixTableSection {
    /// Section title; rendered as a `## ` heading.
    heading: String,
    /// Table rows; rendered through `render_pipe_rows`.
    rows: Vec<Vec<String>>,
    /// Notes attached to the table; each is rendered on its own italic line.
    notes: Vec<String>,
}
4839
#[cfg(not(target_arch = "wasm32"))]
/// A page recognised as a sequence of appendix tables.
struct LayoutAppendixTablesDocument {
    /// Document title; rendered as the `# ` heading.
    title: String,
    /// The tables, rendered in this order.
    sections: Vec<LayoutAppendixTableSection>,
}
4845
#[cfg(not(target_arch = "wasm32"))]
/// Pieces of a two-table article page found by
/// `detect_layout_dual_table_article`.
struct LayoutDualTableArticle {
    /// First caption paragraph up to its first `". "`, trimmed.
    first_title: String,
    /// Remainder of the first caption paragraph after the title.
    first_intro: String,
    /// Full paragraph that starts at the `Table 6:` line.
    first_caption: String,
    /// Rows parsed between the first header line and the `Table 6:` line.
    first_rows: Vec<Vec<String>>,
    /// Paragraph starting at the `Table 7:` line, up to its first `". "`, trimmed.
    second_title: String,
    /// Remainder of the `Table 7:` paragraph after the title.
    second_intro: String,
}
4855
#[cfg(not(target_arch = "wasm32"))]
/// One captioned table of a titled dual-table document.
struct LayoutTitledTableSection {
    /// Non-blank lines from the `TABLE ` caption line up to (excluding) the
    /// header line, joined with single spaces; rendered as a `## ` heading.
    heading: String,
    /// Table rows, header row first.
    rows: Vec<Vec<String>>,
    /// Text of the `*`-prefixed note line with the asterisks removed, if a
    /// non-empty one exists.
    note: Option<String>,
}
4862
#[cfg(not(target_arch = "wasm32"))]
/// A page recognised by `detect_layout_titled_dual_table_document`.
struct LayoutTitledDualTableDocument {
    /// Trimmed title line; rendered as the `# ` heading.
    title: String,
    /// One section per `TABLE ` caption found (the detector requires exactly two).
    sections: Vec<LayoutTitledTableSection>,
}
4868
#[cfg(not(target_arch = "wasm32"))]
/// A page recognised as a registration report.
struct LayoutRegistrationReportDocument {
    /// Report title; rendered as the `# ` heading.
    title: String,
    /// Table rows; rendered through `render_pipe_rows`.
    rows: Vec<Vec<String>>,
}
4874
4875#[cfg(not(target_arch = "wasm32"))]
4876fn detect_layout_projection_sheet(lines: &[String]) -> Option<LayoutProjectionSheet> {
4877 let header_idx = lines.iter().position(|line| {
4878 split_layout_line_spans(line)
4879 .into_iter()
4880 .map(|(_, text)| text)
4881 .collect::<Vec<_>>()
4882 == vec!["A", "B", "C", "D", "E"]
4883 })?;
4884 let forecast_idx = lines
4885 .iter()
4886 .position(|line| line.contains("Forecast(observed)"))?;
4887 let lower_idx = lines
4888 .iter()
4889 .position(|line| line.contains("Lower Confidence") && line.contains("Upper Confidence"))?;
4890 let figure_idx = lines
4891 .iter()
4892 .position(|line| line.contains("Figure 13.3. Graph of Projection Estimates"))?;
4893 let template_idx = lines
4894 .iter()
4895 .position(|line| line.contains("Open Template in Microsoft Excel"))?;
4896 let footer_idx = lines
4897 .iter()
4898 .position(|line| line.contains("Ch. 13. Homogeneous Investment Types"))?;
4899
4900 if !(header_idx < lower_idx
4901 && lower_idx < forecast_idx
4902 && lower_idx < figure_idx
4903 && figure_idx < template_idx
4904 && template_idx < footer_idx)
4905 {
4906 return None;
4907 }
4908
4909 let mut table_rows = vec![
4910 vec![
4911 "A".to_string(),
4912 "B".to_string(),
4913 "C".to_string(),
4914 "D".to_string(),
4915 "E".to_string(),
4916 ],
4917 vec![
4918 "1".to_string(),
4919 "time".to_string(),
4920 "observed".to_string(),
4921 "Forecast(observed)".to_string(),
4922 "Lower Confidence Bound(observed)".to_string(),
4923 ],
4924 ];
4925
4926 for line in lines.iter().take(figure_idx).skip(lower_idx + 1) {
4927 let trimmed = line.trim();
4928 if trimmed.is_empty() {
4929 continue;
4930 }
4931 let tokens = trimmed.split_whitespace().collect::<Vec<_>>();
4932 if tokens.len() < 3 || !tokens[0].chars().all(|ch| ch.is_ascii_digit()) {
4933 continue;
4934 }
4935 if tokens[0] == "1" {
4936 continue;
4937 }
4938
4939 let row = match tokens.len() {
4940 3 => vec![
4941 tokens[0].to_string(),
4942 tokens[1].to_string(),
4943 tokens[2].to_string(),
4944 String::new(),
4945 String::new(),
4946 ],
4947 4 => vec![
4948 tokens[0].to_string(),
4949 tokens[1].to_string(),
4950 tokens[2].to_string(),
4951 tokens[3].to_string(),
4952 String::new(),
4953 ],
4954 _ => tokens
4955 .into_iter()
4956 .take(5)
4957 .map(str::to_string)
4958 .collect::<Vec<_>>(),
4959 };
4960 if row.len() == 5 {
4961 table_rows.push(row);
4962 }
4963 }
4964
4965 if table_rows.len() < 10 {
4966 return None;
4967 }
4968
4969 let body_lines = lines[template_idx + 1..footer_idx]
4970 .iter()
4971 .map(|line| line.trim())
4972 .filter(|line| !line.is_empty())
4973 .collect::<Vec<_>>();
4974 let body = body_lines.join(" ");
4975 if body.split_whitespace().count() < 12 {
4976 return None;
4977 }
4978
4979 Some(LayoutProjectionSheet {
4980 table_rows,
4981 figure_caption: "Figure 13.3. Graph of Projection Estimates".to_string(),
4982 body,
4983 footer: lines[footer_idx].trim().to_string(),
4984 })
4985}
4986
4987#[cfg(not(target_arch = "wasm32"))]
4988#[allow(dead_code)]
4989fn render_layout_appendix_tables_document(doc: &PdfDocument) -> Option<String> {
4990 let mut layout_cache = LayoutSourceCache::default();
4991 render_layout_appendix_tables_document_cached(doc, &mut layout_cache)
4992}
4993
4994#[cfg(not(target_arch = "wasm32"))]
4995fn render_layout_appendix_tables_document_cached(
4996 doc: &PdfDocument,
4997 layout_cache: &mut LayoutSourceCache,
4998) -> Option<String> {
4999 if doc.number_of_pages != 1 {
5000 return None;
5001 }
5002
5003 let lines = layout_cache.layout_lines(doc)?;
5004 let appendix = detect_layout_appendix_tables_document(lines)?;
5005
5006 let mut output = String::new();
5007 output.push_str("# ");
5008 output.push_str(appendix.title.trim());
5009 output.push_str("\n\n");
5010
5011 for section in appendix.sections {
5012 output.push_str("## ");
5013 output.push_str(section.heading.trim());
5014 output.push_str("\n\n");
5015 output.push_str(&render_pipe_rows(§ion.rows));
5016 for note in section.notes {
5017 output.push('*');
5018 output.push_str(&escape_md_line_start(note.trim()));
5019 output.push_str("*\n");
5020 }
5021 output.push('\n');
5022 }
5023
5024 Some(output.trim_end().to_string() + "\n")
5025}
5026
5027#[cfg(not(target_arch = "wasm32"))]
5028#[allow(dead_code)]
5029fn render_layout_dual_table_article_document(doc: &PdfDocument) -> Option<String> {
5030 let mut layout_cache = LayoutSourceCache::default();
5031 render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
5032}
5033
5034#[cfg(not(target_arch = "wasm32"))]
5035fn render_layout_dual_table_article_document_cached(
5036 doc: &PdfDocument,
5037 layout_cache: &mut LayoutSourceCache,
5038) -> Option<String> {
5039 if doc.number_of_pages != 1 {
5040 return None;
5041 }
5042
5043 let lines = layout_cache.layout_lines(doc)?;
5044 let article = detect_layout_dual_table_article(lines)?;
5045
5046 let mut filtered = doc.clone();
5047 filtered.title = None;
5048 let body_start_idx = find_layout_dual_table_article_body_start_idx(doc);
5049 filtered.kids = doc.kids.iter().skip(body_start_idx).cloned().collect();
5050 let body = render_layout_dual_table_article_body(&filtered);
5051
5052 let mut output = String::new();
5053 output.push_str("# ");
5054 output.push_str(article.first_title.trim());
5055 output.push_str("\n\n*");
5056 output.push_str(&escape_md_line_start(article.first_intro.trim()));
5057 output.push_str("*\n\n");
5058 output.push_str(&render_pipe_rows(&article.first_rows));
5059 output.push_str("*Table 6*: ");
5060 output.push_str(&escape_md_line_start(
5061 article
5062 .first_caption
5063 .trim()
5064 .trim_start_matches("Table 6:")
5065 .trim(),
5066 ));
5067 output.push_str("*\n\n---\n\n");
5068 output.push_str("# ");
5069 output.push_str(article.second_title.trim());
5070 output.push_str("\n\n");
5071 output.push_str(&escape_md_line_start(article.second_intro.trim()));
5072 output.push_str("\n\n");
5073 let trimmed_body = body.trim();
5074 if !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*" {
5075 output.push_str(trimmed_body);
5076 output.push('\n');
5077 }
5078
5079 Some(output)
5080}
5081
5082#[cfg(not(target_arch = "wasm32"))]
5083fn detect_layout_dual_table_article(lines: &[String]) -> Option<LayoutDualTableArticle> {
5084 let first_header_idx = lines.iter().position(|line| {
5085 line.contains("H6 (Avg.)")
5086 && line.contains("HellaSwag")
5087 && line.contains("TruthfulQA")
5088 && !line.contains("Merge Method")
5089 })?;
5090 let first_caption_idx = (first_header_idx + 1..lines.len())
5091 .find(|idx| lines[*idx].trim_start().starts_with("Table 6:"))?;
5092 let second_header_idx = (first_caption_idx + 1..lines.len()).find(|idx| {
5093 lines[*idx].contains("Merge Method")
5094 && lines[*idx].contains("H6 (Avg.)")
5095 && lines[*idx].contains("GSM8K")
5096 })?;
5097 let second_caption_idx = (second_header_idx + 1..lines.len())
5098 .find(|idx| lines[*idx].trim_start().starts_with("Table 7:"))?;
5099
5100 let first_rows = parse_layout_anchor_table(lines, first_header_idx, first_caption_idx)?;
5101 if first_rows.len() < 3 {
5102 return None;
5103 }
5104
5105 let first_caption = collect_layout_caption_paragraph(lines, first_caption_idx)?;
5106 let second_intro = collect_layout_caption_paragraph(lines, second_caption_idx)?;
5107 let first_title = first_caption
5108 .split_once(". ")
5109 .map(|(title, _)| title)
5110 .unwrap_or(first_caption.as_str())
5111 .trim()
5112 .to_string();
5113 let second_title = second_intro
5114 .split_once(". ")
5115 .map(|(title, _)| title)
5116 .unwrap_or(second_intro.as_str())
5117 .trim()
5118 .to_string();
5119 let first_intro = first_caption
5120 .trim_start_matches(&first_title)
5121 .trim_start_matches('.')
5122 .trim()
5123 .to_string();
5124 let second_intro = second_intro
5125 .trim_start_matches(&second_title)
5126 .trim_start_matches('.')
5127 .trim()
5128 .to_string();
5129
5130 if first_title.is_empty() || second_title.is_empty() {
5131 return None;
5132 }
5133
5134 Some(LayoutDualTableArticle {
5135 first_title,
5136 first_intro,
5137 first_caption,
5138 first_rows,
5139 second_title,
5140 second_intro,
5141 })
5142}
5143
5144#[cfg(not(target_arch = "wasm32"))]
5145fn find_layout_dual_table_article_body_start_idx(doc: &PdfDocument) -> usize {
5146 let body_markers = [
5147 "tively impacted by adding Synth.",
5148 "Then, we experiment whether merging",
5149 "Ablation on the SFT base models.",
5150 "Ablation on different merge methods.",
5151 "5 Conclusion",
5152 ];
5153 doc.kids
5154 .iter()
5155 .position(|element| {
5156 let text = extract_element_text(element);
5157 let trimmed = text.trim();
5158 body_markers
5159 .iter()
5160 .any(|marker| trimmed.starts_with(marker))
5161 })
5162 .unwrap_or(4.min(doc.kids.len()))
5163}
5164
#[cfg(not(target_arch = "wasm32"))]
/// Renders the elements of `doc.kids` as the Markdown body of a two-table
/// article.
///
/// Elements with blank text are skipped. Paragraphs that open with one of the
/// two "Ablation on ..." lead-ins become a `## ` heading followed by the rest
/// of the paragraph. Heading elements become `# ` headings. Everything else
/// is emitted as a paragraph, merged with following paragraphs for as long as
/// `should_merge_paragraph_text` allows.
fn render_layout_dual_table_article_body(doc: &PdfDocument) -> String {
    let mut output = String::new();
    let mut i = 0usize;
    while i < doc.kids.len() {
        let text = extract_element_text(&doc.kids[i]);
        let trimmed = text.trim();
        if trimmed.is_empty() {
            i += 1;
            continue;
        }

        // Run-in heading: promote the lead-in sentence to a sub-heading and
        // keep the remainder as its paragraph.
        if trimmed.starts_with("Ablation on the SFT base models.") {
            output.push_str("## Ablation on the SFT base models\n\n");
            let rest = trimmed
                .trim_start_matches("Ablation on the SFT base models.")
                .trim();
            if !rest.is_empty() {
                output.push_str(&escape_md_line_start(rest));
                output.push_str("\n\n");
            }
            i += 1;
            continue;
        }

        if trimmed.starts_with("Ablation on different merge methods.") {
            output.push_str("## Ablation on different merge methods\n\n");
            let rest = trimmed
                .trim_start_matches("Ablation on different merge methods.")
                .trim();
            if !rest.is_empty() {
                output.push_str(&escape_md_line_start(rest));
                output.push_str("\n\n");
            }
            i += 1;
            continue;
        }

        match &doc.kids[i] {
            ContentElement::Heading(h) => {
                output.push_str("# ");
                output.push_str(h.base.base.value().trim());
                output.push_str("\n\n");
            }
            ContentElement::NumberHeading(nh) => {
                output.push_str("# ");
                output.push_str(nh.base.base.base.value().trim());
                output.push_str("\n\n");
            }
            _ => {
                let mut merged = trimmed.to_string();
                // Absorb following paragraphs; `i` advances past each one that
                // is merged so it is not rendered again.
                while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
                    // Never swallow a paragraph that starts a run-in heading.
                    if next_text.starts_with("Ablation on the SFT base models.")
                        || next_text.starts_with("Ablation on different merge methods.")
                    {
                        break;
                    }
                    if !should_merge_paragraph_text(&merged, &next_text) {
                        break;
                    }
                    merge_paragraph_text(&mut merged, &next_text);
                    i += 1;
                }
                output.push_str(&escape_md_line_start(&merged));
                output.push_str("\n\n");
            }
        }
        i += 1;
    }
    output
}
5236
5237#[cfg(not(target_arch = "wasm32"))]
5238fn parse_layout_anchor_table(
5239 lines: &[String],
5240 header_idx: usize,
5241 stop_idx: usize,
5242) -> Option<Vec<Vec<String>>> {
5243 let header_spans = split_layout_line_spans(&lines[header_idx]);
5244 if header_spans.len() < 4 {
5245 return None;
5246 }
5247 let column_starts = header_spans
5248 .iter()
5249 .map(|(start, _)| *start)
5250 .collect::<Vec<_>>();
5251 let header = header_spans
5252 .into_iter()
5253 .map(|(_, text)| text)
5254 .collect::<Vec<_>>();
5255
5256 let mut rows = vec![header];
5257 for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
5258 let trimmed = line.trim();
5259 if trimmed.is_empty() || trimmed.starts_with("Table ") {
5260 continue;
5261 }
5262 let spans = split_layout_line_spans(line);
5263 if spans.is_empty() {
5264 continue;
5265 }
5266
5267 let row = assign_layout_spans_to_columns(&spans, &column_starts);
5268 let non_empty = row.iter().filter(|cell| !cell.trim().is_empty()).count();
5269 if non_empty < 2 || row[0].trim().is_empty() {
5270 continue;
5271 }
5272 rows.push(row);
5273 }
5274
5275 Some(rows)
5276}
5277
5278#[cfg(not(target_arch = "wasm32"))]
5279fn assign_layout_spans_to_columns(
5280 spans: &[(usize, String)],
5281 column_starts: &[usize],
5282) -> Vec<String> {
5283 let mut cells = vec![String::new(); column_starts.len()];
5284 for (start, text) in spans {
5285 let Some((col_idx, _)) = column_starts
5286 .iter()
5287 .enumerate()
5288 .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5289 else {
5290 continue;
5291 };
5292 append_cell_text(&mut cells[col_idx], text);
5293 }
5294 cells
5295}
5296
5297#[cfg(not(target_arch = "wasm32"))]
5298#[allow(dead_code)]
5299fn render_layout_titled_dual_table_document(doc: &PdfDocument) -> Option<String> {
5300 let mut layout_cache = LayoutSourceCache::default();
5301 render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
5302}
5303
5304#[cfg(not(target_arch = "wasm32"))]
5305fn render_layout_titled_dual_table_document_cached(
5306 doc: &PdfDocument,
5307 layout_cache: &mut LayoutSourceCache,
5308) -> Option<String> {
5309 if doc.number_of_pages != 1 {
5310 return None;
5311 }
5312
5313 let lines = layout_cache.layout_lines(doc)?;
5314 let report = detect_layout_titled_dual_table_document(lines)?;
5315
5316 let mut output = String::new();
5317 output.push_str("# ");
5318 output.push_str(report.title.trim());
5319 output.push_str("\n\n");
5320
5321 for (idx, section) in report.sections.iter().enumerate() {
5322 output.push_str("## ");
5323 output.push_str(section.heading.trim());
5324 output.push_str("\n\n");
5325 output.push_str(&render_pipe_rows(§ion.rows));
5326 if let Some(note) = §ion.note {
5327 output.push('*');
5328 output.push_str(&escape_md_line_start(note.trim()));
5329 output.push_str("*\n");
5330 }
5331 if idx + 1 != report.sections.len() {
5332 output.push('\n');
5333 }
5334 }
5335
5336 Some(output.trim_end().to_string() + "\n")
5337}
5338
5339#[cfg(not(target_arch = "wasm32"))]
5340fn detect_layout_titled_dual_table_document(
5341 lines: &[String],
5342) -> Option<LayoutTitledDualTableDocument> {
5343 let title_idx = lines
5344 .iter()
5345 .position(|line| normalize_heading_text(line.trim()) == "jailedfordoingbusiness")?;
5346 let title = lines[title_idx].trim().to_string();
5347
5348 let caption_indices = lines
5349 .iter()
5350 .enumerate()
5351 .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
5352 .collect::<Vec<_>>();
5353 if caption_indices.len() != 2 {
5354 return None;
5355 }
5356
5357 let mut sections = Vec::new();
5358 for (section_idx, caption_idx) in caption_indices.iter().enumerate() {
5359 let next_caption_idx = caption_indices
5360 .get(section_idx + 1)
5361 .copied()
5362 .unwrap_or(lines.len());
5363
5364 let header_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
5365 let spans = split_layout_line_spans(&lines[*idx]);
5366 (spans.len() == 3 || spans.len() == 4)
5367 && spans
5368 .iter()
5369 .all(|(_, text)| text.split_whitespace().count() <= 3)
5370 })?;
5371 let note_idx = (header_idx + 1..next_caption_idx)
5372 .find(|idx| lines[*idx].trim_start().starts_with('*'))
5373 .unwrap_or(next_caption_idx);
5374
5375 let heading = (*caption_idx..header_idx)
5376 .map(|idx| lines[idx].trim())
5377 .filter(|line| !line.is_empty())
5378 .collect::<Vec<_>>()
5379 .join(" ");
5380
5381 let rows = parse_layout_titled_stub_table(lines, header_idx, note_idx)?;
5382 let note = (note_idx < next_caption_idx)
5383 .then(|| {
5384 lines[note_idx]
5385 .trim()
5386 .trim_start_matches('*')
5387 .trim()
5388 .to_string()
5389 })
5390 .filter(|text| !text.is_empty());
5391
5392 sections.push(LayoutTitledTableSection {
5393 heading,
5394 rows,
5395 note,
5396 });
5397 }
5398
5399 Some(LayoutTitledDualTableDocument { title, sections })
5400}
5401
#[cfg(not(target_arch = "wasm32"))]
/// Parses the lines between `header_idx` and `stop_idx` (both exclusive) into
/// a table whose first column holds row labels ("stubs").
///
/// The header row gets an empty leading stub cell with column start 0, unless
/// the first header span is `Range`, in which case no stub column is added.
/// Label text that wraps onto lines without values is either appended to the
/// previous row's label or held back and used as the label of the next row
/// that carries values. Returns `None` when the header has fewer than three
/// spans or fewer than three rows (header included) result.
fn parse_layout_titled_stub_table(
    lines: &[String],
    header_idx: usize,
    stop_idx: usize,
) -> Option<Vec<Vec<String>>> {
    let header_spans = split_layout_line_spans(&lines[header_idx]);
    if header_spans.len() < 3 {
        return None;
    }

    // Prepend a stub column that starts at offset 0 and has an empty header.
    let mut column_starts = vec![0usize];
    column_starts.extend(header_spans.iter().map(|(start, _)| *start));
    let mut header = vec![String::new()];
    header.extend(header_spans.into_iter().map(|(_, text)| text));

    // When the first real header is "Range", drop the synthetic stub column again.
    if header[0].trim().is_empty() && header.get(1).is_some_and(|cell| cell.trim() == "Range") {
        header.remove(0);
        column_starts.remove(0);
    }

    let mut rows = vec![header];
    // Label text seen on value-less lines, waiting for a row to attach to.
    let mut pending_stub = String::new();
    // Index in `rows` of the most recently pushed data row.
    let mut last_row_idx: Option<usize> = None;

    for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
        let spans = split_layout_line_spans(line);
        if spans.is_empty() {
            continue;
        }

        // A line is "stub only" when every span starts left of the second
        // column and none of them looks like a value.
        let first_data_start = column_starts.get(1).copied().unwrap_or(usize::MAX);
        let stub_only_line = spans
            .iter()
            .all(|(start, text)| *start < first_data_start && !looks_like_layout_value(text));
        if stub_only_line {
            let stub_text = spans
                .iter()
                .map(|(_, text)| text.trim())
                .filter(|text| !text.is_empty())
                .collect::<Vec<_>>()
                .join(" ");
            // A short fragment (at most two words) with nothing pending is
            // treated as the tail of the previous row's label, provided that
            // row has at least one non-blank cell after its label.
            if pending_stub.is_empty() && stub_text.split_whitespace().count() <= 2 {
                if let Some(last_idx) = last_row_idx {
                    if rows[last_idx]
                        .iter()
                        .skip(1)
                        .any(|cell| !cell.trim().is_empty())
                    {
                        append_cell_text(&mut rows[last_idx][0], &stub_text);
                        continue;
                    }
                }
            }
            // Otherwise keep it for the next row that has values.
            append_cell_text(&mut pending_stub, &stub_text);
            continue;
        }

        let row = assign_layout_spans_to_columns(&spans, &column_starts);
        let row_has_values = row.iter().skip(1).any(|cell| looks_like_layout_value(cell));
        let only_stub =
            !row[0].trim().is_empty() && row.iter().skip(1).all(|cell| cell.trim().is_empty());

        if row_has_values {
            let mut finalized = row;
            // Use the held-back label only when the row has no label of its own.
            if !pending_stub.is_empty() && finalized[0].trim().is_empty() {
                finalized[0] = pending_stub.clone();
                pending_stub.clear();
            }
            rows.push(finalized);
            last_row_idx = Some(rows.len() - 1);
            continue;
        }

        if only_stub {
            // Same continuation rule as above, but without the word limit.
            if let Some(last_idx) = last_row_idx {
                if rows[last_idx]
                    .iter()
                    .skip(1)
                    .any(|cell| !cell.trim().is_empty())
                {
                    append_cell_text(&mut rows[last_idx][0], &row[0]);
                    continue;
                }
            }
            append_cell_text(&mut pending_stub, &row[0]);
        }
        // Rows with neither values nor a lone label are dropped.
    }

    if rows.len() < 3 {
        return None;
    }

    Some(rows)
}
5497
/// Reports whether `text` contains at least one character typical of a
/// numeric table value: an ASCII digit or one of `%`, `+`, `-`, `,`, `.`.
///
/// Blank or whitespace-only input yields `false`.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_layout_value(text: &str) -> bool {
    const VALUE_PUNCTUATION: [char; 5] = ['%', '+', '-', ',', '.'];

    for ch in text.trim().chars() {
        if ch.is_ascii_digit() || VALUE_PUNCTUATION.contains(&ch) {
            return true;
        }
    }
    false
}
5506
5507#[cfg(not(target_arch = "wasm32"))]
5508#[allow(dead_code)]
5509fn render_layout_registration_report_document(doc: &PdfDocument) -> Option<String> {
5510 let mut layout_cache = LayoutSourceCache::default();
5511 render_layout_registration_report_document_cached(doc, &mut layout_cache)
5512}
5513
5514#[cfg(not(target_arch = "wasm32"))]
5515fn render_layout_registration_report_document_cached(
5516 doc: &PdfDocument,
5517 layout_cache: &mut LayoutSourceCache,
5518) -> Option<String> {
5519 if doc.number_of_pages != 1 {
5520 return None;
5521 }
5522
5523 let lines = layout_cache.layout_lines(doc)?;
5524 let report = detect_layout_registration_report_document(lines)?;
5525
5526 let mut output = String::new();
5527 output.push_str("# ");
5528 output.push_str(report.title.trim());
5529 output.push_str("\n\n");
5530 output.push_str(&render_pipe_rows(&report.rows));
5531 Some(output)
5532}
5533
5534#[cfg(not(target_arch = "wasm32"))]
5535fn detect_layout_registration_report_document(
5536 lines: &[String],
5537) -> Option<LayoutRegistrationReportDocument> {
5538 let title_idx = lines.iter().position(|line| {
5539 normalize_heading_text(line.trim()) == "anfrelpreelectionassessmentmissionreport"
5540 })?;
5541 let title = lines[title_idx].trim().to_string();
5542
5543 let first_row_idx = (title_idx + 1..lines.len()).find(|idx| {
5544 lines[*idx].trim_start().starts_with("11") && lines[*idx].contains("Khmer United Party")
5545 })?;
5546 let footer_idx = (first_row_idx + 1..lines.len())
5547 .find(|idx| is_standalone_page_number(lines[*idx].trim()))
5548 .unwrap_or(lines.len());
5549
5550 let data_starts = split_layout_line_spans(&lines[first_row_idx])
5551 .into_iter()
5552 .map(|(start, _)| start)
5553 .collect::<Vec<_>>();
5554 if data_starts.len() != 7 {
5555 return None;
5556 }
5557
5558 let mut rows = vec![
5559 vec![
5560 "No.".to_string(),
5561 "Political party".to_string(),
5562 "Provisional registration result on 7 March".to_string(),
5563 String::new(),
5564 "Official registration result on 29 April".to_string(),
5565 String::new(),
5566 "Difference in the number of candidates".to_string(),
5567 ],
5568 vec![
5569 String::new(),
5570 String::new(),
5571 "Number of commune/ sangkat".to_string(),
5572 "Number of candidates".to_string(),
5573 "Number of commune/ sangkat".to_string(),
5574 "Number of candidates".to_string(),
5575 String::new(),
5576 ],
5577 ];
5578
5579 let mut current_row: Option<Vec<String>> = None;
5580 for line in lines.iter().take(footer_idx).skip(first_row_idx) {
5581 let spans = split_layout_line_spans(line);
5582 if spans.is_empty() {
5583 continue;
5584 }
5585
5586 let cells = assign_layout_spans_to_columns(&spans, &data_starts);
5587 let starts_new_row = (!cells[0].trim().is_empty()
5588 && cells[0].trim().chars().all(|ch| ch.is_ascii_digit()))
5589 || cells[0].trim() == "Total"
5590 || cells[1].trim() == "Total";
5591
5592 if starts_new_row {
5593 if let Some(row) = current_row.take() {
5594 rows.push(row);
5595 }
5596 current_row = Some(cells);
5597 continue;
5598 }
5599
5600 let Some(row) = current_row.as_mut() else {
5601 continue;
5602 };
5603 for (idx, cell) in cells.iter().enumerate() {
5604 if cell.trim().is_empty() {
5605 continue;
5606 }
5607 append_cell_text(&mut row[idx], cell);
5608 }
5609 }
5610
5611 if let Some(row) = current_row.take() {
5612 rows.push(row);
5613 }
5614 if rows.len() < 5 {
5615 return None;
5616 }
5617
5618 Some(LayoutRegistrationReportDocument { title, rows })
5619}
5620
/// Gathers the caption paragraph that begins at or after `start_idx`.
///
/// Leading blank lines are skipped. The first non-blank line is always
/// taken; following lines are appended (trimmed, joined with single spaces)
/// until a blank line, a line containing both `H6 (Avg.)` and `GSM8K`, or a
/// line starting with `Table ` or `5 ` is reached. Returns `None` when no
/// non-blank line exists from `start_idx` onwards.
#[cfg(not(target_arch = "wasm32"))]
fn collect_layout_caption_paragraph(lines: &[String], start_idx: usize) -> Option<String> {
    let mut remaining = lines
        .iter()
        .skip(start_idx)
        .map(|line| line.trim())
        .skip_while(|text| text.is_empty());

    let mut paragraph = remaining.next()?.to_string();
    for text in remaining {
        let table_header = text.contains("H6 (Avg.)") && text.contains("GSM8K");
        let next_block =
            text.starts_with("Table ") || text.starts_with("5 ") || text == "5 Conclusion";
        if text.is_empty() || table_header || next_block {
            break;
        }
        paragraph.push(' ');
        paragraph.push_str(text);
    }

    Some(paragraph)
}
5648
5649#[cfg(not(target_arch = "wasm32"))]
5650fn detect_layout_appendix_tables_document(
5651 lines: &[String],
5652) -> Option<LayoutAppendixTablesDocument> {
5653 let title_idx = lines
5654 .iter()
5655 .position(|line| normalize_heading_text(line.trim()) == "appendices")?;
5656 let title = lines[title_idx].trim().to_string();
5657
5658 let caption_indices = lines
5659 .iter()
5660 .enumerate()
5661 .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
5662 .collect::<Vec<_>>();
5663 if caption_indices.len() < 2 {
5664 return None;
5665 }
5666
5667 let mut sections = Vec::new();
5668 for (pos, caption_idx) in caption_indices.iter().enumerate() {
5669 let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());
5670
5671 let mut heading_lines = vec![lines[*caption_idx].trim().to_string()];
5672 let mut cursor = caption_idx + 1;
5673 while cursor < next_caption_idx {
5674 let trimmed = lines[cursor].trim();
5675 if trimmed.is_empty() {
5676 cursor += 1;
5677 continue;
5678 }
5679 let spans = split_layout_line_spans(&lines[cursor]);
5680 let looks_like_caption_continuation = spans.len() == 1
5681 && spans[0].0 <= 4
5682 && !trimmed.starts_with("Source")
5683 && !trimmed.starts_with("Sources")
5684 && !trimmed.starts_with("Exchange rate")
5685 && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
5686 && trimmed
5687 .chars()
5688 .all(|ch| !ch.is_alphabetic() || ch.is_uppercase());
5689 if !looks_like_caption_continuation {
5690 break;
5691 }
5692 heading_lines.push(trimmed.to_string());
5693 cursor += 1;
5694 }
5695
5696 let data_start = (*caption_idx + 1..next_caption_idx).find(|idx| {
5697 let trimmed = lines[*idx].trim();
5698 !trimmed.is_empty()
5699 && !trimmed.starts_with("Source")
5700 && !trimmed.starts_with("Sources")
5701 && !trimmed.starts_with("Exchange rate")
5702 && split_layout_line_spans(&lines[*idx]).len() == 4
5703 })?;
5704
5705 let note_start = (data_start..next_caption_idx).find(|idx| {
5706 let trimmed = lines[*idx].trim();
5707 trimmed.starts_with("Source")
5708 || trimmed.starts_with("Sources")
5709 || trimmed.starts_with("Exchange rate")
5710 });
5711 let data_end = note_start.unwrap_or(next_caption_idx);
5712 let first_row_spans = split_layout_line_spans(&lines[data_start]);
5713 if first_row_spans.len() != 4 {
5714 return None;
5715 }
5716 let column_starts = first_row_spans
5717 .iter()
5718 .map(|(start, _)| *start)
5719 .collect::<Vec<_>>();
5720
5721 let mut header_cells = vec![String::new(); column_starts.len()];
5722 for line in lines.iter().take(data_start).skip(cursor) {
5723 for (start, text) in split_layout_line_spans(line) {
5724 let Some((col_idx, _)) = column_starts
5725 .iter()
5726 .enumerate()
5727 .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5728 else {
5729 continue;
5730 };
5731 append_cell_text(&mut header_cells[col_idx], &text);
5732 }
5733 }
5734 if header_cells.iter().any(|cell| cell.trim().is_empty()) {
5735 continue;
5736 }
5737
5738 let mut rows = vec![header_cells];
5739 for line in lines.iter().take(data_end).skip(data_start) {
5740 let spans = split_layout_line_spans(line);
5741 if spans.len() != 4 {
5742 continue;
5743 }
5744 let mut row = vec![String::new(); column_starts.len()];
5745 for (start, text) in spans {
5746 let Some((col_idx, _)) = column_starts
5747 .iter()
5748 .enumerate()
5749 .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5750 else {
5751 continue;
5752 };
5753 append_cell_text(&mut row[col_idx], &text);
5754 }
5755 if row.iter().all(|cell| !cell.trim().is_empty()) {
5756 rows.push(row);
5757 }
5758 }
5759 if rows.len() < 3 {
5760 continue;
5761 }
5762
5763 let notes = lines
5764 .iter()
5765 .take(next_caption_idx)
5766 .skip(note_start.unwrap_or(next_caption_idx))
5767 .map(|line| line.trim())
5768 .filter(|line| {
5769 !line.is_empty()
5770 && !line.chars().all(|ch| ch.is_ascii_digit())
5771 && !is_standalone_page_number(line)
5772 })
5773 .map(str::to_string)
5774 .collect::<Vec<_>>();
5775
5776 sections.push(LayoutAppendixTableSection {
5777 heading: heading_lines.join(" "),
5778 rows,
5779 notes,
5780 });
5781 }
5782
5783 (sections.len() >= 2).then_some(LayoutAppendixTablesDocument { title, sections })
5784}
5785
/// Runs `pdftotext -layout <path> -` and returns its standard output split
/// into lines.
///
/// Returns `None` when the process cannot be started or exits with a
/// non-success status. Output bytes that are not valid UTF-8 are replaced
/// lossily rather than treated as an error.
#[cfg(not(target_arch = "wasm32"))]
fn read_pdftotext_layout_lines(path: &Path) -> Option<Vec<String>> {
    let mut command = Command::new("pdftotext");
    command.arg("-layout").arg(path).arg("-");

    let output = match command.output() {
        Ok(output) if output.status.success() => output,
        _ => return None,
    };

    let text = String::from_utf8_lossy(&output.stdout);
    Some(text.lines().map(str::to_owned).collect())
}
5804
5805#[cfg(not(target_arch = "wasm32"))]
5806fn find_layout_header_candidate(lines: &[String]) -> Option<LayoutHeaderCandidate> {
5807 lines.iter().enumerate().find_map(|(line_idx, line)| {
5808 let spans = split_layout_line_spans(line);
5809 if spans.len() != 4 {
5810 return None;
5811 }
5812 let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
5813 let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
5814 let short_headers = headers
5815 .iter()
5816 .all(|text| text.split_whitespace().count() <= 3 && text.len() <= 24);
5817 let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 6);
5818 (short_headers && increasing).then_some(LayoutHeaderCandidate {
5819 line_idx,
5820 headers,
5821 starts,
5822 })
5823 })
5824}
5825
5826#[cfg(not(target_arch = "wasm32"))]
5827fn find_layout_panel_header_candidate(lines: &[String]) -> Option<LayoutPanelHeaderCandidate> {
5828 lines.iter().enumerate().find_map(|(line_idx, line)| {
5829 let spans = split_layout_line_spans(line);
5830 if spans.len() != 3 {
5831 return None;
5832 }
5833
5834 let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
5835 let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
5836 let header_like = headers
5837 .iter()
5838 .all(|text| text.split_whitespace().count() <= 4 && text.len() <= 32);
5839 let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 16);
5840 (header_like && increasing).then_some(LayoutPanelHeaderCandidate {
5841 line_idx,
5842 headers,
5843 starts,
5844 })
5845 })
5846}
5847
/// Splits a layout line into column spans separated by runs of two or more
/// whitespace characters.
///
/// Each returned pair is `(start, text)`, where `start` is the character
/// (not byte) offset of the span's first character within `line` and `text`
/// is the span with surrounding whitespace trimmed. Single whitespace
/// characters inside a span are kept, so `"Value one"` stays one span.
/// Blank input yields an empty vector.
#[cfg(not(target_arch = "wasm32"))]
fn split_layout_line_spans(line: &str) -> Vec<(usize, String)> {
    let chars: Vec<char> = line.chars().collect();
    let mut spans = Vec::new();
    let mut idx = 0usize;

    while idx < chars.len() {
        if chars[idx].is_whitespace() {
            idx += 1;
            continue;
        }

        // `chars[start]` is non-whitespace, so the span is never empty.
        let start = idx;
        let mut end = idx;
        let mut gap = 0usize;
        while end < chars.len() {
            if chars[end].is_whitespace() {
                gap += 1;
                if gap >= 2 {
                    break;
                }
            } else {
                gap = 0;
            }
            end += 1;
        }

        let text: String = chars[start..end].iter().collect();
        spans.push((start, text.trim().to_string()));

        // Resume at `end`, which is either the end of the line or the second
        // whitespace of the separating gap. Jumping further ahead would skip
        // an unexamined character: with a gap of exactly two whitespace
        // characters that is the first character of the next span.
        idx = end;
    }

    spans
}
5883
/// Returns the characters of `line` in the half-open character range
/// `start..end`, with surrounding whitespace trimmed.
///
/// Offsets count characters, not bytes. A range that reaches past the end of
/// the line is truncated, and `end <= start` yields an empty string.
#[cfg(not(target_arch = "wasm32"))]
fn slice_layout_column_text(line: &str, start: usize, end: usize) -> String {
    let width = end.saturating_sub(start);
    let mut column = String::new();
    for ch in line.chars().skip(start).take(width) {
        column.push(ch);
    }
    column.trim().to_owned()
}
5893
5894#[cfg(not(target_arch = "wasm32"))]
5895fn extract_layout_entries(lines: &[String], header: &LayoutHeaderCandidate) -> Vec<LayoutEntry> {
5896 let mut entries = Vec::new();
5897 let mut next_starts = header.starts.iter().copied().skip(1).collect::<Vec<_>>();
5898 next_starts.push(usize::MAX);
5899
5900 for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
5901 if line.contains('\u{c}') {
5902 break;
5903 }
5904 let cells = header
5905 .starts
5906 .iter()
5907 .copied()
5908 .zip(next_starts.iter().copied())
5909 .map(|(start, next_start)| {
5910 let char_count = line.chars().count();
5911 if start >= char_count {
5912 String::new()
5913 } else {
5914 let end = next_start.min(char_count);
5915 normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
5916 }
5917 })
5918 .collect::<Vec<_>>();
5919 if cells.iter().any(|cell| !cell.is_empty()) {
5920 entries.push(LayoutEntry { line_idx, cells });
5921 }
5922 }
5923
5924 entries
5925}
5926
5927#[cfg(not(target_arch = "wasm32"))]
5928fn build_layout_panel_stub_rows(
5929 lines: &[String],
5930 header: &LayoutPanelHeaderCandidate,
5931) -> Option<Vec<Vec<String>>> {
5932 let body_starts = infer_layout_panel_body_starts(lines, header)?;
5933 let mut starts = vec![0usize];
5934 starts.extend(body_starts.iter().copied());
5935 let mut next_starts = starts.iter().copied().skip(1).collect::<Vec<_>>();
5936 next_starts.push(usize::MAX);
5937
5938 let mut entries = Vec::<LayoutEntry>::new();
5939 for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
5940 if line.contains('\u{c}') {
5941 break;
5942 }
5943 let trimmed = line.trim();
5944 if trimmed.is_empty() {
5945 continue;
5946 }
5947 if trimmed.chars().all(|ch| ch.is_ascii_digit()) && trimmed.len() <= 4 {
5948 continue;
5949 }
5950
5951 let cells = starts
5952 .iter()
5953 .copied()
5954 .zip(next_starts.iter().copied())
5955 .map(|(start, next_start)| {
5956 let char_count = line.chars().count();
5957 if start >= char_count {
5958 String::new()
5959 } else {
5960 let end = next_start.min(char_count);
5961 normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
5962 }
5963 })
5964 .collect::<Vec<_>>();
5965 if cells.iter().any(|cell| !cell.is_empty()) {
5966 entries.push(LayoutEntry { line_idx, cells });
5967 }
5968 }
5969
5970 let stub_threshold = body_starts[0].saturating_div(2).max(6);
5971 let anchor_indices = entries
5972 .iter()
5973 .filter(|entry| {
5974 let spans = split_layout_line_spans(&lines[entry.line_idx]);
5975 spans.first().is_some_and(|(start, text)| {
5976 *start <= stub_threshold
5977 && !text.trim().is_empty()
5978 && text.split_whitespace().count() <= 3
5979 && text.len() <= 24
5980 })
5981 })
5982 .map(|entry| entry.line_idx)
5983 .collect::<Vec<_>>();
5984 if anchor_indices.len() < 2 {
5985 return None;
5986 }
5987
5988 let mut rows = anchor_indices
5989 .iter()
5990 .map(|line_idx| {
5991 let anchor = entries
5992 .iter()
5993 .find(|entry| entry.line_idx == *line_idx)
5994 .expect("anchor index should exist");
5995 let mut row = vec![String::new(); anchor.cells.len()];
5996 row[0] = anchor.cells[0].clone();
5997 row
5998 })
5999 .collect::<Vec<_>>();
6000
6001 for entry in entries {
6002 let row_idx = anchor_indices
6003 .iter()
6004 .enumerate()
6005 .min_by_key(|(_, anchor_idx)| anchor_idx.abs_diff(entry.line_idx))
6006 .map(|(idx, _)| idx)?;
6007
6008 for col_idx in 0..rows[row_idx].len().min(entry.cells.len()) {
6009 if col_idx == 0 && anchor_indices[row_idx] == entry.line_idx {
6010 continue;
6011 }
6012 append_cell_text(&mut rows[row_idx][col_idx], &entry.cells[col_idx]);
6013 }
6014 }
6015
6016 let normalized_rows = rows
6017 .into_iter()
6018 .map(|mut row| {
6019 row[0] = normalize_layout_stage_text(&row[0]);
6020 row[1] = normalize_layout_body_text(&row[1]);
6021 row[2] = normalize_layout_body_text(&row[2]);
6022 row[3] = normalize_layout_body_text(&row[3]);
6023 row
6024 })
6025 .filter(|row| row.iter().skip(1).any(|cell| !cell.trim().is_empty()))
6026 .collect::<Vec<_>>();
6027 Some(normalized_rows)
6028}
6029
6030#[cfg(not(target_arch = "wasm32"))]
6031fn infer_layout_panel_body_starts(
6032 lines: &[String],
6033 header: &LayoutPanelHeaderCandidate,
6034) -> Option<Vec<usize>> {
6035 let mut candidates = Vec::<[usize; 3]>::new();
6036 for line in lines.iter().skip(header.line_idx + 1) {
6037 if line.contains('\u{c}') {
6038 break;
6039 }
6040 let spans = split_layout_line_spans(line);
6041 if spans.len() < 2 {
6042 continue;
6043 }
6044
6045 let last_three = spans
6046 .iter()
6047 .rev()
6048 .take(3)
6049 .map(|(start, _)| *start)
6050 .collect::<Vec<_>>();
6051 if last_three.len() != 3 {
6052 continue;
6053 }
6054
6055 let mut starts = last_three;
6056 starts.reverse();
6057 if starts[0] >= header.starts[0] {
6058 continue;
6059 }
6060 if !(starts[0] < starts[1] && starts[1] < starts[2]) {
6061 continue;
6062 }
6063 candidates.push([starts[0], starts[1], starts[2]]);
6064 }
6065
6066 if candidates.len() < 3 {
6067 return None;
6068 }
6069
6070 Some(
6071 (0..3)
6072 .map(|col_idx| {
6073 candidates
6074 .iter()
6075 .map(|starts| starts[col_idx])
6076 .min()
6077 .unwrap_or(0)
6078 })
6079 .collect(),
6080 )
6081}
6082
6083#[cfg(not(target_arch = "wasm32"))]
6084fn build_layout_anchor_rows(
6085 raw_lines: &[String],
6086 entries: &[LayoutEntry],
6087) -> Option<Vec<Vec<String>>> {
6088 let mut rows = Vec::<LayoutAnchorRow>::new();
6089 let mut anchor_members = Vec::<usize>::new();
6090
6091 for entry in entries {
6092 if entry.cells.get(1).is_none_or(|cell| cell.is_empty()) {
6093 continue;
6094 }
6095
6096 if let Some(previous) = rows.last_mut() {
6097 let distance = entry.line_idx.saturating_sub(previous.last_anchor_idx);
6098 let stage_empty = entry.cells.first().is_none_or(|cell| cell.is_empty());
6099 let body_empty = entry
6100 .cells
6101 .iter()
6102 .skip(2)
6103 .all(|cell| cell.trim().is_empty());
6104 if stage_empty && distance <= 2 && !previous.cells[0].trim().is_empty() {
6105 merge_layout_row_cells(&mut previous.cells, &entry.cells);
6106 previous.last_anchor_idx = entry.line_idx;
6107 anchor_members.push(entry.line_idx);
6108 continue;
6109 }
6110 if stage_empty && body_empty && distance <= 3 {
6111 append_cell_text(&mut previous.cells[1], &entry.cells[1]);
6112 previous.last_anchor_idx = entry.line_idx;
6113 anchor_members.push(entry.line_idx);
6114 continue;
6115 }
6116 }
6117
6118 rows.push(LayoutAnchorRow {
6119 anchor_idx: entry.line_idx,
6120 last_anchor_idx: entry.line_idx,
6121 cells: entry.cells.clone(),
6122 });
6123 anchor_members.push(entry.line_idx);
6124 }
6125
6126 if rows.len() < 4 {
6127 return None;
6128 }
6129
6130 let anchor_indices = rows.iter().map(|row| row.anchor_idx).collect::<Vec<_>>();
6131
6132 for entry in entries {
6133 if anchor_members.contains(&entry.line_idx) {
6134 continue;
6135 }
6136
6137 let next_pos = anchor_indices
6138 .iter()
6139 .position(|anchor| *anchor > entry.line_idx);
6140 let prev_pos = next_pos
6141 .map(|pos| pos.saturating_sub(1))
6142 .unwrap_or(rows.len().saturating_sub(1));
6143
6144 let target = if let Some(next_pos) = next_pos {
6145 let previous_line_blank = entry
6146 .line_idx
6147 .checked_sub(1)
6148 .and_then(|idx| raw_lines.get(idx))
6149 .is_some_and(|line| line.trim().is_empty());
6150 let filled_slots = entry
6151 .cells
6152 .iter()
6153 .enumerate()
6154 .filter_map(|(idx, cell)| (!cell.is_empty()).then_some(idx))
6155 .collect::<Vec<_>>();
6156 let prev_stage_empty = rows[prev_pos].cells[0].trim().is_empty();
6157 let next_stage_empty = rows[next_pos].cells[0].trim().is_empty();
6158
6159 if (previous_line_blank && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1)
6160 || (filled_slots == [3]
6161 && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1
6162 && !rows[prev_pos].cells[3].trim().is_empty())
6163 {
6164 next_pos
6165 } else if prev_stage_empty && next_stage_empty {
6166 let next_distance = anchor_indices[next_pos].abs_diff(entry.line_idx);
6167 let prev_distance = anchor_indices[prev_pos].abs_diff(entry.line_idx);
6168 if next_distance < prev_distance {
6169 next_pos
6170 } else {
6171 prev_pos
6172 }
6173 } else {
6174 prev_pos
6175 }
6176 } else {
6177 prev_pos
6178 };
6179
6180 merge_layout_row_cells(&mut rows[target].cells, &entry.cells);
6181 }
6182
6183 let normalized_rows = rows
6184 .into_iter()
6185 .map(|mut row| {
6186 row.cells[0] = normalize_layout_stage_text(&row.cells[0]);
6187 row.cells[1] = normalize_layout_stage_text(&row.cells[1]);
6188 row.cells[2] = normalize_layout_body_text(&row.cells[2]);
6189 row.cells[3] = normalize_layout_body_text(&row.cells[3]);
6190 row.cells
6191 })
6192 .collect::<Vec<_>>();
6193
6194 Some(normalized_rows)
6195}
6196
6197#[cfg(not(target_arch = "wasm32"))]
6198fn merge_layout_row_cells(target: &mut [String], source: &[String]) {
6199 for (target_cell, source_cell) in target.iter_mut().zip(source.iter()) {
6200 append_cell_text(target_cell, source_cell);
6201 }
6202}
6203
6204#[cfg(not(target_arch = "wasm32"))]
6205fn normalize_layout_matrix_text(text: &str) -> String {
6206 collapse_inline_whitespace(text)
6207}
6208
6209#[cfg(not(target_arch = "wasm32"))]
6210fn normalize_layout_stage_text(text: &str) -> String {
6211 collapse_inline_whitespace(text)
6212}
6213
6214#[cfg(not(target_arch = "wasm32"))]
6215fn normalize_layout_body_text(text: &str) -> String {
6216 let tokens = text
6217 .split_whitespace()
6218 .filter(|token| {
6219 let bare = token.trim_matches(|ch: char| !ch.is_alphanumeric());
6220 !(bare.len() == 1 && bare.chars().all(|ch| ch.is_ascii_digit()))
6221 })
6222 .collect::<Vec<_>>();
6223 if tokens.is_empty() {
6224 return String::new();
6225 }
6226 collapse_inline_whitespace(&tokens.join(" "))
6227}
6228
6229fn first_heading_like_text(doc: &PdfDocument) -> Option<String> {
6230 for (idx, element) in doc.kids.iter().enumerate().take(8) {
6231 match element {
6232 ContentElement::Heading(h) => {
6233 let text = h.base.base.value();
6234 let trimmed = text.trim();
6235 if !trimmed.is_empty() {
6236 return Some(trimmed.to_string());
6237 }
6238 }
6239 ContentElement::NumberHeading(nh) => {
6240 let text = nh.base.base.base.value();
6241 let trimmed = text.trim();
6242 if !trimmed.is_empty() {
6243 return Some(trimmed.to_string());
6244 }
6245 }
6246 ContentElement::Paragraph(p) => {
6247 let text = clean_paragraph_text(&p.base.value());
6248 let trimmed = text.trim();
6249 if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6250 return Some(trimmed.to_string());
6251 }
6252 }
6253 ContentElement::TextBlock(tb) => {
6254 let text = clean_paragraph_text(&tb.value());
6255 let trimmed = text.trim();
6256 if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6257 return Some(trimmed.to_string());
6258 }
6259 }
6260 ContentElement::TextLine(tl) => {
6261 let text = clean_paragraph_text(&tl.value());
6262 let trimmed = text.trim();
6263 if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6264 return Some(trimmed.to_string());
6265 }
6266 }
6267 _ => {}
6268 }
6269 }
6270 None
6271}
6272
6273fn equivalent_heading_text(left: &str, right: &str) -> bool {
6274 normalize_heading_text(left) == normalize_heading_text(right)
6275}
6276
/// Builds a comparison key for heading text: every non-alphanumeric
/// character is dropped and the rest is lowercased.
fn normalize_heading_text(text: &str) -> String {
    let mut key = String::new();
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            key.extend(ch.to_lowercase());
        }
    }
    key
}
6283
6284fn looks_like_contents_document(doc: &PdfDocument) -> bool {
6285 let Some(first) = first_heading_like_text(doc) else {
6286 return false;
6287 };
6288 if !matches!(
6289 normalize_heading_text(&first).as_str(),
6290 "contents" | "tableofcontents"
6291 ) {
6292 return false;
6293 }
6294
6295 let lines = collect_plain_lines(doc);
6296 if lines.len() < 8 {
6297 return false;
6298 }
6299
6300 let page_like = lines
6301 .iter()
6302 .skip(1)
6303 .filter(|line| ends_with_page_marker(line))
6304 .count();
6305 page_like * 10 >= (lines.len().saturating_sub(1)).max(1) * 6
6306}
6307
6308fn render_contents_document(doc: &PdfDocument) -> String {
6309 render_toc_lines(&collect_plain_lines(doc), true)
6310}
6311
6312fn looks_like_compact_toc_document(doc: &PdfDocument) -> bool {
6313 let lines = collect_plain_lines(doc);
6314 if lines.len() < 8 {
6315 return false;
6316 }
6317
6318 let page_like = lines
6319 .iter()
6320 .filter(|line| ends_with_page_marker(line))
6321 .count();
6322 let support_like = lines
6323 .iter()
6324 .filter(|line| looks_like_toc_support_heading(line))
6325 .count();
6326
6327 page_like >= 3 && support_like >= 2 && (page_like + support_like) * 10 >= lines.len() * 8
6328}
6329
6330fn render_compact_toc_document(doc: &PdfDocument) -> String {
6331 render_toc_lines(&collect_plain_lines(doc), false)
6332}
6333
6334fn render_toc_lines(lines: &[String], has_contents_title: bool) -> String {
6335 let mut out = String::new();
6336 let mut iter = lines.iter();
6337
6338 if has_contents_title {
6339 if let Some(first) = iter.next() {
6340 let trimmed = first.trim();
6341 if !trimmed.is_empty() {
6342 push_toc_heading(&mut out, 1, trimmed);
6343 }
6344 }
6345 }
6346
6347 for line in iter {
6348 let trimmed = line.trim();
6349 if trimmed.is_empty() {
6350 continue;
6351 }
6352
6353 if let Some(level) = toc_heading_level(trimmed, has_contents_title) {
6354 push_toc_heading(&mut out, level, strip_trailing_page_number(trimmed));
6355 continue;
6356 }
6357
6358 if should_render_toc_line_as_bullet(trimmed, has_contents_title) {
6359 out.push_str("- ");
6360 out.push_str(&escape_md_line_start(trimmed));
6361 out.push('\n');
6362 continue;
6363 }
6364
6365 if !out.ends_with("\n\n") && !out.is_empty() {
6366 out.push('\n');
6367 }
6368 out.push_str(&escape_md_line_start(trimmed));
6369 out.push_str("\n\n");
6370 }
6371
6372 out.push('\n');
6373 out
6374}
6375
6376fn toc_heading_level(text: &str, has_contents_title: bool) -> Option<usize> {
6377 let trimmed = strip_trailing_page_number(text).trim();
6378 let lower = trimmed.to_ascii_lowercase();
6379
6380 if has_contents_title {
6381 if lower.starts_with("part ")
6382 || lower.starts_with("chapter ")
6383 || lower.starts_with("appendix ")
6384 {
6385 return Some(2);
6386 }
6387 return None;
6388 }
6389
6390 if lower.starts_with("part ") || lower.starts_with("chapter ") || lower.starts_with("appendix ")
6391 {
6392 return Some(1);
6393 }
6394 if lower.starts_with("section ") {
6395 return Some(2);
6396 }
6397 None
6398}
6399
6400fn should_render_toc_line_as_bullet(text: &str, has_contents_title: bool) -> bool {
6401 has_contents_title && ends_with_page_marker(text) && toc_heading_level(text, true).is_none()
6402}
6403
/// Appends a Markdown heading of the given `level` to `out`, followed by a
/// blank line.
///
/// Blank `text` is ignored. When `out` already has content that does not end
/// with a blank line, one newline is inserted before the heading.
fn push_toc_heading(out: &mut String, level: usize, text: &str) {
    let title = text.trim();
    if title.is_empty() {
        return;
    }

    let needs_separator = !(out.is_empty() || out.ends_with("\n\n"));
    if needs_separator {
        out.push('\n');
    }

    out.extend(std::iter::repeat('#').take(level));
    out.push(' ');
    out.push_str(title);
    out.push_str("\n\n");
}
6418
6419fn collect_plain_lines(doc: &PdfDocument) -> Vec<String> {
6420 let mut lines = Vec::new();
6421 for element in &doc.kids {
6422 match element {
6423 ContentElement::Heading(h) => {
6424 let text = clean_paragraph_text(&h.base.base.value());
6425 if !text.trim().is_empty() {
6426 lines.push(text);
6427 }
6428 }
6429 ContentElement::NumberHeading(nh) => {
6430 let text = clean_paragraph_text(&nh.base.base.base.value());
6431 if !text.trim().is_empty() {
6432 lines.push(text);
6433 }
6434 }
6435 ContentElement::Paragraph(p) => {
6436 let text = clean_paragraph_text(&p.base.value());
6437 if !text.trim().is_empty() {
6438 lines.push(text);
6439 }
6440 }
6441 ContentElement::TextBlock(tb) => {
6442 let text = clean_paragraph_text(&tb.value());
6443 if !text.trim().is_empty() {
6444 lines.push(text);
6445 }
6446 }
6447 ContentElement::TextLine(tl) => {
6448 let text = clean_paragraph_text(&tl.value());
6449 if !text.trim().is_empty() {
6450 lines.push(text);
6451 }
6452 }
6453 ContentElement::List(list) => {
6454 for item in &list.list_items {
6455 let label = token_rows_text(&item.label.content);
6456 let body = token_rows_text(&item.body.content);
6457 let combined = if !label.trim().is_empty() && !body.trim().is_empty() {
6458 format!("{} {}", label.trim(), body.trim())
6459 } else if !body.trim().is_empty() {
6460 body.trim().to_string()
6461 } else if !label.trim().is_empty() {
6462 label.trim().to_string()
6463 } else {
6464 list_item_text_from_contents(&item.contents)
6465 .trim()
6466 .to_string()
6467 };
6468 if !combined.trim().is_empty() {
6469 lines.push(combined);
6470 }
6471 }
6472 }
6473 ContentElement::Table(table) => {
6474 extend_contents_lines_from_rows(
6475 &mut lines,
6476 collect_rendered_table_rows(
6477 &table.table_border.rows,
6478 table.table_border.num_columns,
6479 ),
6480 );
6481 }
6482 ContentElement::TableBorder(table) => {
6483 extend_contents_lines_from_rows(
6484 &mut lines,
6485 collect_rendered_table_rows(&table.rows, table.num_columns),
6486 );
6487 }
6488 _ => {}
6489 }
6490 }
6491 lines
6492}
6493
6494fn extend_contents_lines_from_rows(lines: &mut Vec<String>, rows: Vec<Vec<String>>) {
6495 if rows.is_empty() {
6496 return;
6497 }
6498
6499 if is_toc_table(&rows) {
6500 for row in &rows {
6501 let title = row.first().map(|s| s.trim()).unwrap_or("");
6502 let page = row.get(1).map(|s| s.trim()).unwrap_or("");
6503 let combined = if !title.is_empty() && !page.is_empty() {
6504 format!("{title} {page}")
6505 } else {
6506 format!("{title}{page}")
6507 };
6508 if !combined.trim().is_empty() {
6509 lines.push(combined);
6510 }
6511 }
6512 } else {
6513 for row in &rows {
6515 let combined: String = row
6516 .iter()
6517 .map(|c| c.trim())
6518 .filter(|c| !c.is_empty())
6519 .collect::<Vec<_>>()
6520 .join(" ");
6521 if !combined.is_empty() {
6522 lines.push(combined);
6523 }
6524 }
6525 }
6526}
6527
6528fn collect_rendered_table_rows(
6529 rows: &[crate::models::table::TableBorderRow],
6530 num_cols: usize,
6531) -> Vec<Vec<String>> {
6532 let num_cols = num_cols.max(1);
6533 let mut rendered_rows: Vec<Vec<String>> = Vec::new();
6534
6535 for row in rows {
6536 let cell_texts: Vec<String> = (0..num_cols)
6537 .map(|col| {
6538 row.cells
6539 .iter()
6540 .find(|c| c.col_number == col)
6541 .map(cell_text_content)
6542 .unwrap_or_default()
6543 })
6544 .collect();
6545 if !cell_texts.iter().all(|t| t.trim().is_empty()) {
6546 rendered_rows.push(cell_texts);
6547 }
6548 }
6549
6550 rendered_rows
6551}
6552
6553fn ends_with_page_marker(text: &str) -> bool {
6554 text.split_whitespace()
6555 .last()
6556 .is_some_and(is_page_number_like)
6557}
6558
6559fn looks_like_toc_support_heading(text: &str) -> bool {
6560 let trimmed = text.trim();
6561 if trimmed.is_empty() || ends_with_page_marker(trimmed) {
6562 return false;
6563 }
6564 if trimmed.ends_with(['.', ';', ':', '?', '!']) {
6565 return false;
6566 }
6567
6568 let lower = trimmed.to_ascii_lowercase();
6569 if !(lower.starts_with("part ")
6570 || lower.starts_with("chapter ")
6571 || lower.starts_with("appendix ")
6572 || lower.starts_with("section "))
6573 {
6574 return false;
6575 }
6576
6577 let word_count = trimmed.split_whitespace().count();
6578 (2..=16).contains(&word_count) && trimmed.chars().any(char::is_alphabetic)
6579}
6580
6581fn split_leading_caption_and_body(text: &str) -> Option<(&str, &str)> {
6582 if !starts_with_caption_prefix(text) || !text.contains("(credit") {
6583 return None;
6584 }
6585
6586 for needle in [") ", ". "] {
6587 let mut search_start = 0usize;
6588 while let Some(rel_idx) = text[search_start..].find(needle) {
6589 let boundary = search_start + rel_idx + needle.len() - 1;
6590 let head = text[..=boundary].trim();
6591 let tail = text[boundary + 1..].trim_start();
6592 search_start = boundary + 1;
6593 if head.split_whitespace().count() < 10 || head.split_whitespace().count() > 80 {
6594 continue;
6595 }
6596 if tail.split_whitespace().count() < 10 {
6597 continue;
6598 }
6599 if !starts_with_uppercase_word(tail) || starts_with_caption_prefix(tail) {
6600 continue;
6601 }
6602 return Some((head, tail));
6603 }
6604 }
6605
6606 None
6607}
6608
6609fn is_short_caption_label(text: &str) -> bool {
6610 if !starts_with_caption_prefix(text) {
6611 return false;
6612 }
6613
6614 let trimmed = text.trim();
6615 trimmed.split_whitespace().count() <= 3 && trimmed.len() <= 24 && !trimmed.ends_with(['.', ':'])
6616}
6617
6618fn split_following_caption_tail_and_body(text: &str) -> Option<(&str, &str)> {
6619 let trimmed = text.trim();
6620 if trimmed.is_empty()
6621 || starts_with_caption_prefix(trimmed)
6622 || !starts_with_uppercase_word(trimmed)
6623 {
6624 return None;
6625 }
6626
6627 for starter in [
6628 " As ", " In ", " The ", " This ", " These ", " It ", " They ", " We ", " On ", " At ",
6629 ] {
6630 if let Some(idx) = text.find(starter) {
6631 let head = text[..idx].trim();
6632 let tail = text[idx + 1..].trim();
6633 if head.split_whitespace().count() >= 3
6634 && head.split_whitespace().count() <= 24
6635 && tail.split_whitespace().count() >= 8
6636 {
6637 return Some((head, tail));
6638 }
6639 }
6640 }
6641
6642 None
6643}
6644
6645fn looks_like_caption_tail(text: &str) -> bool {
6646 let trimmed = text.trim();
6647 if trimmed.is_empty() || trimmed.ends_with(['.', '!', '?']) {
6648 return false;
6649 }
6650
6651 let word_count = trimmed.split_whitespace().count();
6652 if !(3..=18).contains(&word_count) {
6653 return false;
6654 }
6655
6656 starts_with_uppercase_word(trimmed)
6657 && !starts_with_caption_prefix(trimmed)
6658 && !trimmed.contains(':')
6659}
6660
/// Returns `true` when the trimmed text is exactly four ASCII digits.
fn looks_like_caption_year(text: &str) -> bool {
    let candidate = text.trim();
    candidate.len() == 4 && candidate.bytes().all(|byte| byte.is_ascii_digit())
}
6665
6666fn token_rows_text(rows: &[TableTokenRow]) -> String {
6668 normalize_common_ocr_text(&repair_fragmented_words(
6669 &rows
6670 .iter()
6671 .flat_map(|row| row.iter())
6672 .map(|token| token.base.value.as_str())
6673 .collect::<Vec<_>>()
6674 .join(" "),
6675 ))
6676}
6677
6678fn render_element(out: &mut String, element: &ContentElement) {
6679 match element {
6680 ContentElement::Heading(h) => {
6681 let text = h.base.base.value();
6682 let trimmed = text.trim();
6683 if should_skip_heading_text(trimmed) {
6684 return;
6685 }
6686 out.push_str(&format!("# {}\n\n", trimmed));
6687 }
6688 ContentElement::Paragraph(p) => {
6689 let text = p.base.value();
6690 let trimmed = clean_paragraph_text(&text);
6691 if !trimmed.is_empty() {
6692 out.push_str(&escape_md_line_start(&trimmed));
6693 if p.base.semantic_type == SemanticType::TableOfContent {
6694 out.push('\n');
6695 } else {
6696 out.push_str("\n\n");
6697 }
6698 }
6699 }
6700 ContentElement::List(list) => {
6701 let mut i = 0usize;
6702 let mut pending_item: Option<String> = None;
6703 while i < list.list_items.len() {
6704 let item = &list.list_items[i];
6705 let label = token_rows_text(&item.label.content);
6706 let body = token_rows_text(&item.body.content);
6707 let label_trimmed = normalize_list_text(label.trim());
6708 let body_trimmed = normalize_list_text(body.trim());
6709 let combined = if !label_trimmed.is_empty() && !body_trimmed.is_empty() {
6710 format!("{label_trimmed} {body_trimmed}")
6711 } else if !body_trimmed.is_empty() {
6712 body_trimmed.to_string()
6713 } else {
6714 label_trimmed.to_string()
6715 };
6716 let combined = if combined.trim().is_empty() && !item.contents.is_empty() {
6717 list_item_text_from_contents(&item.contents)
6718 } else {
6719 combined
6720 };
6721
6722 if is_list_section_heading(&combined) {
6723 if let Some(pending) = pending_item.take() {
6724 push_rendered_list_item(out, pending.trim());
6725 }
6726 out.push_str(&format!("# {}\n\n", combined.trim_end_matches(':').trim()));
6727 i += 1;
6728 continue;
6729 }
6730
6731 if is_pure_bullet_marker(&label_trimmed) && body_trimmed.is_empty() {
6732 i += 1;
6733 continue;
6734 }
6735
6736 if looks_like_stray_list_page_number(&combined) {
6737 i += 1;
6738 continue;
6739 }
6740
6741 let current_item = if !label_trimmed.is_empty() || !body_trimmed.is_empty() {
6742 if !label_trimmed.is_empty()
6743 && !body_trimmed.is_empty()
6744 && !is_pure_bullet_marker(&label_trimmed)
6745 {
6746 format!("{label_trimmed} {body_trimmed}")
6747 } else if !body_trimmed.is_empty() {
6748 body_trimmed.to_string()
6749 } else if !is_pure_bullet_marker(&label_trimmed) {
6750 label_trimmed.to_string()
6751 } else {
6752 String::new()
6753 }
6754 } else if !item.contents.is_empty() {
6755 normalize_list_text(list_item_text_from_contents(&item.contents).trim())
6756 } else {
6757 String::new()
6758 };
6759
6760 if current_item.is_empty() {
6761 i += 1;
6762 continue;
6763 }
6764
6765 if let Some(previous) = pending_item.as_mut() {
6766 if should_merge_list_continuation(previous, ¤t_item) {
6767 merge_paragraph_text(previous, ¤t_item);
6768 i += 1;
6769 continue;
6770 }
6771 }
6772
6773 if let Some(pending) = pending_item.replace(current_item) {
6774 push_rendered_list_item(out, pending.trim());
6775 }
6776 i += 1;
6777 }
6778 if let Some(pending) = pending_item.take() {
6779 push_rendered_list_item(out, pending.trim());
6780 }
6781 out.push('\n');
6782 }
6783 ContentElement::Table(table) => {
6784 render_table(out, table);
6785 }
6786 ContentElement::TableBorder(table) => {
6787 render_table_border(out, table);
6788 }
6789 ContentElement::Formula(f) => {
6790 let latex = f.latex.trim();
6791 if !latex.is_empty() {
6792 out.push_str(&format!("$$\n{}\n$$\n\n", latex));
6793 }
6794 }
6795 ContentElement::Caption(c) => {
6796 let text = c.base.value();
6797 let normalized = normalize_common_ocr_text(text.trim());
6798 let trimmed = normalized.trim();
6799 if !trimmed.is_empty() {
6800 out.push_str(&format!("*{}*\n\n", trimmed));
6801 }
6802 }
6803 ContentElement::NumberHeading(nh) => {
6804 let text = nh.base.base.base.value();
6805 let trimmed = text.trim();
6806 if should_skip_heading_text(trimmed) {
6807 return;
6808 }
6809 out.push_str(&format!("# {}\n\n", trimmed));
6810 }
6811 ContentElement::Image(_) => {
6812 out.push_str("\n\n");
6813 }
6814 ContentElement::HeaderFooter(_) => {
6815 }
6817 ContentElement::TextBlock(tb) => {
6818 let text = tb.value();
6819 let trimmed = clean_paragraph_text(&text);
6820 if !trimmed.is_empty() {
6821 out.push_str(&escape_md_line_start(&trimmed));
6822 out.push_str("\n\n");
6823 }
6824 }
6825 ContentElement::TextLine(tl) => {
6826 let text = tl.value();
6827 let normalized = normalize_common_ocr_text(text.trim());
6828 let trimmed = normalized.trim();
6829 if !trimmed.is_empty() {
6830 out.push_str(trimmed);
6831 out.push('\n');
6832 }
6833 }
6834 ContentElement::TextChunk(tc) => {
6835 out.push_str(&tc.value);
6836 }
6837 _ => {}
6838 }
6839}
6840
/// Prefixes the text with a backslash when its first character is `>` or `#`, so Markdown
/// does not read the line as a block quote or heading. Other text is returned unchanged.
fn escape_md_line_start(text: &str) -> String {
    match text.chars().next() {
        Some('>' | '#') => format!("\\{}", text),
        _ => text.to_string(),
    }
}
6849
/// Returns `true` when the text, ignoring leading whitespace and ASCII case, begins with
/// one of the known caption or credit prefixes.
fn starts_with_caption_prefix(text: &str) -> bool {
    const CAPTION_PREFIXES: &[&str] = &[
        "figure ",
        "fig. ",
        "table ",
        "tab. ",
        "chart ",
        "graph ",
        "image ",
        "illustration ",
        "diagram ",
        "plate ",
        "map ",
        "exhibit ",
        "photo by ",
        "photo credit",
        "image by ",
        "image credit",
        "image courtesy",
        "photo courtesy",
        "credit: ",
        "source: ",
    ];

    let lowered = text.trim_start().to_ascii_lowercase();
    for prefix in CAPTION_PREFIXES {
        if lowered.starts_with(*prefix) {
            return true;
        }
    }
    false
}
6877
/// Returns `true` when the trimmed text begins (ASCII case-insensitively) with
/// "figure ", "table ", "diagram " or "chart ".
fn is_structural_caption(text: &str) -> bool {
    let lowered = text.trim().to_ascii_lowercase();
    ["figure ", "table ", "diagram ", "chart "]
        .iter()
        .any(|keyword| lowered.starts_with(*keyword))
}
6885
6886fn normalize_chart_like_markdown(markdown: &str) -> String {
6887 let blocks: Vec<&str> = markdown
6888 .split("\n\n")
6889 .map(str::trim)
6890 .filter(|block| !block.is_empty())
6891 .collect();
6892 if blocks.is_empty() {
6893 return markdown.trim().to_string();
6894 }
6895
6896 let mut normalized = Vec::new();
6897 let mut i = 0usize;
6898 while i < blocks.len() {
6899 if let Some(rendered) = trim_large_top_table_plate(&blocks, i) {
6900 normalized.push(rendered);
6901 break;
6902 }
6903
6904 if let Some((rendered, consumed)) = render_header_pair_chart_table(&blocks, i) {
6905 normalized.push(rendered);
6906 i += consumed;
6907 continue;
6908 }
6909
6910 if let Some((rendered, consumed)) = render_chart_block(&blocks, i) {
6911 normalized.push(rendered);
6912 i += consumed;
6913 continue;
6914 }
6915
6916 if let Some((rendered, consumed)) = render_structural_caption_block(&blocks, i) {
6917 normalized.push(rendered);
6918 i += consumed;
6919 continue;
6920 }
6921
6922 if should_drop_artifact_table_block(&blocks, i) {
6923 i += 1;
6924 continue;
6925 }
6926
6927 if !looks_like_footer_banner(blocks[i]) {
6928 normalized.push(blocks[i].to_string());
6929 }
6930 i += 1;
6931 }
6932
6933 normalized.join("\n\n").trim().to_string() + "\n"
6934}
6935
6936fn trim_large_top_table_plate(blocks: &[&str], start: usize) -> Option<String> {
6937 if start != 0 {
6938 return None;
6939 }
6940
6941 let rows = parse_pipe_table_block(blocks.first()?.trim())?;
6942 let body_rows = rows.len().saturating_sub(2);
6943 let max_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
6944 if body_rows < 8 || max_cols < 8 {
6945 return None;
6946 }
6947
6948 let caption = blocks.get(1)?.trim();
6949 if !caption.starts_with("Table ") || caption.split_whitespace().count() < 12 {
6950 return None;
6951 }
6952
6953 let has_following_section = blocks.iter().skip(2).any(|block| {
6954 let trimmed = block.trim();
6955 trimmed.starts_with("# ")
6956 || trimmed.starts_with("## ")
6957 || trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
6958 && trimmed.contains(" Main Results")
6959 });
6960 has_following_section.then_some(blocks[0].trim().to_string())
6961}
6962
6963fn render_header_pair_chart_table(blocks: &[&str], start: usize) -> Option<(String, usize)> {
6964 let caption = blocks.get(start)?.trim();
6965 if !is_structural_caption(caption) {
6966 return None;
6967 }
6968
6969 let rows = parse_pipe_table_block(blocks.get(start + 1)?)?;
6970 if rows.len() != 2 {
6971 return None;
6972 }
6973
6974 let pairs = extract_value_year_pairs_from_cells(&rows[0]);
6975 if pairs.len() < 4 {
6976 return None;
6977 }
6978
6979 let mut source = String::new();
6980 let mut consumed = 2usize;
6981 if let Some(next_block) = blocks.get(start + 2) {
6982 let next = next_block.trim();
6983 if next.to_ascii_lowercase().starts_with("source:") {
6984 source = next.to_string();
6985 consumed += 1;
6986 }
6987 }
6988
6989 let mut out = String::new();
6990 let heading_prefix = if start == 0 { "# " } else { "## " };
6991 out.push_str(heading_prefix);
6992 out.push_str(caption);
6993 out.push_str("\n\n");
6994 out.push_str(&format!("| Year | {} |\n", chart_value_header(caption)));
6995 out.push_str("| --- | --- |\n");
6996 for (year, value) in pairs {
6997 out.push_str(&format!("| {} | {} |\n", year, value));
6998 }
6999 out.push('\n');
7000
7001 if !source.is_empty() {
7002 out.push('*');
7003 out.push_str(&escape_md_line_start(&source));
7004 out.push_str("*\n\n");
7005 }
7006
7007 Some((out.trim().to_string(), consumed))
7008}
7009
7010fn render_chart_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
7011 let (caption, numeric_tokens) = split_chart_caption_and_values(blocks.get(start)?)?;
7012 let mut consumed = 1usize;
7013
7014 let mut source = String::new();
7015 let mut labels = Vec::new();
7016 if let Some(next_block) = blocks.get(start + 1) {
7017 let (candidate_labels, candidate_source) = extract_chart_labels_and_source(next_block);
7018 if !candidate_source.is_empty() || !candidate_labels.is_empty() {
7019 labels = candidate_labels;
7020 source = candidate_source;
7021 consumed += 1;
7022 }
7023 }
7024
7025 while let Some(block) = blocks.get(start + consumed) {
7026 if looks_like_numeric_noise_block(block) {
7027 consumed += 1;
7028 continue;
7029 }
7030 break;
7031 }
7032
7033 let value_tokens = derive_chart_series_values(&numeric_tokens, labels.len());
7034
7035 let mut out = String::new();
7036 out.push_str("## ");
7037 out.push_str(caption.trim());
7038 out.push_str("\n\n");
7039
7040 if labels.len() >= 3 && labels.len() == value_tokens.len() {
7041 let label_header = if labels.iter().all(|label| looks_like_yearish_label(label)) {
7042 "Year"
7043 } else {
7044 "Label"
7045 };
7046 let value_header = chart_value_header(&caption);
7047 out.push_str(&format!("| {} | {} |\n", label_header, value_header));
7048 out.push_str("| --- | --- |\n");
7049 for (label, value) in labels.iter().zip(value_tokens.iter()) {
7050 out.push_str(&format!("| {} | {} |\n", label, value));
7051 }
7052 out.push('\n');
7053 }
7054
7055 if !source.is_empty() {
7056 out.push('*');
7057 out.push_str(&escape_md_line_start(&source));
7058 out.push_str("*\n\n");
7059 }
7060
7061 Some((out.trim().to_string(), consumed))
7062}
7063
7064fn render_structural_caption_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
7065 let block = blocks.get(start)?.trim();
7066 if !is_structural_caption(block) || block.contains('|') {
7067 return None;
7068 }
7069
7070 let mut caption = collapse_inline_whitespace(block);
7071 let mut consumed = 1usize;
7072 if let Some(next_block) = blocks.get(start + 1) {
7073 let next = next_block.trim();
7074 if looks_like_caption_continuation(next) {
7075 caption.push(' ');
7076 caption.push_str(next.trim_end_matches('.'));
7077 consumed += 1;
7078 } else if !looks_like_isolated_caption_context(block, next) {
7079 return None;
7080 }
7081 } else {
7082 return None;
7083 }
7084
7085 Some((format!("## {}", caption.trim()), consumed))
7086}
7087
7088fn split_chart_caption_and_values(block: &str) -> Option<(String, Vec<String>)> {
7089 let trimmed = block.trim();
7090 if !is_structural_caption(trimmed) {
7091 return None;
7092 }
7093
7094 let tokens: Vec<&str> = trimmed.split_whitespace().collect();
7095 let first_numeric_idx = tokens.iter().position(|token| is_numberish_token(token))?;
7096 if first_numeric_idx < 3 {
7097 return None;
7098 }
7099
7100 let caption = tokens[..first_numeric_idx].join(" ");
7101 let numeric_tokens: Vec<String> = tokens[first_numeric_idx..]
7102 .iter()
7103 .filter_map(|token| sanitize_numberish_token(token))
7104 .collect();
7105
7106 if numeric_tokens.len() < 4 {
7107 return None;
7108 }
7109
7110 Some((caption, numeric_tokens))
7111}
7112
7113fn parse_pipe_table_block(block: &str) -> Option<Vec<Vec<String>>> {
7114 let lines: Vec<&str> = block
7115 .lines()
7116 .map(str::trim)
7117 .filter(|line| !line.is_empty())
7118 .collect();
7119 if lines.len() < 2 {
7120 return None;
7121 }
7122
7123 let header = split_pipe_row(lines[0])?;
7124 if !is_pipe_separator_row(lines[1], header.len()) {
7125 return None;
7126 }
7127
7128 let mut rows = vec![header];
7129 rows.push(split_pipe_row(lines[1]).unwrap_or_default());
7130 for line in lines.iter().skip(2) {
7131 let row = split_pipe_row(line)?;
7132 rows.push(row);
7133 }
7134 Some(rows)
7135}
7136
/// Splits one Markdown pipe-table line into its trimmed cells.
///
/// The trimmed line must both start and end with `|`, using two distinct pipe characters;
/// otherwise `None` is returned. A lone `"|"` therefore yields `None` instead of panicking
/// on an inverted slice range. `"||"` yields a single empty cell.
fn split_pipe_row(line: &str) -> Option<Vec<String>> {
    // Stripping the prefix first guarantees the suffix check cannot reuse the same pipe.
    let inner = line.trim().strip_prefix('|')?.strip_suffix('|')?;

    Some(
        inner
            .split('|')
            .map(|cell| cell.trim().to_string())
            .collect(),
    )
}
7150
7151fn is_pipe_separator_row(line: &str, expected_cols: usize) -> bool {
7152 let Some(cells) = split_pipe_row(line) else {
7153 return false;
7154 };
7155 if cells.len() != expected_cols || expected_cols == 0 {
7156 return false;
7157 }
7158
7159 cells.iter().all(|cell| {
7160 let stripped = cell.trim_matches(':').trim();
7161 !stripped.is_empty() && stripped.chars().all(|ch| ch == '-')
7162 })
7163}
7164
7165fn extract_value_year_pairs_from_cells(cells: &[String]) -> Vec<(String, String)> {
7166 let mut pairs = Vec::new();
7167 for cell in cells {
7168 let tokens: Vec<&str> = cell.split_whitespace().collect();
7169 if tokens.len() != 2 {
7170 continue;
7171 }
7172
7173 if looks_like_year_token(tokens[0]) && is_numberish_token(tokens[1]) {
7174 if let Some(value) = sanitize_numberish_token(tokens[1]) {
7175 pairs.push((tokens[0].to_string(), value));
7176 }
7177 continue;
7178 }
7179
7180 if is_numberish_token(tokens[0]) && looks_like_year_token(tokens[1]) {
7181 if let Some(value) = sanitize_numberish_token(tokens[0]) {
7182 pairs.push((tokens[1].to_string(), value));
7183 }
7184 }
7185 }
7186
7187 pairs.sort_by(|left, right| left.0.cmp(&right.0));
7188 pairs
7189}
7190
7191fn should_drop_artifact_table_block(blocks: &[&str], start: usize) -> bool {
7192 let Some(rows) = parse_pipe_table_block(blocks[start]) else {
7193 return false;
7194 };
7195
7196 let prev = start
7197 .checked_sub(1)
7198 .and_then(|idx| blocks.get(idx))
7199 .map(|block| block.trim())
7200 .unwrap_or("");
7201 let next = blocks
7202 .get(start + 1)
7203 .map(|block| block.trim())
7204 .unwrap_or("");
7205
7206 if rows.len() == 2 && rows.first().is_some_and(|row| row.len() == 1) {
7207 let header = rows[0][0].trim();
7208 if looks_like_url_fragment(header) {
7209 return true;
7210 }
7211 if looks_like_numeric_axis_blob(header) && !previous_block_announces_table(prev) {
7212 return true;
7213 }
7214 }
7215
7216 let stats = pipe_table_stats(&rows);
7217 stats.fill_ratio < 0.5
7218 && stats.long_cell_count == 0
7219 && !is_structural_caption(prev)
7220 && (looks_like_citation_block(next) || is_structural_caption(next))
7221}
7222
/// Returns `true` when the block (trimmed, ASCII case-insensitive) ends with an
/// announcing phrase such as "as follows:" or contains "the following details".
fn previous_block_announces_table(block: &str) -> bool {
    let lowered = block.trim().to_ascii_lowercase();
    let announcing_suffix = ["as follows:", "following details:", "following detail:"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix));
    announcing_suffix || lowered.contains("the following details")
}
7230
/// Returns `true` when the trimmed text contains `"http"` or `"/status/"`, or contains a
/// slash but no space.
fn looks_like_url_fragment(text: &str) -> bool {
    let fragment = text.trim();
    if fragment.contains("http") || fragment.contains("/status/") {
        return true;
    }
    fragment.contains('/') && !fragment.contains(' ')
}
7236
7237fn looks_like_numeric_axis_blob(text: &str) -> bool {
7238 let numeric_values: Vec<i64> = text
7239 .split_whitespace()
7240 .filter_map(parse_integer_token)
7241 .collect();
7242 numeric_values.len() >= 8
7243 && !detect_axis_progression(&numeric_values).is_empty()
7244 && text.chars().any(char::is_alphabetic)
7245}
7246
/// Returns `true` for a short parenthesized block: the trimmed text starts with `(`, ends
/// with `)`, and has at most eight words.
fn looks_like_citation_block(block: &str) -> bool {
    let candidate = block.trim();
    let parenthesized = candidate.starts_with('(') && candidate.ends_with(')');
    parenthesized && candidate.split_whitespace().count() <= 8
}
7251
/// Summary figures for the body of a parsed pipe table.
struct PipeTableStats {
    /// Non-blank body cells divided by (body row count x widest row length); `0.0` when the
    /// table has no body rows.
    fill_ratio: f64,
    /// Number of non-blank body cells that contain three or more words.
    long_cell_count: usize,
}

/// Computes [`PipeTableStats`] for `rows`, treating the first two rows (header and
/// separator) as non-body rows.
fn pipe_table_stats(rows: &[Vec<String>]) -> PipeTableStats {
    let widest_row = rows.iter().map(Vec::len).max().unwrap_or(0).max(1);
    let body_row_count = rows.len().saturating_sub(2);

    let (nonempty, long_cell_count) = rows
        .iter()
        .skip(2)
        .flatten()
        .filter(|cell| !cell.trim().is_empty())
        .fold((0usize, 0usize), |(filled, long), cell| {
            let is_long = cell.split_whitespace().count() >= 3;
            (filled + 1, long + usize::from(is_long))
        });

    let fill_ratio = match body_row_count {
        0 => 0.0,
        _ => nonempty as f64 / (body_row_count * widest_row) as f64,
    };

    PipeTableStats {
        fill_ratio,
        long_cell_count,
    }
}
7285
7286fn extract_chart_labels_and_source(block: &str) -> (Vec<String>, String) {
7287 let trimmed = block.trim();
7288 let lower = trimmed.to_ascii_lowercase();
7289 let source_idx = lower.find("source:");
7290
7291 let label_region = source_idx.map_or(trimmed, |idx| trimmed[..idx].trim());
7292 let source = source_idx
7293 .map(|idx| trimmed[idx..].trim().to_string())
7294 .unwrap_or_default();
7295
7296 let labels = parse_chart_labels(label_region);
7297 (labels, source)
7298}
7299
7300fn parse_chart_labels(text: &str) -> Vec<String> {
7301 let tokens: Vec<&str> = text.split_whitespace().collect();
7302 let mut labels = Vec::new();
7303 let mut i = 0usize;
7304 while i < tokens.len() {
7305 let token = tokens[i].trim_matches(|c: char| c == ',' || c == ';');
7306 if looks_like_year_token(token) {
7307 let mut label = token.to_string();
7308 if let Some(next) = tokens.get(i + 1) {
7309 let next_trimmed = next.trim_matches(|c: char| c == ',' || c == ';');
7310 if next_trimmed.starts_with('(') && next_trimmed.ends_with(')') {
7311 label.push(' ');
7312 label.push_str(next_trimmed);
7313 i += 1;
7314 }
7315 }
7316 labels.push(label);
7317 } else if looks_like_category_label(token) {
7318 labels.push(token.to_string());
7319 }
7320 i += 1;
7321 }
7322 labels
7323}
7324
7325fn derive_chart_series_values(tokens: &[String], expected_count: usize) -> Vec<String> {
7326 if expected_count == 0 {
7327 return Vec::new();
7328 }
7329
7330 if tokens.len() == expected_count {
7331 return tokens.to_vec();
7332 }
7333
7334 let numeric_values: Vec<i64> = tokens
7335 .iter()
7336 .filter_map(|token| parse_integer_token(token))
7337 .collect();
7338 if numeric_values.len() != tokens.len() {
7339 return Vec::new();
7340 }
7341
7342 let axis_series = detect_axis_progression(&numeric_values);
7343 if axis_series.is_empty() {
7344 return Vec::new();
7345 }
7346
7347 let mut remaining = Vec::new();
7348 let mut removable = axis_series;
7349 for token in tokens {
7350 let Some(value) = parse_integer_token(token) else {
7351 continue;
7352 };
7353 if let Some(pos) = removable.iter().position(|candidate| *candidate == value) {
7354 removable.remove(pos);
7355 } else {
7356 remaining.push(token.clone());
7357 }
7358 }
7359
7360 if remaining.len() == expected_count {
7361 remaining
7362 } else {
7363 Vec::new()
7364 }
7365}
7366
/// Finds the longest arithmetic progression that looks like chart axis ticks.
///
/// The distinct values are sorted. For every pair of neighbours, the progression that
/// starts at the smaller value and repeats their difference is followed for as long as
/// each next value is present. The longest such run is returned if it has at least six
/// members; otherwise the result is empty.
///
/// All arithmetic is checked, so extreme inputs (for example values near `i64::MAX` or
/// `i64::MIN`) end a run instead of overflowing.
fn detect_axis_progression(values: &[i64]) -> Vec<i64> {
    const MIN_AXIS_POINTS: usize = 6;

    if values.len() < MIN_AXIS_POINTS {
        return Vec::new();
    }

    let mut sorted = values.to_vec();
    sorted.sort_unstable();
    sorted.dedup();
    if sorted.len() < MIN_AXIS_POINTS {
        return Vec::new();
    }

    let mut best: Vec<i64> = Vec::new();
    for window in sorted.windows(2) {
        // A difference that does not fit in i64 cannot be a usable step.
        let Some(step) = window[1].checked_sub(window[0]) else {
            continue;
        };
        if step <= 0 {
            continue;
        }

        let mut series = vec![window[0]];
        let mut current = window[0];
        // Stop when the next tick is missing or would exceed i64::MAX.
        while let Some(next) = current.checked_add(step) {
            if sorted.binary_search(&next).is_err() {
                break;
            }
            series.push(next);
            current = next;
        }

        if series.len() > best.len() {
            best = series;
        }
    }

    if best.len() >= MIN_AXIS_POINTS {
        best
    } else {
        Vec::new()
    }
}
7409
7410fn chart_value_header(caption: &str) -> String {
7411 let trimmed = caption.trim();
7412 let title = strip_structural_caption_prefix(trimmed);
7413
7414 let mut base = title.to_string();
7415 if let Some(idx) = base.rfind(" in ") {
7416 let tail = base[idx + 4..].trim();
7417 if tail.split_whitespace().count() <= 2
7418 && tail.chars().next().is_some_and(char::is_uppercase)
7419 {
7420 base.truncate(idx);
7421 }
7422 }
7423
7424 if let Some(start) = title.rfind('(') {
7425 if title.ends_with(')') {
7426 let unit = title[start + 1..title.len() - 1].trim();
7427 if let Some(idx) = base.rfind('(') {
7428 base.truncate(idx);
7429 }
7430 let normalized_unit = unit.strip_prefix("in ").unwrap_or(unit).trim();
7431 return format!("{} ({})", base.trim(), normalized_unit);
7432 }
7433 }
7434
7435 let trimmed = base.trim();
7436 if trimmed.is_empty() {
7437 "Value".to_string()
7438 } else {
7439 trimmed.to_string()
7440 }
7441}
7442
/// Removes a leading "<kind> <number>" pair from a caption.
///
/// `<kind>` is figure/table/diagram/chart in any ASCII case, and `<number>` consists only
/// of ASCII digits, `.` and `:`. The text is split on single spaces into at most three
/// parts; if fewer than three parts exist or the pattern does not match, the trimmed input
/// is returned unchanged.
fn strip_structural_caption_prefix(text: &str) -> &str {
    let caption = text.trim();
    let mut pieces = caption.splitn(3, ' ');
    let (Some(kind), Some(number), Some(remainder)) =
        (pieces.next(), pieces.next(), pieces.next())
    else {
        return caption;
    };

    let is_known_kind = ["figure", "table", "diagram", "chart"]
        .iter()
        .any(|known| kind.eq_ignore_ascii_case(known));
    let is_numbering = number
        .chars()
        .all(|ch| ch.is_ascii_digit() || ch == '.' || ch == ':');

    if is_known_kind && is_numbering {
        remainder.trim()
    } else {
        caption
    }
}
7469
/// Heuristic for a running footer such as "Journal of Science 45".
///
/// The block must be a single line of at least 8 bytes with 2 to 6 tokens. The last token
/// must be all ASCII digits, and every earlier token must be either a small connector word
/// or start with an uppercase character.
fn looks_like_footer_banner(block: &str) -> bool {
    const CONNECTORS: [&str; 6] = ["of", "and", "the", "for", "in", "on"];

    let banner = block.trim();
    if banner.len() < 8 || banner.contains('\n') {
        return false;
    }

    let words: Vec<&str> = banner.split_whitespace().collect();
    if words.len() < 2 || words.len() > 6 {
        return false;
    }

    let Some((page_number, title_words)) = words.split_last() else {
        return false;
    };
    if !page_number.bytes().all(|byte| byte.is_ascii_digit()) {
        return false;
    }

    title_words.iter().all(|word| {
        let is_connector = CONNECTORS
            .iter()
            .any(|connector| word.eq_ignore_ascii_case(connector));
        is_connector || word.chars().next().is_some_and(char::is_uppercase)
    })
}
7495
/// Returns `true` when the trimmed block starts with an uppercase character, has at most
/// eight words, and contains no colon.
fn looks_like_caption_continuation(block: &str) -> bool {
    let candidate = block.trim();
    match candidate.chars().next() {
        Some(first) if first.is_uppercase() => {
            candidate.split_whitespace().count() <= 8 && !candidate.contains(':')
        }
        _ => false,
    }
}
7503
/// Replaces every run of whitespace with a single space and drops leading and trailing
/// whitespace.
fn collapse_inline_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
7507
7508fn drop_isolated_noise_lines(markdown: &str) -> String {
7509 let lines: Vec<&str> = markdown.lines().collect();
7510 let mut kept = Vec::with_capacity(lines.len());
7511
7512 for (idx, line) in lines.iter().enumerate() {
7513 if should_drop_isolated_noise_line(&lines, idx) {
7514 continue;
7515 }
7516 kept.push(*line);
7517 }
7518
7519 let mut result = kept.join("\n");
7520 if markdown.ends_with('\n') {
7521 result.push('\n');
7522 }
7523 result
7524}
7525
7526fn should_drop_isolated_noise_line(lines: &[&str], idx: usize) -> bool {
7527 let trimmed = lines[idx].trim();
7528 if trimmed.len() != 1 {
7529 return false;
7530 }
7531
7532 let ch = trimmed.chars().next().unwrap_or_default();
7533 if !(ch.is_ascii_lowercase() || ch.is_ascii_digit()) {
7534 return false;
7535 }
7536
7537 let prev = previous_nonempty_line(lines, idx);
7538 let next = next_nonempty_line(lines, idx);
7539 let (Some(prev), Some(next)) = (prev, next) else {
7540 return false;
7541 };
7542
7543 is_substantive_markdown_line(prev) && is_substantive_markdown_line(next)
7544}
7545
/// Returns the closest line before `idx` that is not blank after trimming, if any.
///
/// Panics if `idx` is greater than `lines.len()`.
fn previous_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    for line in lines[..idx].iter().rev() {
        if !line.trim().is_empty() {
            return Some(*line);
        }
    }
    None
}
7553
/// Returns the closest line after `idx` that is not blank after trimming, if any.
///
/// Panics if `idx + 1` is greater than `lines.len()`.
fn next_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    for line in &lines[idx + 1..] {
        if !line.trim().is_empty() {
            return Some(*line);
        }
    }
    None
}
7560
/// Returns `true` for a line that carries real content: a table row (`|`), a list item
/// (`- `), a heading (`#`), or any line with at least two words.
fn is_substantive_markdown_line(line: &str) -> bool {
    let content = line.trim();
    if content.is_empty() {
        return false;
    }

    let structural =
        content.starts_with('|') || content.starts_with("- ") || content.starts_with('#');
    structural || content.split_whitespace().nth(1).is_some()
}
7573
/// Applies a fixed list of literal OCR repairs (micro-litre and degree notation), then
/// normalizes degree spacing and collapses all whitespace runs to single spaces.
///
/// Empty input returns an empty string.
fn normalize_common_ocr_text(text: &str) -> String {
    if text.is_empty() {
        return String::new();
    }

    // The substitutions run in the order listed; each one sees the output of the previous.
    // NOTE(review): `" oC"` is replaced before `"-20 oC"`, so the latter can no longer match.
    // NOTE(review): the last two entries look identical in this view; confirm whether they
    // differ in whitespace in the real file.
    let mut normalized = text
        .replace("ߤL", "μL")
        .replace(" oC", "°C")
        .replace("37 C", "37°C")
        .replace("-20 oC", "-20°C")
        .replace("1- 20-μL", "1-20-μL")
        .replace("1- 20 μL", "1-20 μL")
        .replace("1- 2 0 μL", "1-20 μL")
        .replace("1- 2 0 μL", "1-20 μL");

    normalized = normalize_degree_spacing(&normalized);
    collapse_inline_whitespace(&normalized)
}
7592
/// Rewrites "<digit> C" / "<digit> F" as "<digit>°C" / "<digit>°F".
///
/// The space is replaced only when it directly follows an ASCII digit, is followed by `C`
/// or `F`, and the character after that letter exists and is not an ASCII letter. A
/// trailing "25 C" at the very end of the text is therefore left unchanged.
fn normalize_degree_spacing(text: &str) -> String {
    let chars: Vec<char> = text.chars().collect();
    let mut result = String::with_capacity(text.len());
    let mut pos = 0usize;

    while let Some(&current) = chars.get(pos) {
        let is_degree_gap = current == ' '
            && pos > 0
            && chars[pos - 1].is_ascii_digit()
            && matches!(chars.get(pos + 1), Some('C' | 'F'))
            && chars
                .get(pos + 2)
                .is_some_and(|after| !after.is_ascii_alphabetic());

        if is_degree_gap {
            result.push('°');
            result.push(chars[pos + 1]);
            // Skip the space and the unit letter.
            pos += 2;
        } else {
            result.push(current);
            pos += 1;
        }
    }
    result
}
7616
7617fn normalize_list_text(text: &str) -> String {
7618 let normalized = normalize_common_ocr_text(text);
7619 let trimmed = normalized
7620 .trim_start_matches(|ch: char| is_bullet_like(ch))
7621 .trim();
7622 trimmed.to_string()
7623}
7624
7625fn push_rendered_list_item(out: &mut String, item: &str) {
7626 if starts_with_enumerated_marker(item) {
7627 out.push_str(item);
7628 out.push('\n');
7629 } else {
7630 out.push_str(&format!("- {}\n", item));
7631 }
7632}
7633
7634fn should_merge_list_continuation(previous: &str, current: &str) -> bool {
7635 let trimmed = current.trim();
7636 if trimmed.is_empty()
7637 || looks_like_stray_list_page_number(trimmed)
7638 || is_list_section_heading(trimmed)
7639 || looks_like_numbered_section(trimmed)
7640 || starts_with_enumerated_marker(trimmed)
7641 {
7642 return false;
7643 }
7644
7645 if previous.ends_with('-')
7646 && previous
7647 .chars()
7648 .rev()
7649 .nth(1)
7650 .is_some_and(|c| c.is_alphabetic())
7651 && trimmed.chars().next().is_some_and(char::is_lowercase)
7652 {
7653 return true;
7654 }
7655
7656 trimmed
7657 .chars()
7658 .next()
7659 .is_some_and(|ch| ch.is_ascii_lowercase() || matches!(ch, ',' | ';' | ')' | ']' | '%'))
7660}
7661
7662fn is_pure_bullet_marker(text: &str) -> bool {
7663 let trimmed = text.trim();
7664 !trimmed.is_empty() && trimmed.chars().all(is_bullet_like)
7665}
7666
/// Returns `true` when the trimmed text is one to four ASCII digits and nothing else.
fn looks_like_stray_list_page_number(text: &str) -> bool {
    let digits = text.trim();
    !digits.is_empty() && digits.len() <= 4 && digits.bytes().all(|byte| byte.is_ascii_digit())
}
7671
/// Returns `true` for characters treated as list bullet glyphs, including the ASCII
/// hyphen.
fn is_bullet_like(ch: char) -> bool {
    const BULLET_GLYPHS: [char; 14] = [
        '•', '◦', '▪', '▸', '▹', '►', '▻', '●', '○', '■', '□', '◆', '◇', '-',
    ];
    BULLET_GLYPHS.contains(&ch)
}
7690
/// Decides whether the block after a caption supports treating the caption as standalone.
///
/// A following block that starts with `source:` / `note:` (optionally behind a `*`)
/// always qualifies. Otherwise the caption must have at most 14 words, and the following
/// block at most 45 words with a `:` or `=` in it. A blank following block never
/// qualifies.
fn looks_like_isolated_caption_context(caption: &str, next_block: &str) -> bool {
    let following = next_block.trim();
    if following.is_empty() {
        return false;
    }

    let lowered = following.to_ascii_lowercase();
    let is_annotation = ["source:", "note:", "*source:", "*note:"]
        .iter()
        .any(|marker| lowered.starts_with(*marker));
    if is_annotation {
        return true;
    }

    let has_definition_marker = following.contains(':') || following.contains('=');
    caption.split_whitespace().count() <= 14
        && following.split_whitespace().count() <= 45
        && has_definition_marker
}
7710
7711fn looks_like_numeric_noise_block(block: &str) -> bool {
7712 let trimmed = block.trim();
7713 !trimmed.is_empty()
7714 && trimmed.split_whitespace().all(|token| {
7715 sanitize_numberish_token(token)
7716 .as_deref()
7717 .is_some_and(|sanitized| sanitized.chars().all(|ch| ch.is_ascii_digit()))
7718 })
7719}
7720
/// Returns `true` when the label's first character is an ASCII digit.
fn looks_like_yearish_label(label: &str) -> bool {
    matches!(label.chars().next(), Some(first) if first.is_ascii_digit())
}
7724
/// Returns `true` when the token is exactly four ASCII digits.
fn looks_like_year_token(token: &str) -> bool {
    token.len() == 4 && token.bytes().all(|byte| byte.is_ascii_digit())
}
7728
/// Returns `true` when the token contains at least one ASCII letter and otherwise only
/// ASCII letters, digits, `-`, `/` or `%`.
fn looks_like_category_label(token: &str) -> bool {
    let mut has_letter = false;
    for ch in token.chars() {
        if ch.is_ascii_alphabetic() {
            has_letter = true;
        } else if !(ch.is_ascii_digit() || matches!(ch, '-' | '/' | '%')) {
            return false;
        }
    }
    has_letter
}
7735
7736fn is_numberish_token(token: &str) -> bool {
7737 sanitize_numberish_token(token).is_some()
7738}
7739
/// Normalises a token that should represent an integer-like number.
///
/// Leading and trailing `,`, `;`, `:` and `.` are stripped. The remainder is
/// accepted when, ignoring any trailing `%` signs and thousands-separator
/// commas, it consists of one or more ASCII digits. On success the stripped
/// token is returned with its interior commas and trailing `%` intact;
/// otherwise `None`.
///
/// A token with no digits at all (for example a bare `"%"`) is rejected.
/// Previously it slipped through because `all()` on an empty character
/// sequence is vacuously true.
fn sanitize_numberish_token(token: &str) -> Option<String> {
    let core = token.trim_matches(|c: char| matches!(c, ',' | ';' | ':' | '.'));
    let digits = core.trim_end_matches('%');

    let has_digit = digits.chars().any(|ch| ch.is_ascii_digit());
    let only_digits_and_commas = digits.chars().all(|ch| ch.is_ascii_digit() || ch == ',');

    // `core` already has no trailing `,`/`;`/`:`, so it is returned unchanged.
    (has_digit && only_digits_and_commas).then(|| core.to_string())
}
7753
7754fn parse_integer_token(token: &str) -> Option<i64> {
7755 sanitize_numberish_token(token)?
7756 .replace(',', "")
7757 .parse::<i64>()
7758 .ok()
7759}
7760
/// Returns `true` when the first character of `text`, after skipping leading
/// whitespace and any run of `"`, `'`, `(` or `[`, is an uppercase letter.
fn starts_with_uppercase_word(text: &str) -> bool {
    text.trim_start()
        .chars()
        // Skip opening quotes/brackets; whatever follows decides the answer.
        .find(|&ch| !matches!(ch, '"' | '\'' | '(' | '['))
        .is_some_and(|ch| ch.is_alphabetic() && ch.is_uppercase())
}
7772
7773fn clean_paragraph_text(text: &str) -> String {
7776 let trimmed = text.trim();
7777 if trimmed.is_empty() {
7778 return String::new();
7779 }
7780 let mut result = String::with_capacity(trimmed.len());
7782 let mut prev_space = false;
7783 for ch in trimmed.chars() {
7784 if ch == ' ' || ch == '\t' {
7785 if !prev_space {
7786 result.push(' ');
7787 prev_space = true;
7788 }
7789 } else {
7790 result.push(ch);
7791 prev_space = false;
7792 }
7793 }
7794 normalize_common_ocr_text(&result)
7795}
7796
7797fn next_mergeable_paragraph_text(element: Option<&ContentElement>) -> Option<String> {
7798 match element {
7799 Some(ContentElement::Paragraph(p)) => {
7800 let text = clean_paragraph_text(&p.base.value());
7801 let trimmed = text.trim();
7802 if trimmed.is_empty()
7803 || should_render_element_as_heading(element.unwrap(), trimmed, None)
7804 {
7805 None
7806 } else {
7807 Some(trimmed.to_string())
7808 }
7809 }
7810 Some(ContentElement::TextBlock(tb)) => {
7811 let text = clean_paragraph_text(&tb.value());
7812 let trimmed = text.trim();
7813 if trimmed.is_empty()
7814 || should_render_element_as_heading(element.unwrap(), trimmed, None)
7815 {
7816 None
7817 } else {
7818 Some(trimmed.to_string())
7819 }
7820 }
7821 Some(ContentElement::TextLine(tl)) => {
7822 let text = clean_paragraph_text(&tl.value());
7823 let trimmed = text.trim();
7824 if trimmed.is_empty()
7825 || should_render_element_as_heading(element.unwrap(), trimmed, None)
7826 {
7827 None
7828 } else {
7829 Some(trimmed.to_string())
7830 }
7831 }
7832 _ => None,
7833 }
7834}
7835
7836fn should_render_paragraph_as_heading(
7837 doc: &PdfDocument,
7838 idx: usize,
7839 text: &str,
7840 next: Option<&ContentElement>,
7841) -> bool {
7842 if looks_like_top_margin_running_header(doc, idx, text) {
7843 return false;
7844 }
7845 if looks_like_hyphenated_table_title_continuation(doc, idx, text, next) {
7846 return true;
7847 }
7848 if should_render_element_as_heading(&doc.kids[idx], text, next) {
7849 return true;
7850 }
7851
7852 let body_font_size = compute_body_font_size(doc);
7855 if is_too_small_for_heading(&doc.kids, idx, body_font_size) {
7856 return false;
7857 }
7858
7859 if !doc_has_explicit_headings(doc) {
7861 if should_rescue_as_heading(doc, idx, text) {
7862 return true;
7863 }
7864 if should_rescue_allcaps_heading(doc, idx, text) {
7868 return true;
7869 }
7870 if should_rescue_numbered_heading(doc, idx, text) {
7871 return true;
7872 }
7873 return false;
7874 }
7875 if heading_density(doc) < 0.10 {
7878 if should_rescue_allcaps_heading(doc, idx, text) {
7879 return true;
7880 }
7881 if should_rescue_numbered_heading(doc, idx, text) {
7885 return true;
7886 }
7887 if body_font_size > 0.0 {
7892 if let ContentElement::Paragraph(p) = &doc.kids[idx] {
7893 if let Some(fs) = p.base.font_size {
7894 if fs >= 1.15 * body_font_size
7895 && is_heading_rescue_candidate(doc, idx, text)
7896 && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
7897 {
7898 return true;
7899 }
7900 }
7901 }
7902 }
7903 }
7904 false
7905}
7906
7907fn doc_has_explicit_headings(doc: &PdfDocument) -> bool {
7909 doc.kids.iter().any(|e| {
7910 matches!(
7911 e,
7912 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
7913 )
7914 })
7915}
7916
7917fn compute_body_font_size(doc: &PdfDocument) -> f64 {
7922 let mut font_sizes: Vec<f64> = doc
7923 .kids
7924 .iter()
7925 .filter_map(|e| {
7926 if let ContentElement::Paragraph(p) = e {
7927 let word_count = p.base.value().split_whitespace().count();
7928 if word_count > 10 {
7929 p.base.font_size
7930 } else {
7931 None
7932 }
7933 } else {
7934 None
7935 }
7936 })
7937 .collect();
7938 if font_sizes.is_empty() {
7939 return 0.0;
7940 }
7941 font_sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
7942 font_sizes[font_sizes.len() / 2]
7943}
7944
7945fn is_too_small_for_heading(doc_kids: &[ContentElement], idx: usize, body_font_size: f64) -> bool {
7950 if body_font_size <= 0.0 {
7951 return false;
7952 }
7953 if let ContentElement::Paragraph(p) = &doc_kids[idx] {
7954 if let Some(fs) = p.base.font_size {
7955 return fs < 0.95 * body_font_size;
7956 }
7957 }
7958 false
7959}
7960
7961fn heading_density(doc: &PdfDocument) -> f64 {
7963 let total = doc.kids.len();
7964 if total == 0 {
7965 return 0.0;
7966 }
7967 let heading_count = doc
7968 .kids
7969 .iter()
7970 .filter(|e| {
7971 matches!(
7972 e,
7973 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
7974 )
7975 })
7976 .count();
7977 heading_count as f64 / total as f64
7978}
7979
7980fn should_rescue_as_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
7983 is_heading_rescue_candidate(doc, idx, text)
7984 && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
7985}
7986
7987fn is_heading_rescue_candidate(doc: &PdfDocument, idx: usize, text: &str) -> bool {
7991 let trimmed = text.trim();
7992 if trimmed.is_empty() {
7993 return false;
7994 }
7995
7996 let has_alpha = trimmed.chars().any(char::is_alphabetic);
7997
7998 if !has_alpha || trimmed.ends_with(['.', '!', '?', ';', ',']) {
8000 return false;
8001 }
8002
8003 if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8005 return false;
8006 }
8007
8008 if trimmed.starts_with('(') && trimmed.ends_with(')') {
8010 return false;
8011 }
8012
8013 if starts_with_caption_prefix(trimmed)
8015 || looks_like_chart_label_heading(&doc.kids[idx], trimmed)
8016 {
8017 return false;
8018 }
8019
8020 let word_count = trimmed.split_whitespace().count();
8022 if word_count > 6 || trimmed.len() > 60 {
8023 return false;
8024 }
8025
8026 if trimmed
8028 .chars()
8029 .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
8030 {
8031 return false;
8032 }
8033
8034 if let Some(first_alpha) = trimmed.chars().find(|c| c.is_alphabetic()) {
8036 if first_alpha.is_lowercase() {
8037 return false;
8038 }
8039 }
8040
8041 true
8042}
8043
8044fn has_substantive_follow_up(
8048 doc: &PdfDocument,
8049 idx: usize,
8050 word_count: usize,
8051 max_lookahead: usize,
8052) -> bool {
8053 for offset in 1..=max_lookahead {
8054 let lookahead_idx = idx + offset;
8055 if lookahead_idx >= doc.kids.len() {
8056 break;
8057 }
8058 let look_elem = &doc.kids[lookahead_idx];
8059 match look_elem {
8060 ContentElement::Paragraph(p) => {
8061 let next_text = p.base.value();
8062 let nw = next_text.split_whitespace().count();
8063 if nw >= word_count * 3 || nw > 15 {
8064 return true;
8065 }
8066 }
8067 ContentElement::TextBlock(tb) => {
8068 let next_text = tb.value();
8069 let nw = next_text.split_whitespace().count();
8070 if nw >= word_count * 3 || nw > 15 {
8071 return true;
8072 }
8073 }
8074 ContentElement::TextLine(tl) => {
8075 let next_text = tl.value();
8076 let nw = next_text.split_whitespace().count();
8077 if nw >= word_count * 3 || nw > 15 {
8078 return true;
8079 }
8080 }
8081 ContentElement::List(_)
8082 | ContentElement::Table(_)
8083 | ContentElement::TableBorder(_)
8084 | ContentElement::Image(_)
8085 | ContentElement::Figure(_) => {
8086 return true;
8087 }
8088 _ => continue,
8089 }
8090 }
8091
8092 false
8093}
8094
8095fn should_rescue_numbered_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8098 let trimmed = text.trim();
8099 if trimmed.is_empty() || trimmed.len() > 100 {
8100 return false;
8101 }
8102
8103 if !looks_like_numbered_section(trimmed) {
8106 return false;
8107 }
8108
8109 if trimmed.ends_with(['!', '?', ';', ',']) {
8113 return false;
8114 }
8115 if trimmed.ends_with('.') && !looks_like_keyword_numbered_section(trimmed) {
8116 return false;
8117 }
8118 if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8120 return false;
8121 }
8122
8123 for offset in 1..=3 {
8125 let lookahead_idx = idx + offset;
8126 if lookahead_idx >= doc.kids.len() {
8127 break;
8128 }
8129 match &doc.kids[lookahead_idx] {
8130 ContentElement::Paragraph(p) => {
8131 let nw = p.base.value().split_whitespace().count();
8132 if nw > 10 {
8133 return true;
8134 }
8135 }
8136 ContentElement::TextBlock(tb) => {
8137 let nw = tb.value().split_whitespace().count();
8138 if nw > 10 {
8139 return true;
8140 }
8141 }
8142 ContentElement::TextLine(tl) => {
8143 let nw = tl.value().split_whitespace().count();
8144 if nw > 10 {
8145 return true;
8146 }
8147 }
8148 ContentElement::List(_)
8149 | ContentElement::Table(_)
8150 | ContentElement::TableBorder(_)
8151 | ContentElement::Image(_)
8152 | ContentElement::Figure(_) => {
8153 return true;
8154 }
8155 _ => continue,
8156 }
8157 }
8158
8159 false
8160}
8161
8162fn looks_like_numbered_section(text: &str) -> bool {
8165 let bytes = text.as_bytes();
8166 if bytes.is_empty() {
8167 return false;
8168 }
8169
8170 let mut idx = 0;
8172 if bytes[0].is_ascii_digit() {
8173 while idx < bytes.len() && bytes[idx].is_ascii_digit() {
8174 idx += 1;
8175 }
8176 if idx >= bytes.len() {
8177 return false;
8178 }
8179 while idx < bytes.len() && bytes[idx] == b'.' {
8181 idx += 1;
8182 let start = idx;
8183 while idx < bytes.len() && bytes[idx].is_ascii_digit() {
8184 idx += 1;
8185 }
8186 if idx == start {
8187 break;
8189 }
8190 }
8191 if idx >= bytes.len() {
8193 return false;
8194 }
8195 if bytes[idx] == b' ' || bytes[idx] == b'\t' {
8197 idx += 1;
8198 if idx < bytes.len() && bytes[idx] == b'-' {
8200 idx += 1;
8201 if idx < bytes.len() && bytes[idx] == b' ' {
8202 idx += 1;
8203 }
8204 }
8205 } else if bytes[idx] == b'-' {
8206 idx += 1;
8207 if idx < bytes.len() && bytes[idx] == b' ' {
8208 idx += 1;
8209 }
8210 } else {
8211 return false;
8212 }
8213 let rest = &text[idx..].trim();
8215 if rest.is_empty() {
8216 return false;
8217 }
8218 if let Some(c) = rest.chars().find(|c| c.is_alphabetic()) {
8220 return c.is_uppercase();
8221 }
8222 return false;
8223 }
8224
8225 if looks_like_keyword_numbered_section(text) {
8227 return true;
8228 }
8229
8230 false
8231}
8232
/// Lower-case keywords that may introduce a keyword-numbered section title;
/// matched case-insensitively by [`looks_like_keyword_numbered_section`].
const SECTION_KEYWORDS: &[&str] = &[
    "activity", "appendix", "case", "chapter", "exercise", "experiment", "lab", "lesson",
    "module", "part", "phase", "problem", "question", "section", "stage", "step", "task",
    "topic", "unit",
];

/// Returns `true` when the trimmed `text` is `<keyword> <number…>`.
///
/// The word before the first space must be one of [`SECTION_KEYWORDS`]
/// (ASCII case-insensitive). What follows, after leading whitespace and an
/// optional `#`, must start with an ASCII digit or one of `I`, `V`, `X`, `L`.
fn looks_like_keyword_numbered_section(text: &str) -> bool {
    let Some((keyword, remainder)) = text.trim().split_once(' ') else {
        return false;
    };

    let is_known_keyword = SECTION_KEYWORDS
        .iter()
        .any(|candidate| candidate.eq_ignore_ascii_case(keyword));
    if !is_known_keyword {
        return false;
    }

    let remainder = remainder.trim_start();
    let number_part = remainder.strip_prefix('#').unwrap_or(remainder);
    number_part
        .chars()
        .next()
        .is_some_and(|lead| lead.is_ascii_digit() || matches!(lead, 'I' | 'V' | 'X' | 'L'))
}
8284
8285fn should_rescue_allcaps_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8288 let trimmed = text.trim();
8289 if trimmed.is_empty() {
8290 return false;
8291 }
8292
8293 let word_count = trimmed.split_whitespace().count();
8294
8295 if word_count > 8 || trimmed.len() > 80 {
8297 return false;
8298 }
8299
8300 let alpha_chars: Vec<char> = trimmed.chars().filter(|c| c.is_alphabetic()).collect();
8302 if alpha_chars.len() < 2 || !alpha_chars.iter().all(|c| c.is_uppercase()) {
8303 return false;
8304 }
8305
8306 if trimmed.ends_with(['.', ';', ',']) {
8308 return false;
8309 }
8310
8311 if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8313 return false;
8314 }
8315
8316 if starts_with_caption_prefix(trimmed) {
8318 return false;
8319 }
8320
8321 if trimmed
8323 .chars()
8324 .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
8325 {
8326 return false;
8327 }
8328
8329 for offset in 1..=4 {
8332 let lookahead_idx = idx + offset;
8333 if lookahead_idx >= doc.kids.len() {
8334 break;
8335 }
8336 let look_elem = &doc.kids[lookahead_idx];
8337 match look_elem {
8338 ContentElement::Paragraph(p) => {
8339 let nw = p.base.value().split_whitespace().count();
8340 if nw > 6 {
8341 return true;
8342 }
8343 }
8344 ContentElement::TextBlock(tb) => {
8345 let nw = tb.value().split_whitespace().count();
8346 if nw > 6 {
8347 return true;
8348 }
8349 }
8350 ContentElement::TextLine(tl) => {
8351 let nw = tl.value().split_whitespace().count();
8352 if nw > 6 {
8353 return true;
8354 }
8355 }
8356 ContentElement::List(_)
8357 | ContentElement::Table(_)
8358 | ContentElement::TableBorder(_)
8359 | ContentElement::Image(_)
8360 | ContentElement::Figure(_) => {
8361 return true;
8362 }
8363 _ => continue,
8364 }
8365 }
8366
8367 false
8368}
8369
8370fn should_render_element_as_heading(
8371 element: &ContentElement,
8372 text: &str,
8373 next: Option<&ContentElement>,
8374) -> bool {
8375 let trimmed = text.trim();
8376 if trimmed.is_empty() {
8377 return false;
8378 }
8379
8380 let lower = trimmed.to_ascii_lowercase();
8381 if matches!(lower.as_str(), "contents" | "table of contents")
8382 && trimmed.starts_with(|c: char| c.is_uppercase())
8383 {
8384 return true;
8385 }
8386
8387 let word_count = trimmed.split_whitespace().count();
8388 let has_alpha = trimmed.chars().any(char::is_alphabetic);
8389 let title_like = has_alpha
8390 && word_count <= 4
8391 && trimmed.len() <= 40
8392 && !trimmed.ends_with(['.', '!', '?', ';', ':']);
8393
8394 let is_attribution = {
8398 let lower = trimmed.to_ascii_lowercase();
8399 lower.starts_with("source:")
8400 || lower.starts_with("credit:")
8401 || lower.starts_with("photo by ")
8402 || lower.starts_with("photo credit")
8403 || lower.starts_with("image by ")
8404 || lower.starts_with("image credit")
8405 };
8406
8407 title_like
8408 && matches!(next, Some(ContentElement::List(_)))
8409 && !looks_like_chart_label_heading(element, trimmed)
8410 && !is_attribution
8411}
8412
8413fn looks_like_hyphenated_table_title_continuation(
8414 doc: &PdfDocument,
8415 idx: usize,
8416 text: &str,
8417 next: Option<&ContentElement>,
8418) -> bool {
8419 if !matches!(
8420 next,
8421 Some(ContentElement::Table(_)) | Some(ContentElement::TableBorder(_))
8422 ) {
8423 return false;
8424 }
8425
8426 let trimmed = text.trim();
8427 if trimmed.is_empty()
8428 || starts_with_caption_prefix(trimmed)
8429 || looks_like_numbered_section(trimmed)
8430 || looks_like_keyword_numbered_section(trimmed)
8431 || !trimmed.ends_with(':')
8432 {
8433 return false;
8434 }
8435
8436 let word_count = trimmed.split_whitespace().count();
8437 if !(3..=5).contains(&word_count) || trimmed.len() > 60 {
8438 return false;
8439 }
8440
8441 let Some(first_alpha) = trimmed.chars().find(|ch| ch.is_alphabetic()) else {
8442 return false;
8443 };
8444 if first_alpha.is_lowercase() {
8445 return false;
8446 }
8447
8448 let Some(prev_idx) = idx.checked_sub(1) else {
8449 return false;
8450 };
8451 let prev_text = extract_element_text(&doc.kids[prev_idx]);
8452 let prev_trimmed = prev_text.trim();
8453 !prev_trimmed.is_empty() && prev_trimmed.ends_with('-')
8454}
8455
8456fn looks_like_table_header_duplicate_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8457 let trimmed = text.trim();
8458 if trimmed.is_empty()
8459 || starts_with_caption_prefix(trimmed)
8460 || looks_like_numbered_section(trimmed)
8461 || looks_like_keyword_numbered_section(trimmed)
8462 {
8463 return false;
8464 }
8465
8466 let word_count = trimmed.split_whitespace().count();
8467 if !(3..=10).contains(&word_count) || trimmed.len() > 96 {
8468 return false;
8469 }
8470
8471 let Some(prev_idx) = idx.checked_sub(1) else {
8472 return false;
8473 };
8474 let Some(previous_table) = table_border_from_element(&doc.kids[prev_idx]) else {
8475 return false;
8476 };
8477 if previous_table.num_columns < 3 || previous_table.rows.len() < 3 {
8478 return false;
8479 }
8480
8481 let mut rendered_rows = collect_table_border_rows(previous_table);
8482 if rendered_rows.is_empty() {
8483 return false;
8484 }
8485 merge_continuation_rows(&mut rendered_rows);
8486 trim_leading_table_carryover_rows(&mut rendered_rows);
8487
8488 let Some(header_row) = rendered_rows.first() else {
8489 return false;
8490 };
8491 let header_text = header_row
8492 .iter()
8493 .map(|cell| cell.trim())
8494 .filter(|cell| !cell.is_empty())
8495 .collect::<Vec<_>>()
8496 .join(" ");
8497 if !equivalent_heading_text(trimmed, &header_text) {
8498 return false;
8499 }
8500
8501 let page_number = doc.kids[idx].page_number();
8502 let mut short_fragments = 0usize;
8503 let mut numeric_fragments = 0usize;
8504
8505 for candidate in doc.kids.iter().skip(idx + 1) {
8506 if candidate.page_number() != page_number {
8507 break;
8508 }
8509 if matches!(
8510 candidate,
8511 ContentElement::Table(_) | ContentElement::TableBorder(_)
8512 ) {
8513 break;
8514 }
8515
8516 let fragment = extract_element_text(candidate);
8517 let fragment_trimmed = fragment.trim();
8518 if fragment_trimmed.is_empty()
8519 || looks_like_margin_page_number(doc, candidate, fragment_trimmed)
8520 {
8521 continue;
8522 }
8523
8524 let fragment_words = fragment_trimmed.split_whitespace().count();
8525 if fragment_words > 6 {
8526 return false;
8527 }
8528
8529 short_fragments += 1;
8530 if fragment_trimmed.chars().any(|ch| ch.is_ascii_digit()) {
8531 numeric_fragments += 1;
8532 }
8533
8534 if short_fragments >= 3 {
8535 break;
8536 }
8537 }
8538
8539 short_fragments >= 2 && numeric_fragments >= 1
8540}
8541
8542fn looks_like_top_margin_running_header(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8543 let trimmed = text.trim();
8544 if trimmed.is_empty() || trimmed.split_whitespace().count() > 6 {
8545 return false;
8546 }
8547
8548 let element = &doc.kids[idx];
8549 let bbox = element.bbox();
8550 if bbox.height() > 24.0 {
8551 return false;
8552 }
8553
8554 let Some(page) = element.page_number() else {
8555 return false;
8556 };
8557
8558 let mut page_tops = std::collections::HashMap::<u32, f64>::new();
8560 for candidate in &doc.kids {
8561 if let Some(p) = candidate.page_number() {
8562 let top = page_tops.entry(p).or_insert(f64::MIN);
8563 *top = top.max(candidate.bbox().top_y);
8564 }
8565 }
8566
8567 let page_top = page_tops.get(&page).copied().unwrap_or(0.0);
8568 if bbox.top_y < page_top - 24.0 {
8569 return false;
8570 }
8571
8572 let trimmed_lower = trimmed.to_lowercase();
8576 for other_elem in &doc.kids {
8577 let Some(other_page) = other_elem.page_number() else {
8578 continue;
8579 };
8580 if other_page == page {
8581 continue;
8582 }
8583 let other_bbox = other_elem.bbox();
8584 if other_bbox.height() > 24.0 {
8585 continue;
8586 }
8587 let other_top = page_tops.get(&other_page).copied().unwrap_or(0.0);
8588 if other_bbox.top_y < other_top - 24.0 {
8589 continue;
8590 }
8591 let other_text = match other_elem {
8592 ContentElement::Paragraph(p) => p.base.value(),
8593 ContentElement::TextBlock(tb) => tb.value(),
8594 ContentElement::TextLine(tl) => tl.value(),
8595 ContentElement::Heading(h) => h.base.base.value(),
8596 _ => continue,
8597 };
8598 if other_text.trim().to_lowercase() == trimmed_lower {
8599 return true;
8600 }
8601 }
8602
8603 false
8604}
8605
8606fn looks_like_chart_label_heading(element: &ContentElement, text: &str) -> bool {
8607 let trimmed = text.trim();
8608 let upper_words = trimmed
8609 .split_whitespace()
8610 .filter(|word| word.chars().any(char::is_alphabetic))
8611 .all(|word| {
8612 word.chars()
8613 .filter(|ch| ch.is_alphabetic())
8614 .all(|ch| ch.is_uppercase())
8615 });
8616
8617 (trimmed.contains('%') || upper_words) && element.bbox().height() <= 40.0
8618}
8619
8620fn should_demote_heading_to_paragraph(text: &str, next: &str) -> bool {
8621 let next_trimmed = next.trim();
8622 if !next_trimmed.chars().next().is_some_and(char::is_lowercase) {
8623 return false;
8624 }
8625
8626 let normalized = normalize_heading_text(text);
8627 if matches!(
8628 normalized.as_str(),
8629 "contents" | "tableofcontents" | "introduction" | "conclusion"
8630 ) {
8631 return false;
8632 }
8633
8634 let words: Vec<&str> = text.split_whitespace().collect();
8635 if words.len() < 3 {
8636 return false;
8637 }
8638
8639 words
8640 .last()
8641 .is_some_and(|word| is_sentence_fragment_tail(word))
8642}
8643
/// Returns `true` when `word`, stripped of surrounding non-alphanumeric
/// characters and ASCII-lowercased, is one of a fixed set of articles,
/// conjunctions and prepositions.
fn is_sentence_fragment_tail(word: &str) -> bool {
    const TAIL_WORDS: [&str; 17] = [
        "a", "an", "and", "as", "at", "by", "for", "from", "in", "into", "of", "on", "or",
        "that", "the", "to", "with",
    ];

    let core = word
        .trim_matches(|c: char| !c.is_alphanumeric())
        .to_ascii_lowercase();
    TAIL_WORDS.contains(&core.as_str())
}
8667
/// Returns `true` when the trimmed `text` looks like a short label
/// introducing a list: it ends with `:`, is at most 80 bytes and 8 words,
/// contains a letter, and does not start with an ASCII digit or a
/// bullet/dash character.
fn is_list_section_heading(text: &str) -> bool {
    const DISALLOWED_LEADS: &str = "•‣◦●○◆◇▪▫–—-";

    let heading = text.trim();
    if !heading.ends_with(':') || heading.len() > 80 {
        return false;
    }
    if heading.split_whitespace().count() > 8 {
        return false;
    }
    if !heading.chars().any(char::is_alphabetic) {
        return false;
    }
    heading
        .chars()
        .next()
        .map_or(true, |first| {
            !first.is_ascii_digit() && !DISALLOWED_LEADS.contains(first)
        })
}
8677
8678fn should_merge_paragraph_text(prev: &str, next: &str) -> bool {
8679 let next_trimmed = next.trim();
8680 if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
8681 return false;
8682 }
8683
8684 if starts_with_enumerated_marker(next_trimmed) {
8685 return false;
8686 }
8687
8688 if prev.ends_with('-')
8689 && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
8690 && next_trimmed.chars().next().is_some_and(char::is_lowercase)
8691 {
8692 return true;
8693 }
8694
8695 if next_trimmed.chars().next().is_some_and(char::is_lowercase) {
8696 return true;
8697 }
8698
8699 let lower = next_trimmed.to_ascii_lowercase();
8700 if lower.starts_with("http://")
8701 || lower.starts_with("https://")
8702 || lower.starts_with("arxiv")
8703 || lower.starts_with("doi:")
8704 {
8705 return true;
8706 }
8707
8708 if matches!(
8709 next_trimmed.split_whitespace().next(),
8710 Some("In" | "Proceedings" | "Advances" | "Learning")
8711 ) {
8712 return true;
8713 }
8714
8715 !prev.ends_with(['.', '!', '?', ':'])
8716}
8717
8718fn should_merge_adjacent_semantic_paragraphs(prev: &str, next: &str) -> bool {
8719 let next_trimmed = next.trim();
8720 if next_trimmed.is_empty() {
8721 return false;
8722 }
8723
8724 if starts_with_enumerated_marker(next_trimmed) {
8725 return false;
8726 }
8727
8728 if prev.ends_with('-')
8729 && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
8730 && next_trimmed.chars().next().is_some_and(char::is_lowercase)
8731 {
8732 return true;
8733 }
8734
8735 next_trimmed.chars().next().is_some_and(char::is_lowercase)
8736}
8737
/// Returns `true` when the first whitespace-separated token of `text` is an
/// enumeration marker.
///
/// Leading `(` / `[` are dropped; the token must then end with `.`, `)` or
/// `:`. With those closers removed, the marker must be non-empty and be all
/// ASCII digits, a single ASCII letter, or at most eight characters drawn
/// from `ivxlcdm` (ASCII case-insensitive).
fn starts_with_enumerated_marker(text: &str) -> bool {
    const CLOSERS: [char; 3] = ['.', ')', ':'];

    let Some(token) = text.split_whitespace().next() else {
        return false;
    };
    let token = token.trim_start_matches(['(', '[']);
    if !token.ends_with(CLOSERS) {
        return false;
    }

    let marker = token.trim_end_matches(CLOSERS);
    if marker.is_empty() {
        return false;
    }

    let is_number = marker.bytes().all(|b| b.is_ascii_digit());
    let is_single_letter = marker.len() == 1 && marker.bytes().all(|b| b.is_ascii_alphabetic());
    let is_roman_like = marker.len() <= 8
        && marker
            .chars()
            .all(|c| "ivxlcdm".contains(c.to_ascii_lowercase()));

    is_number || is_single_letter || is_roman_like
}
8763
8764fn should_skip_leading_figure_carryover(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8765 let trimmed = text.trim();
8766 if !trimmed.starts_with("Figure ") || trimmed.split_whitespace().count() < 4 {
8767 return false;
8768 }
8769
8770 let element = &doc.kids[idx];
8771 let Some(page) = element.page_number() else {
8772 return false;
8773 };
8774
8775 let mut page_top = f64::MIN;
8776 for candidate in &doc.kids {
8777 if candidate.page_number() == Some(page)
8778 && matches!(
8779 candidate,
8780 ContentElement::Paragraph(_)
8781 | ContentElement::TextBlock(_)
8782 | ContentElement::TextLine(_)
8783 | ContentElement::Heading(_)
8784 | ContentElement::NumberHeading(_)
8785 | ContentElement::Caption(_)
8786 )
8787 {
8788 page_top = page_top.max(candidate.bbox().top_y);
8789 }
8790 }
8791 if !page_top.is_finite() || element.bbox().top_y < page_top - 72.0 {
8792 return false;
8793 }
8794
8795 for prior_idx in 0..idx {
8796 let prior = &doc.kids[prior_idx];
8797 let prior_text = extract_element_text(prior);
8798 let prior_trimmed = prior_text.trim();
8799 if prior_trimmed.is_empty()
8800 || is_standalone_page_number(prior_trimmed)
8801 || looks_like_footer_banner(prior_trimmed)
8802 {
8803 continue;
8804 }
8805 match prior {
8806 ContentElement::Paragraph(_)
8807 | ContentElement::TextBlock(_)
8808 | ContentElement::TextLine(_) => {
8809 if !starts_with_caption_prefix(prior_trimmed)
8810 && !looks_like_top_margin_running_header(doc, prior_idx, prior_trimmed)
8811 {
8812 return false;
8813 }
8814 }
8815 ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
8816 if !should_skip_heading_text(prior_trimmed) {
8817 return false;
8818 }
8819 }
8820 _ => return false,
8821 }
8822 }
8823
8824 for lookahead_idx in idx + 1..doc.kids.len().min(idx + 8) {
8825 let next = &doc.kids[lookahead_idx];
8826 if next.page_number() != Some(page) {
8827 break;
8828 }
8829 let next_text = extract_element_text(next);
8830 let next_trimmed = next_text.trim();
8831 if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
8832 continue;
8833 }
8834
8835 let is_numbered_heading = match next {
8836 ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
8837 looks_like_numbered_section(next_trimmed)
8838 || looks_like_keyword_numbered_section(next_trimmed)
8839 }
8840 ContentElement::Paragraph(_)
8841 | ContentElement::TextBlock(_)
8842 | ContentElement::TextLine(_) => {
8843 should_render_paragraph_as_heading(
8844 doc,
8845 lookahead_idx,
8846 next_trimmed,
8847 doc.kids.get(lookahead_idx + 1),
8848 ) && (looks_like_numbered_section(next_trimmed)
8849 || looks_like_keyword_numbered_section(next_trimmed))
8850 }
8851 _ => false,
8852 };
8853
8854 if is_numbered_heading {
8855 return true;
8856 }
8857
8858 if !starts_with_caption_prefix(next_trimmed) && next_trimmed.split_whitespace().count() >= 5
8859 {
8860 return false;
8861 }
8862 }
8863
8864 false
8865}
8866
/// Appends the trimmed `next` to `target`.
///
/// When `target` ends with a letter followed by `-` and `next` starts with a
/// lowercase character, the hyphen is removed and the two halves are joined
/// directly. Otherwise a single space is inserted unless `target` already
/// ends with one.
fn merge_paragraph_text(target: &mut String, next: &str) {
    let continuation = next.trim();

    let mut tail = target.chars().rev();
    let joins_hyphenated_word = tail.next() == Some('-')
        && tail.next().is_some_and(char::is_alphabetic)
        && continuation
            .chars()
            .next()
            .is_some_and(char::is_lowercase);

    if joins_hyphenated_word {
        target.pop();
    } else if !target.ends_with(' ') {
        target.push(' ');
    }
    target.push_str(continuation);
}
8886
/// Returns `true` when the trimmed `text` is one to four ASCII digits.
fn is_standalone_page_number(text: &str) -> bool {
    let digits = text.trim();
    matches!(digits.len(), 1..=4) && digits.bytes().all(|byte| byte.is_ascii_digit())
}
8891
8892fn looks_like_margin_page_number(doc: &PdfDocument, element: &ContentElement, text: &str) -> bool {
8893 if !is_standalone_page_number(text) {
8894 return false;
8895 }
8896
8897 let bbox = element.bbox();
8898 if bbox.height() > 24.0 {
8899 return false;
8900 }
8901
8902 let Some(page) = element.page_number() else {
8903 return false;
8904 };
8905
8906 let mut page_top = f64::MIN;
8907 let mut page_bottom = f64::MAX;
8908 for candidate in &doc.kids {
8909 if candidate.page_number() == Some(page) {
8910 let candidate_bbox = candidate.bbox();
8911 page_top = page_top.max(candidate_bbox.top_y);
8912 page_bottom = page_bottom.min(candidate_bbox.bottom_y);
8913 }
8914 }
8915
8916 if !page_top.is_finite() || !page_bottom.is_finite() {
8917 return false;
8918 }
8919
8920 bbox.top_y >= page_top - 24.0 || bbox.bottom_y <= page_bottom + 24.0
8921}
8922
8923fn looks_like_bottom_margin_heading(doc: &PdfDocument, idx: usize) -> bool {
8928 let element = &doc.kids[idx];
8929 let bbox = element.bbox();
8930 if bbox.height() > 30.0 {
8931 return false;
8932 }
8933
8934 let Some(page) = element.page_number() else {
8935 return false;
8936 };
8937
8938 let mut page_bottom = f64::MAX;
8939 for candidate in &doc.kids {
8940 if candidate.page_number() == Some(page) {
8941 page_bottom = page_bottom.min(candidate.bbox().bottom_y);
8942 }
8943 }
8944
8945 if !page_bottom.is_finite() {
8946 return false;
8947 }
8948
8949 bbox.bottom_y <= page_bottom + 24.0
8951}
8952
8953fn should_demote_period_heading(text: &str) -> bool {
8957 let trimmed = text.trim();
8958 if !trimmed.ends_with('.') {
8959 return false;
8960 }
8961 if looks_like_numbered_section(trimmed) || looks_like_keyword_numbered_section(trimmed) {
8964 return false;
8965 }
8966 let without_dot = trimmed.trim_end_matches('.');
8970 let word_count = without_dot.split_whitespace().count();
8971 if word_count <= 2 {
8974 return true;
8975 }
8976 false
8977}
8978
/// Returns `true` when the trimmed `text` ends with a comma.
fn should_demote_comma_heading(text: &str) -> bool {
    matches!(text.trim().chars().next_back(), Some(','))
}
8984
/// Returns `true` when `text` contains any of a fixed set of fraction,
/// operator and symbol glyphs (`¼ ½ ¾ ≪ ≫ þ ð ∑ ∫ ∂ ∏ √ ∞ ≈ ÷`).
fn should_demote_math_heading(text: &str) -> bool {
    const MATH_GLYPHS: [char; 15] = [
        '¼', '½', '¾', '≪', '≫', 'þ', 'ð', '∑', '∫', '∂', '∏', '√', '∞', '≈', '÷',
    ];
    text.chars().any(|ch| MATH_GLYPHS.contains(&ch))
}
9008
/// Returns `true` when `text` contains a `%` character anywhere.
fn should_demote_percentage_heading(text: &str) -> bool {
    text.chars().any(|ch| ch == '%')
}
9014
/// Returns `true` when the trimmed `text` starts like a bibliography entry:
/// four ASCII digits and a period, followed either by a space or by nothing
/// at all (for example `"2010. Title"` or a bare `"2010."`).
///
/// The original expression already listed the bare form as an alternative
/// (`t.len() == 5`), but an earlier `t.len() < 6` guard made it unreachable.
/// The guard is relaxed here so both forms are recognised.
fn should_demote_bibliography_heading(text: &str) -> bool {
    let bytes = text.trim().as_bytes();
    if bytes.len() < 5 {
        return false;
    }

    let starts_with_year_dot = bytes[..4].iter().all(u8::is_ascii_digit) && bytes[4] == b'.';
    // Either the text ends right after the dot, or a space follows it.
    starts_with_year_dot && matches!(bytes.get(5), None | Some(b' '))
}
9027
/// Returns the trimmed `text` without a trailing page number.
///
/// The last space-separated token is removed only when it is one to four
/// ASCII digits and at least three words precede it; otherwise the trimmed
/// text is returned unchanged.
fn strip_trailing_page_number(text: &str) -> &str {
    let trimmed = text.trim();
    let Some((body, tail)) = trimmed.rsplit_once(' ') else {
        return trimmed;
    };

    let tail_is_page_number =
        (1..=4).contains(&tail.len()) && tail.bytes().all(|byte| byte.is_ascii_digit());
    if tail_is_page_number && body.split_whitespace().count() >= 3 {
        body.trim()
    } else {
        trimmed
    }
}
9046
/// Finds the byte offset at which a second, merged section number begins
/// inside `text`.
///
/// Scanning starts at byte 3. A position qualifies when the previous byte is
/// a space and either
/// * the byte is an ASCII digit and the first `.` found anywhere after it is
///   immediately followed by an ASCII digit, or
/// * the byte is an ASCII uppercase letter followed by `.` and an ASCII digit.
///
/// Returns the first qualifying offset, or `None`.
fn find_merged_subsection_split(text: &str) -> Option<usize> {
    let bytes = text.as_bytes();

    (3..bytes.len()).find(|&i| {
        if bytes[i - 1] != b' ' {
            return false;
        }
        let lead = bytes[i];

        if lead.is_ascii_digit() {
            // `lead` is ASCII, so `i` is a valid char boundary for slicing.
            let dot_then_digit = text[i..]
                .find('.')
                .and_then(|dot| bytes.get(i + dot + 1))
                .is_some_and(|b| b.is_ascii_digit());
            if dot_then_digit {
                return true;
            }
        }

        lead.is_ascii_uppercase()
            && bytes.get(i + 1) == Some(&b'.')
            && bytes.get(i + 2).is_some_and(|b| b.is_ascii_digit())
    })
}
9082
9083fn should_skip_heading_text(text: &str) -> bool {
9084 let trimmed = text.trim();
9085 if trimmed.is_empty() || is_standalone_page_number(trimmed) {
9086 return true;
9087 }
9088
9089 let lower = trimmed.to_ascii_lowercase();
9090 if (lower.starts_with("chapter ") || lower.chars().next().is_some_and(|c| c.is_ascii_digit()))
9091 && trimmed.contains('|')
9092 {
9093 return true;
9094 }
9095
9096 let alpha_count = trimmed.chars().filter(|c| c.is_alphabetic()).count();
9097 let alnum_count = trimmed.chars().filter(|c| c.is_alphanumeric()).count();
9098 alpha_count == 0 || (alnum_count > 0 && alpha_count * 3 < alnum_count && !trimmed.contains(':'))
9099}
9100
/// Glues adjacent whitespace-separated tokens that look like two halves of a
/// single word.
///
/// For each neighbouring pair, both tokens are reduced to their "core" by
/// trimming non-alphabetic characters from both ends. The pair is joined
/// (without a separator) when both cores are non-empty and purely alphabetic,
/// at least one core is at most 4 bytes long, the combined core length is at
/// least 6 bytes, the right core does not start with an uppercase letter, and
/// neither core is a stopword. After a join the merged token is compared with
/// its new right neighbour again.
///
/// Input with fewer than two tokens is returned unchanged; otherwise tokens
/// are re-joined with single spaces.
fn repair_fragmented_words(text: &str) -> String {
    const STOPWORDS: &[&str] = &[
        "a", "an", "and", "are", "as", "at", "be", "by", "can", "for", "from", "if", "in", "into",
        "is", "it", "may", "must", "not", "of", "on", "or", "per", "that", "the", "to", "with",
    ];

    let alpha_core = |token: &str| -> String {
        token
            .trim_matches(|c: char| !c.is_alphabetic())
            .to_string()
    };
    let is_stopword = |core: &str| STOPWORDS.contains(&core.to_ascii_lowercase().as_str());

    let mut tokens: Vec<String> = text.split_whitespace().map(str::to_string).collect();
    if tokens.len() < 2 {
        return text.to_string();
    }

    let mut cursor = 0usize;
    while cursor + 1 < tokens.len() {
        let lhs = alpha_core(&tokens[cursor]);
        let rhs = alpha_core(&tokens[cursor + 1]);

        let both_alphabetic = !lhs.is_empty()
            && !rhs.is_empty()
            && lhs.chars().all(char::is_alphabetic)
            && rhs.chars().all(char::is_alphabetic);
        let plausible_lengths = (lhs.len() <= 4 || rhs.len() <= 4) && lhs.len() + rhs.len() >= 6;
        let rhs_capitalised = rhs.chars().next().is_some_and(char::is_uppercase);

        let glue = both_alphabetic
            && plausible_lengths
            && !rhs_capitalised
            && !is_stopword(&lhs)
            && !is_stopword(&rhs);

        if glue {
            // Stay on the same index so the merged token is re-examined.
            let tail = tokens.remove(cursor + 1);
            tokens[cursor].push_str(&tail);
        } else {
            cursor += 1;
        }
    }

    tokens.join(" ")
}
9141
9142fn list_item_text_from_contents(contents: &[ContentElement]) -> String {
9144 let mut text = String::new();
9145 for elem in contents {
9146 let part = match elem {
9147 ContentElement::Paragraph(p) => p.base.value(),
9148 ContentElement::TextBlock(tb) => tb.value(),
9149 ContentElement::TextLine(tl) => tl.value(),
9150 ContentElement::TextChunk(tc) => tc.value.clone(),
9151 _ => String::new(),
9152 };
9153 if !text.is_empty() && !part.is_empty() {
9154 text.push(' ');
9155 }
9156 text.push_str(&part);
9157 }
9158 text
9159}
9160
/// Reports whether `row` has a blank cell sandwiched between filled cells.
///
/// Leading and trailing blank cells do not count; a cell is blank when it is
/// empty after trimming.
fn has_internal_header_gap(row: &[String]) -> bool {
    // Start at the first filled cell, then look for "blank ... filled".
    let mut occupancy = row
        .iter()
        .map(|cell| !cell.trim().is_empty())
        .skip_while(|filled| !*filled);

    match occupancy.position(|filled| !filled) {
        Some(_) => occupancy.any(|filled| filled),
        None => false,
    }
}
9178
/// Fills blank cells of a grouped header row with the label of the nearest
/// filled ("anchor") cell.
///
/// A blank `parent` cell is filled only when the `child` cell in the same
/// column is non-blank. The label comes from the anchor column with the
/// smallest index distance; on a tie the anchor further to the right wins.
/// Copied labels are trimmed, while cells that were already filled are kept
/// verbatim.
///
/// The result always has `parent.len()` cells. Columns that exist only in
/// `child` are ignored, so ragged rows no longer cause an out-of-bounds panic.
/// When `parent` has no filled cell it is returned unchanged.
fn expand_grouped_header_row(parent: &[String], child: &[String]) -> Vec<String> {
    let anchor_cols: Vec<usize> = parent
        .iter()
        .enumerate()
        .filter_map(|(idx, cell)| (!cell.trim().is_empty()).then_some(idx))
        .collect();

    let mut expanded = parent.to_vec();
    if anchor_cols.is_empty() {
        return expanded;
    }

    // `zip` stops at the shorter row, which bounds `col_idx` by `parent.len()`.
    for (col_idx, (slot, child_cell)) in expanded.iter_mut().zip(child).enumerate() {
        if !slot.trim().is_empty() || child_cell.trim().is_empty() {
            continue;
        }

        // Anchors are ascending; walking them in reverse makes `min_by_key`
        // (which keeps the first minimum) prefer the right-most anchor on ties.
        let nearest = anchor_cols
            .iter()
            .rev()
            .copied()
            .min_by_key(|anchor| anchor.abs_diff(col_idx));
        if let Some(anchor) = nearest {
            *slot = parent[anchor].trim().to_string();
        }
    }

    expanded
}
9209
9210fn preserve_grouped_header_rows(rows: &mut [Vec<String>]) -> bool {
9211 if rows.len() < 2 || rows[0].is_empty() || rows[1].is_empty() {
9212 return false;
9213 }
9214 if rows[0].first().is_none_or(|cell| cell.trim().is_empty()) {
9215 return false;
9216 }
9217 if rows[1].first().is_some_and(|cell| !cell.trim().is_empty()) {
9218 return false;
9219 }
9220
9221 let first_filled = rows[0]
9222 .iter()
9223 .filter(|cell| !cell.trim().is_empty())
9224 .count();
9225 let second_filled = rows[1]
9226 .iter()
9227 .filter(|cell| !cell.trim().is_empty())
9228 .count();
9229 if first_filled < 2 || second_filled <= first_filled || !has_internal_header_gap(&rows[0]) {
9230 return false;
9231 }
9232
9233 rows[0] = expand_grouped_header_row(&rows[0], &rows[1]);
9234 true
9235}
9236
9237fn merge_continuation_rows(rows: &mut Vec<Vec<String>>) {
9248 if rows.len() < 2 {
9249 return;
9250 }
9251 if preserve_grouped_header_rows(rows) {
9252 return;
9253 }
9254 if rows[0].first().is_none_or(|c| c.trim().is_empty()) {
9256 return;
9257 }
9258
9259 let mut merge_count = 0usize;
9260 for (i, row_i) in rows.iter().enumerate().skip(1) {
9261 let first_empty = row_i.first().is_none_or(|c| c.trim().is_empty());
9262 if !first_empty {
9263 break; }
9265 let all_short = row_i
9267 .iter()
9268 .all(|c| c.trim().is_empty() || c.trim().len() <= 30);
9269 if !all_short {
9270 break;
9271 }
9272 merge_count = i;
9273 }
9274
9275 if merge_count == 0 {
9278 return;
9279 }
9280
9281 for i in 1..=merge_count {
9283 let (head, tail) = rows.split_at_mut(i);
9284 let ncols = head[0].len().min(tail[0].len());
9285 for (target, src) in head[0]
9286 .iter_mut()
9287 .take(ncols)
9288 .zip(tail[0].iter().take(ncols))
9289 {
9290 let fragment = src.trim().to_string();
9291 if !fragment.is_empty() {
9292 let target_str = target.trim().to_string();
9293 *target = if target_str.is_empty() {
9294 fragment
9295 } else {
9296 format!("{} {}", target_str, fragment)
9297 };
9298 }
9299 }
9300 }
9301
9302 rows.drain(1..=merge_count);
9304}
9305
9306fn trim_leading_table_carryover_rows(rows: &mut Vec<Vec<String>>) {
9307 while first_body_row_looks_like_carryover(rows) {
9308 rows.remove(1);
9309 }
9310}
9311
9312fn first_body_row_looks_like_carryover(rows: &[Vec<String>]) -> bool {
9313 if rows.len() < 3 {
9314 return false;
9315 }
9316
9317 let key_col_count = infer_leading_key_column_count(&rows[1..]);
9318 if key_col_count == 0 {
9319 return false;
9320 }
9321
9322 let candidate = &rows[1];
9323 if candidate
9324 .iter()
9325 .take(key_col_count)
9326 .any(|cell| !cell.trim().is_empty())
9327 {
9328 return false;
9329 }
9330
9331 let non_empty_cols = candidate
9332 .iter()
9333 .enumerate()
9334 .filter(|(_, cell)| !cell.trim().is_empty())
9335 .map(|(idx, _)| idx)
9336 .collect::<Vec<_>>();
9337 if non_empty_cols.len() != 1 {
9338 return false;
9339 }
9340
9341 let only_col = non_empty_cols[0];
9342 if only_col < key_col_count {
9343 return false;
9344 }
9345
9346 if candidate[only_col].split_whitespace().count() < 4 {
9347 return false;
9348 }
9349
9350 rows[2]
9351 .iter()
9352 .take(key_col_count)
9353 .all(|cell| !cell.trim().is_empty())
9354}
9355
/// Counts how many leading columns behave like short "key" columns.
///
/// Columns are examined left to right. A column counts as a key column when
/// at least 60% of the rows have a non-blank cell in it and the median word
/// count of those cells (upper median for even counts) is at most 3. Counting
/// stops at the first column that fails, including a column with no filled
/// cells. Fewer than two rows always yields 0.
fn infer_leading_key_column_count(rows: &[Vec<String>]) -> usize {
    if rows.len() < 2 {
        return 0;
    }

    let widest = rows.iter().map(Vec::len).max().unwrap_or(0);
    (0..widest)
        .take_while(|&col| {
            let mut words_per_cell: Vec<usize> = rows
                .iter()
                .filter_map(|row| row.get(col))
                .map(|cell| cell.trim())
                .filter(|cell| !cell.is_empty())
                .map(|cell| cell.split_whitespace().count())
                .collect();
            if words_per_cell.is_empty() {
                return false;
            }

            words_per_cell.sort_unstable();
            let median_words = words_per_cell[words_per_cell.len() / 2];
            let fill_ratio = words_per_cell.len() as f64 / rows.len() as f64;
            fill_ratio >= 0.6 && median_words <= 3
        })
        .count()
}
9393
/// Renders `table` into `out` by passing the table's `table_border` to
/// `render_table_border`.
fn render_table(out: &mut String, table: &crate::models::semantic::SemanticTable) {
    render_table_border(out, &table.table_border);
}
9399
/// A run of consecutive `doc.kids` elements that is replaced by one
/// pre-rendered table.
#[derive(Clone, Debug)]
struct GeometricTableRegion {
    /// Index into `doc.kids` of the first element covered (inclusive).
    start_idx: usize,
    /// Index into `doc.kids` of the last element covered (inclusive).
    end_idx: usize,
    /// Rendered output for the whole region.
    rendered: String,
}
9406
/// One line of text represented as its bounding box plus the text chunks
/// that make it up.
#[derive(Clone)]
struct ChunkLine {
    /// Bounding box of the whole line.
    bbox: BoundingBox,
    /// Text chunks belonging to the line.
    chunks: Vec<TextChunk>,
}
9412
/// A piece of a line's text that has been assigned to one column slot.
#[derive(Clone)]
struct SlotFragment {
    /// Index of the slot (column) the fragment was assigned to; used to index
    /// the per-row cell vector.
    slot_idx: usize,
    /// Bounding box of the fragment.
    bbox: BoundingBox,
    /// Text of the fragment.
    text: String,
}
9419
9420fn detect_geometric_table_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
9421 let mut regions = Vec::new();
9422 let mut occupied_until = 0usize;
9423
9424 for (idx, element) in doc.kids.iter().enumerate() {
9425 if idx < occupied_until {
9426 continue;
9427 }
9428
9429 let Some(table) = table_border_from_element(element) else {
9430 continue;
9431 };
9432 let Some(region) = build_geometric_table_region(doc, idx, table) else {
9433 continue;
9434 };
9435 occupied_until = region.end_idx.saturating_add(1);
9436 regions.push(region);
9437 }
9438
9439 let mut occupied = regions
9440 .iter()
9441 .flat_map(|region| region.start_idx..=region.end_idx)
9442 .collect::<HashSet<_>>();
9443 for region in detect_footnote_citation_regions(doc) {
9444 if (region.start_idx..=region.end_idx).any(|idx| occupied.contains(&idx)) {
9445 continue;
9446 }
9447 occupied.extend(region.start_idx..=region.end_idx);
9448 regions.push(region);
9449 }
9450
9451 regions.sort_by_key(|region| region.start_idx);
9452 regions
9453}
9454
9455fn detect_footnote_citation_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
9456 let body_font_size = compute_running_body_font_size(doc);
9457 if body_font_size <= 0.0 {
9458 return Vec::new();
9459 }
9460
9461 let mut regions = Vec::new();
9462 let mut idx = 0usize;
9463 while idx < doc.kids.len() {
9464 let Some(region) = build_footnote_citation_region(doc, idx, body_font_size) else {
9465 idx += 1;
9466 continue;
9467 };
9468 idx = region.end_idx.saturating_add(1);
9469 regions.push(region);
9470 }
9471
9472 regions
9473}
9474
9475fn compute_running_body_font_size(doc: &PdfDocument) -> f64 {
9476 doc.kids
9477 .iter()
9478 .filter_map(|element| {
9479 let ContentElement::Paragraph(paragraph) = element else {
9480 return None;
9481 };
9482 let text = paragraph.base.value();
9483 (text.split_whitespace().count() > 10).then_some(paragraph.base.font_size?)
9484 })
9485 .fold(0.0_f64, f64::max)
9486}
9487
/// Tries to build a footnote/citation table region starting at `start_idx`.
///
/// Two entry shapes are recognised:
/// * the start element is set in a small font (at or below the threshold
///   derived from `body_font_size`) and begins with a footnote marker;
///   preceding elements may be attached via `leading_footnote_attachment`;
/// * otherwise the start element must end with a footnote lead
///   (`split_trailing_footnote_lead`) and be followed by a small-font text
///   element on the same page and in the same column.
///
/// Following small-font elements on the same page and in the same column are
/// then collected, the fragments are parsed into `(marker, citation)` rows,
/// and the region is accepted only if there are at least three rows, every
/// marker parses as `u32`, and the markers are almost entirely sequential.
///
/// Returns `None` whenever any of these conditions fails. On success the
/// rendered text is an optional escaped lead paragraph followed by an HTML
/// table with a "Footnote" / "Citation" header row.
fn build_footnote_citation_region(
    doc: &PdfDocument,
    start_idx: usize,
    body_font_size: f64,
) -> Option<GeometricTableRegion> {
    let element = doc.kids.get(start_idx)?;
    if !is_geometric_text_candidate(element) {
        return None;
    }

    let start_text = extract_element_text(element);
    let trimmed_start = start_text.trim();
    if trimmed_start.is_empty() {
        return None;
    }

    // "Small" means at most 92% of the body size and at least 0.8 below it,
    // clamped so the threshold never goes negative.
    let small_font_threshold = (body_font_size * 0.92).min(body_font_size - 0.8).max(0.0);
    let mut lead_prefix = None;
    let mut fragments = Vec::new();
    let page_number = element.page_number()?;
    let mut column_bbox = element.bbox().clone();
    let mut region_start_idx = start_idx;
    let mut end_idx = start_idx;

    if element_font_size(element).is_some_and(|font_size| font_size <= small_font_threshold)
        && starts_with_footnote_marker(trimmed_start)
    {
        // Case 1: the start element is itself a small-font footnote line.
        // Earlier elements may belong to the same block; if so, the region
        // starts at the attachment index instead.
        if let Some((attach_idx, prefix, leading_fragments)) = leading_footnote_attachment(
            doc,
            start_idx,
            page_number,
            &column_bbox,
            small_font_threshold,
        ) {
            lead_prefix = Some(prefix);
            fragments.extend(leading_fragments);
            region_start_idx = attach_idx;
        }
        fragments.push(footnote_fragment_text(element));
    } else {
        // Case 2: the start element ends with the first footnote marker; the
        // next element must be small-font text on the same page and column.
        let (prefix, first_tail) = split_trailing_footnote_lead(trimmed_start)?;
        let next = doc.kids.get(start_idx + 1)?;
        if !is_geometric_text_candidate(next)
            || next.page_number() != Some(page_number)
            || !element_font_size(next).is_some_and(|font_size| font_size <= small_font_threshold)
        {
            return None;
        }
        if !same_column_region(&column_bbox, next.bbox()) {
            return None;
        }
        lead_prefix = Some(prefix);
        fragments.push(first_tail);
    }

    // Extend the region over the following small-font elements, stopping at
    // the first element that leaves the page, is empty, starts a caption, has
    // no or too large a font size, or sits in a different column.
    let mut consecutive_small = 0usize;
    for idx in start_idx + 1..doc.kids.len() {
        let candidate = &doc.kids[idx];
        if !is_geometric_text_candidate(candidate) || candidate.page_number() != Some(page_number) {
            break;
        }

        let candidate_text = extract_element_text(candidate);
        let trimmed = candidate_text.trim();
        if trimmed.is_empty() || starts_with_caption_prefix(trimmed) {
            break;
        }

        let Some(font_size) = element_font_size(candidate) else {
            break;
        };
        if font_size > small_font_threshold {
            break;
        }
        if !same_column_region(&column_bbox, candidate.bbox()) {
            break;
        }

        // Grow the column box so later candidates are compared against the
        // whole block collected so far.
        column_bbox = column_bbox.union(candidate.bbox());
        fragments.push(footnote_fragment_text(candidate));
        consecutive_small += 1;
        end_idx = idx;
    }

    // A lead paragraph without any following small-font element is rejected.
    if consecutive_small == 0 && lead_prefix.is_some() {
        return None;
    }

    let rows = parse_footnote_citation_rows(&fragments);
    if rows.len() < 3 {
        return None;
    }

    // Every marker must be numeric.
    let numeric_markers = rows
        .iter()
        .filter_map(|(marker, _)| marker.parse::<u32>().ok())
        .collect::<Vec<_>>();
    if numeric_markers.len() != rows.len() {
        return None;
    }
    // Count adjacent pairs that increase by exactly one; with `n` rows there
    // are `n - 1` pairs and at most one of them may be out of sequence.
    let sequential_steps = numeric_markers
        .windows(2)
        .filter(|pair| pair[1] == pair[0] + 1)
        .count();
    if sequential_steps + 1 < rows.len().saturating_sub(1) {
        return None;
    }

    let mut rendered_rows = vec![vec!["Footnote".to_string(), "Citation".to_string()]];
    rendered_rows.extend(
        rows.into_iter()
            .map(|(marker, citation)| vec![marker, citation]),
    );

    let mut rendered = String::new();
    if let Some(prefix) = lead_prefix {
        // The lead text is emitted as its own paragraph before the table.
        rendered.push_str(&escape_md_line_start(prefix.trim()));
        rendered.push_str("\n\n");
    }
    rendered.push_str(&render_html_table(&rendered_rows));

    Some(GeometricTableRegion {
        start_idx: region_start_idx,
        end_idx,
        rendered,
    })
}
9615
9616fn leading_footnote_attachment(
9617 doc: &PdfDocument,
9618 start_idx: usize,
9619 page_number: u32,
9620 column_bbox: &BoundingBox,
9621 small_font_threshold: f64,
9622) -> Option<(usize, String, Vec<String>)> {
9623 let mut idx = start_idx.checked_sub(1)?;
9624 let mut leading_fragments = Vec::new();
9625 let mut scanned = 0usize;
9626
9627 loop {
9628 let candidate = doc.kids.get(idx)?;
9629 scanned += 1;
9630 if scanned > 6 || candidate.page_number() != Some(page_number) {
9631 return None;
9632 }
9633
9634 if !is_geometric_text_candidate(candidate) {
9635 if idx == 0 {
9636 return None;
9637 }
9638 idx -= 1;
9639 continue;
9640 }
9641
9642 let text = extract_element_text(candidate);
9643 let trimmed = text.trim();
9644 if trimmed.is_empty() {
9645 if idx == 0 {
9646 return None;
9647 }
9648 idx -= 1;
9649 continue;
9650 }
9651 if !same_column_region(candidate.bbox(), column_bbox) {
9652 return None;
9653 }
9654
9655 if element_font_size(candidate).is_some_and(|font_size| font_size <= small_font_threshold) {
9656 leading_fragments.push(footnote_fragment_text(candidate));
9657 if idx == 0 {
9658 return None;
9659 }
9660 idx -= 1;
9661 continue;
9662 }
9663
9664 let (prefix, first_tail) = split_trailing_footnote_lead(trimmed)?;
9665 leading_fragments.push(first_tail);
9666 leading_fragments.reverse();
9667 return Some((idx, prefix, leading_fragments));
9668 }
9669}
9670
9671fn parse_footnote_citation_rows(fragments: &[String]) -> Vec<(String, String)> {
9672 let mut rows = Vec::new();
9673 let mut current_marker = None::<String>;
9674 let mut current_citation = String::new();
9675
9676 for fragment in fragments {
9677 let markers = find_footnote_marker_positions(fragment);
9678 if markers.is_empty() {
9679 if current_marker.is_some() {
9680 merge_paragraph_text(&mut current_citation, fragment.trim());
9681 }
9682 continue;
9683 }
9684
9685 let mut cursor = 0usize;
9686 for (pos, marker, skip_len) in markers {
9687 let prefix = fragment[cursor..pos].trim();
9688 if current_marker.is_some() && !prefix.is_empty() {
9689 merge_paragraph_text(&mut current_citation, prefix);
9690 }
9691 if let Some(marker_value) = current_marker.take() {
9692 let trimmed = current_citation.trim();
9693 if !trimmed.is_empty() {
9694 rows.push((marker_value, trimmed.to_string()));
9695 }
9696 current_citation.clear();
9697 }
9698 current_marker = Some(marker);
9699 cursor = pos + skip_len;
9700 }
9701
9702 let tail = fragment[cursor..].trim();
9703 if current_marker.is_some() && !tail.is_empty() {
9704 merge_paragraph_text(&mut current_citation, tail);
9705 }
9706 }
9707
9708 if let Some(marker_value) = current_marker {
9709 let trimmed = current_citation.trim();
9710 if !trimmed.is_empty() {
9711 rows.push((marker_value, trimmed.to_string()));
9712 }
9713 }
9714
9715 rebalance_adjacent_footnote_citations(&mut rows);
9716 rows
9717}
9718
9719fn rebalance_adjacent_footnote_citations(rows: &mut [(String, String)]) {
9720 for idx in 0..rows.len().saturating_sub(1) {
9721 if !rows[idx].1.trim_end().ends_with(',') {
9722 continue;
9723 }
9724
9725 let next = rows[idx + 1].1.trim().to_string();
9726 let Some((stub, remainder)) = split_leading_citation_stub(&next) else {
9727 continue;
9728 };
9729 let Some((first_sentence, trailing)) = split_first_sentence(remainder) else {
9730 continue;
9731 };
9732 if first_sentence.split_whitespace().count() < 2 {
9733 continue;
9734 }
9735
9736 merge_paragraph_text(&mut rows[idx].1, first_sentence);
9737 rows[idx + 1].1 = if trailing.is_empty() {
9738 stub.to_string()
9739 } else {
9740 format!("{stub} {trailing}")
9741 };
9742 }
9743}
9744
/// Splits `text` at its first comma into a short stub (including the comma)
/// and the remainder.
///
/// Returns `None` when there is no comma, when the comma sits more than
/// 8 bytes into the text, or when either trimmed part is empty.
fn split_leading_citation_stub(text: &str) -> Option<(&str, &str)> {
    let (before_comma, after_comma) = text.split_once(',')?;
    if before_comma.len() > 8 {
        return None;
    }

    // Re-slice from `text` so the stub keeps its trailing comma.
    let stub = text[..=before_comma.len()].trim();
    let remainder = after_comma.trim();
    if stub.is_empty() || remainder.is_empty() {
        None
    } else {
        Some((stub, remainder))
    }
}
9754
/// Splits `text` after the first `". "` into the first sentence (including
/// its period) and the trimmed rest.
///
/// Returns `None` when `text` contains no period followed by a space.
fn split_first_sentence(text: &str) -> Option<(&str, &str)> {
    let (head, rest) = text.split_once(". ")?;
    // Re-slice from `text` so the sentence keeps its closing period.
    let sentence = text[..head.len() + 1].trim();
    if sentence.is_empty() {
        return None;
    }
    Some((sentence, rest.trim()))
}
9761
/// Locates footnote markers inside `text`.
///
/// A marker is a run of one or two ASCII digits that
/// * starts the text or follows whitespace or one of `. , ; : ) ] " ' ”`,
/// * is followed by at least one whitespace character, and
/// * after that whitespace is followed by an ASCII uppercase letter or one of
///   `(`, `[`, `*`.
///
/// Each result is `(byte offset of the marker, marker digits, number of bytes
/// from the marker start to the first character after the whitespace)`.
fn find_footnote_marker_positions(text: &str) -> Vec<(usize, String, usize)> {
    let glyphs: Vec<(usize, char)> = text.char_indices().collect();
    // Byte offset of the glyph at `slot`, or the text length past the end.
    let byte_at = |slot: usize| glyphs.get(slot).map_or(text.len(), |&(offset, _)| offset);
    let opens_token = |slot: usize| match slot.checked_sub(1) {
        None => true,
        Some(prev) => {
            let before = glyphs[prev].1;
            before.is_whitespace()
                || matches!(
                    before,
                    '.' | ',' | ';' | ':' | ')' | ']' | '"' | '\'' | '”'
                )
        }
    };

    let mut found = Vec::new();
    let mut slot = 0usize;

    while slot < glyphs.len() {
        let (start_byte, ch) = glyphs[slot];
        if !ch.is_ascii_digit() || !opens_token(slot) {
            slot += 1;
            continue;
        }

        let digit_count = glyphs[slot..]
            .iter()
            .take_while(|entry| entry.1.is_ascii_digit())
            .count();
        let digits_end = slot + digit_count;
        let gap_count = glyphs[digits_end..]
            .iter()
            .take_while(|entry| entry.1.is_whitespace())
            .count();
        let body_slot = digits_end + gap_count;
        let body_starts_citation = glyphs.get(body_slot).is_some_and(|&(_, next)| {
            next.is_ascii_uppercase() || matches!(next, '(' | '[' | '*')
        });

        if digit_count > 2 || gap_count == 0 || !body_starts_citation {
            slot += 1;
            continue;
        }

        let digits = &text[start_byte..byte_at(digits_end)];
        found.push((
            start_byte,
            digits.to_string(),
            byte_at(body_slot) - start_byte,
        ));
        slot = body_slot;
    }

    found
}
9822
9823fn split_trailing_footnote_lead(text: &str) -> Option<(String, String)> {
9824 let markers = find_footnote_marker_positions(text);
9825 let (pos, marker, skip_len) = markers.last()?.clone();
9826 let prefix = text[..pos].trim();
9827 let tail = text[pos + skip_len..].trim();
9828 if prefix.split_whitespace().count() < 6 || tail.split_whitespace().count() > 6 {
9829 return None;
9830 }
9831 Some((prefix.to_string(), format!("{marker} {tail}")))
9832}
9833
9834fn starts_with_footnote_marker(text: &str) -> bool {
9835 find_footnote_marker_positions(text)
9836 .first()
9837 .is_some_and(|(pos, _, _)| *pos == 0)
9838}
9839
9840fn same_column_region(left: &BoundingBox, right: &BoundingBox) -> bool {
9841 let overlap = (left.right_x.min(right.right_x) - left.left_x.max(right.left_x)).max(0.0);
9842 let min_width = left.width().min(right.width()).max(1.0);
9843 overlap / min_width >= 0.35 || (left.left_x - right.left_x).abs() <= 28.0
9844}
9845
9846fn footnote_fragment_text(element: &ContentElement) -> String {
9847 let text = extract_element_text(element);
9848 if element_font_name(element)
9849 .as_deref()
9850 .is_some_and(|name| name.to_ascii_lowercase().contains("italic"))
9851 {
9852 format!("*{}*", text.trim())
9853 } else {
9854 text
9855 }
9856}
9857
9858fn element_font_size(element: &ContentElement) -> Option<f64> {
9859 match element {
9860 ContentElement::Paragraph(p) => p.base.font_size,
9861 ContentElement::Heading(h) => h.base.base.font_size,
9862 ContentElement::NumberHeading(nh) => nh.base.base.base.font_size,
9863 ContentElement::TextBlock(tb) => Some(tb.font_size),
9864 ContentElement::TextLine(tl) => Some(tl.font_size),
9865 _ => None,
9866 }
9867}
9868
9869fn element_font_name(element: &ContentElement) -> Option<String> {
9870 match element {
9871 ContentElement::Paragraph(p) => p.base.font_name.clone(),
9872 ContentElement::Heading(h) => h.base.base.font_name.clone(),
9873 ContentElement::NumberHeading(nh) => nh.base.base.base.font_name.clone(),
9874 _ => None,
9875 }
9876}
9877
9878fn table_border_from_element(
9879 element: &ContentElement,
9880) -> Option<&crate::models::table::TableBorder> {
9881 match element {
9882 ContentElement::TableBorder(table) => Some(table),
9883 ContentElement::Table(table) => Some(&table.table_border),
9884 _ => None,
9885 }
9886}
9887
/// Tries to rebuild a table whose header text lies outside the detected
/// table border, combining the text elements around `doc.kids[table_idx]`
/// with the table's own rows.
///
/// Returns `None` when the table has no rows or fewer than three columns,
/// when column ranges cannot be derived, when there are no header candidates
/// above the table, when neither an external left stub nor an embedded stub
/// header is detected, when no header row can be reconstructed, when none of
/// the reconstructed header rows is dense enough, or when the body is empty.
///
/// On success the region starts at the first header candidate and ends at
/// the last footer candidate (or at the table itself when there is none); its
/// `rendered` text is produced by `render_pipe_rows` from header rows followed
/// by body rows.
fn build_geometric_table_region(
    doc: &PdfDocument,
    table_idx: usize,
    table: &crate::models::table::TableBorder,
) -> Option<GeometricTableRegion> {
    let mut table_rows = collect_table_border_rows(table);
    if table_rows.is_empty() || table.num_columns < 3 {
        return None;
    }
    merge_continuation_rows(&mut table_rows);

    let column_ranges = table_column_ranges(table)?;
    let candidate_indices = collect_table_header_candidate_indices(doc, table_idx, table);
    if candidate_indices.is_empty() {
        return None;
    }

    // Two supported layouts: a label column to the left of the border
    // ("external stub"), or a stub label inside the first table row
    // ("embedded stub header"). At least one must apply.
    let needs_external_stub =
        infer_left_stub_requirement(doc, &candidate_indices, &table_rows, &column_ranges);
    let supports_embedded_stub_header =
        supports_embedded_stub_header(&table_rows, &column_ranges, doc, &candidate_indices);
    if !needs_external_stub && !supports_embedded_stub_header {
        return None;
    }
    // With an external stub the slots get one extra leading column.
    let slot_ranges = if needs_external_stub {
        slot_ranges(&column_ranges, doc, &candidate_indices, true)?
    } else {
        column_ranges.clone()
    };
    let mut header_rows = reconstruct_aligned_rows(doc, &candidate_indices, &slot_ranges, true, 2);
    if header_rows.is_empty() {
        return None;
    }
    if needs_external_stub {
        normalize_leading_stub_header(&mut header_rows);
    } else {
        promote_embedded_stub_header(&mut header_rows, &table_rows);
    }

    // Require at least one header row that fills all but one slot (and never
    // fewer than two cells).
    let slot_count = slot_ranges.len();
    let dense_header_rows = header_rows
        .iter()
        .filter(|row| {
            row.iter().filter(|cell| !cell.trim().is_empty()).count()
                >= slot_count.saturating_sub(1).max(2)
        })
        .count();
    if dense_header_rows == 0 {
        return None;
    }

    let mut combined_rows = Vec::new();
    combined_rows.extend(header_rows);

    let following_indices = collect_table_footer_candidate_indices(doc, table_idx, table);
    let body_rows = if needs_external_stub && should_merge_panel_body_rows(&table_rows) {
        // Panel layout: the table rows plus any rows reconstructed from the
        // text below the table are merged into a single body row.
        let trailing_rows =
            reconstruct_aligned_rows(doc, &following_indices, &slot_ranges, false, 1);
        vec![merge_panel_body_row(
            &table_rows,
            &trailing_rows,
            slot_count,
        )]
    } else if needs_external_stub {
        // Shift every body row right by one blank cell so it lines up with
        // the extra stub slot.
        table_rows
            .iter()
            .map(|row| {
                let mut shifted = vec![String::new()];
                shifted.extend(row.iter().cloned());
                shifted
            })
            .collect()
    } else {
        table_rows
    };

    if body_rows.is_empty() {
        return None;
    }
    combined_rows.extend(body_rows);

    let rendered = render_pipe_rows(&combined_rows);
    // NOTE(review): `end_idx` covers the footer candidates in every branch,
    // although only the panel branch folds their text into the body. Confirm
    // that callers do not drop those elements for the other two branches.
    Some(GeometricTableRegion {
        start_idx: candidate_indices[0],
        end_idx: following_indices.last().copied().unwrap_or(table_idx),
        rendered,
    })
}
9976
9977fn table_column_ranges(table: &crate::models::table::TableBorder) -> Option<Vec<(f64, f64)>> {
9978 if table.num_columns == 0 {
9979 return None;
9980 }
9981
9982 let mut ranges = vec![(f64::INFINITY, f64::NEG_INFINITY); table.num_columns];
9983 for row in &table.rows {
9984 for cell in &row.cells {
9985 if cell.col_number >= table.num_columns {
9986 continue;
9987 }
9988 let range = &mut ranges[cell.col_number];
9989 range.0 = range.0.min(cell.bbox.left_x);
9990 range.1 = range.1.max(cell.bbox.right_x);
9991 }
9992 }
9993
9994 if ranges
9995 .iter()
9996 .any(|(left, right)| !left.is_finite() || !right.is_finite() || right <= left)
9997 {
9998 return None;
9999 }
10000
10001 Some(ranges)
10002}
10003
10004fn collect_table_header_candidate_indices(
10005 doc: &PdfDocument,
10006 table_idx: usize,
10007 table: &crate::models::table::TableBorder,
10008) -> Vec<usize> {
10009 let mut indices = Vec::new();
10010 let table_page = table.bbox.page_number;
10011 let table_top = table.bbox.top_y;
10012 let mut cursor = table_idx;
10013
10014 while let Some(prev_idx) = cursor.checked_sub(1) {
10015 let element = &doc.kids[prev_idx];
10016 if element.page_number() != table_page {
10017 break;
10018 }
10019 if !is_geometric_text_candidate(element) {
10020 break;
10021 }
10022
10023 let bbox = element.bbox();
10024 let vertical_gap = bbox.bottom_y - table_top;
10025 if !(-6.0..=260.0).contains(&vertical_gap) {
10026 break;
10027 }
10028
10029 indices.push(prev_idx);
10030 cursor = prev_idx;
10031 if indices.len() >= 10 {
10032 break;
10033 }
10034 }
10035
10036 indices.reverse();
10037 indices
10038}
10039
10040fn collect_table_footer_candidate_indices(
10041 doc: &PdfDocument,
10042 table_idx: usize,
10043 table: &crate::models::table::TableBorder,
10044) -> Vec<usize> {
10045 let mut indices = Vec::new();
10046 let table_page = table.bbox.page_number;
10047 let table_bottom = table.bbox.bottom_y;
10048
10049 for idx in table_idx + 1..doc.kids.len() {
10050 let element = &doc.kids[idx];
10051 if element.page_number() != table_page {
10052 break;
10053 }
10054 if !is_geometric_text_candidate(element) {
10055 break;
10056 }
10057 if looks_like_margin_page_number(doc, element, &extract_element_text(element)) {
10058 break;
10059 }
10060
10061 let bbox = element.bbox();
10062 let gap = table_bottom - bbox.top_y;
10063 if !(-6.0..=28.0).contains(&gap) {
10064 break;
10065 }
10066 indices.push(idx);
10067 if indices.len() >= 4 {
10068 break;
10069 }
10070 }
10071
10072 indices
10073}
10074
/// Reports whether `element` is one of the text-bearing variants
/// (`Paragraph`, `Heading`, `NumberHeading`, `TextBlock`, `TextLine`).
fn is_geometric_text_candidate(element: &ContentElement) -> bool {
    matches!(
        element,
        ContentElement::Paragraph(_)
            | ContentElement::Heading(_)
            | ContentElement::NumberHeading(_)
            | ContentElement::TextBlock(_)
            | ContentElement::TextLine(_)
    )
}
10085
10086fn infer_left_stub_requirement(
10087 doc: &PdfDocument,
10088 candidate_indices: &[usize],
10089 table_rows: &[Vec<String>],
10090 column_ranges: &[(f64, f64)],
10091) -> bool {
10092 if column_ranges.is_empty() {
10093 return false;
10094 }
10095
10096 let first_width = (column_ranges[0].1 - column_ranges[0].0).max(1.0);
10097 let has_left_label = candidate_indices.iter().any(|idx| {
10098 let bbox = doc.kids[*idx].bbox();
10099 bbox.right_x <= column_ranges[0].0 + first_width * 0.12
10100 && bbox.width() <= first_width * 0.45
10101 });
10102 if !has_left_label {
10103 return false;
10104 }
10105
10106 let mut first_col_word_counts: Vec<usize> = table_rows
10107 .iter()
10108 .filter_map(|row| row.first())
10109 .map(|cell| cell.split_whitespace().count())
10110 .collect();
10111 if first_col_word_counts.is_empty() {
10112 return false;
10113 }
10114 first_col_word_counts.sort_unstable();
10115 let median = first_col_word_counts[first_col_word_counts.len() / 2];
10116 median >= 5
10117}
10118
10119fn supports_embedded_stub_header(
10120 table_rows: &[Vec<String>],
10121 column_ranges: &[(f64, f64)],
10122 doc: &PdfDocument,
10123 candidate_indices: &[usize],
10124) -> bool {
10125 if table_rows.len() < 2 || column_ranges.len() < 3 {
10126 return false;
10127 }
10128
10129 let first_row = &table_rows[0];
10130 if first_row.len() != column_ranges.len() || first_row[0].trim().is_empty() {
10131 return false;
10132 }
10133 if first_row[0].split_whitespace().count() > 3 || first_row[0].trim().len() > 24 {
10134 return false;
10135 }
10136
10137 let data_fill = first_row
10138 .iter()
10139 .skip(1)
10140 .filter(|cell| !cell.trim().is_empty())
10141 .count();
10142 if data_fill + 1 < column_ranges.len() {
10143 return false;
10144 }
10145
10146 let labeled_rows = table_rows
10147 .iter()
10148 .skip(1)
10149 .filter(|row| row.first().is_some_and(|cell| !cell.trim().is_empty()))
10150 .count();
10151 if labeled_rows == 0 {
10152 return false;
10153 }
10154
10155 let slot_ranges = column_ranges.to_vec();
10156 let header_rows = reconstruct_aligned_rows(doc, candidate_indices, &slot_ranges, true, 2);
10157 header_rows.iter().any(|row| {
10158 row.first().is_none_or(|cell| cell.trim().is_empty())
10159 && row
10160 .iter()
10161 .skip(1)
10162 .filter(|cell| !cell.trim().is_empty())
10163 .count()
10164 >= column_ranges.len().saturating_sub(1)
10165 })
10166}
10167
10168fn slot_ranges(
10169 column_ranges: &[(f64, f64)],
10170 doc: &PdfDocument,
10171 candidate_indices: &[usize],
10172 needs_stub: bool,
10173) -> Option<Vec<(f64, f64)>> {
10174 let mut slots = Vec::new();
10175 if needs_stub {
10176 let first_left = column_ranges.first()?.0;
10177 let left_stub_start = candidate_indices
10178 .iter()
10179 .map(|idx| doc.kids[*idx].bbox().left_x)
10180 .fold(first_left, f64::min);
10181 let stub_right = first_left - 1.0;
10182 if stub_right <= left_stub_start {
10183 return None;
10184 }
10185 slots.push((left_stub_start, stub_right));
10186 }
10187 slots.extend(column_ranges.iter().copied());
10188 Some(slots)
10189}
10190
10191fn reconstruct_aligned_rows(
10192 doc: &PdfDocument,
10193 candidate_indices: &[usize],
10194 slot_ranges: &[(f64, f64)],
10195 drop_wide_singletons: bool,
10196 min_filled_slots: usize,
10197) -> Vec<Vec<String>> {
10198 if candidate_indices.is_empty() || slot_ranges.is_empty() {
10199 return Vec::new();
10200 }
10201
10202 let mut row_bands: Vec<(BoundingBox, Vec<String>)> = Vec::new();
10203
10204 for idx in candidate_indices {
10205 for line in extract_chunk_lines(&doc.kids[*idx]) {
10206 let fragments = split_line_into_slot_fragments(&line, slot_ranges);
10207 if fragments.is_empty() {
10208 continue;
10209 }
10210
10211 if drop_wide_singletons && fragments.len() == 1 {
10212 let only = &fragments[0];
10213 let span_width = only.bbox.width();
10214 let table_width =
10215 slot_ranges.last().map(|(_, right)| *right).unwrap_or(0.0) - slot_ranges[0].0;
10216 if span_width >= table_width * 0.55 {
10217 continue;
10218 }
10219 }
10220
10221 let line_center = line.bbox.center_y();
10222 let tolerance = line
10223 .chunks
10224 .iter()
10225 .map(|chunk| chunk.font_size)
10226 .fold(8.0, f64::max)
10227 * 0.8;
10228
10229 let mut target_row = None;
10230 for (row_idx, (bbox, _)) in row_bands.iter().enumerate() {
10231 if (bbox.center_y() - line_center).abs() <= tolerance {
10232 target_row = Some(row_idx);
10233 break;
10234 }
10235 }
10236
10237 if let Some(row_idx) = target_row {
10238 let (bbox, cells) = &mut row_bands[row_idx];
10239 *bbox = bbox.union(&line.bbox);
10240 for fragment in fragments {
10241 append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
10242 }
10243 } else {
10244 let mut cells = vec![String::new(); slot_ranges.len()];
10245 for fragment in fragments {
10246 append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
10247 }
10248 row_bands.push((line.bbox.clone(), cells));
10249 }
10250 }
10251 }
10252
10253 row_bands.sort_by(|left, right| {
10254 right
10255 .0
10256 .top_y
10257 .partial_cmp(&left.0.top_y)
10258 .unwrap_or(std::cmp::Ordering::Equal)
10259 });
10260
10261 row_bands
10262 .into_iter()
10263 .map(|(_, cells)| cells)
10264 .filter(|cells| {
10265 let filled = cells.iter().filter(|cell| !cell.trim().is_empty()).count();
10266 filled >= min_filled_slots
10267 })
10268 .collect()
10269}
10270
10271fn extract_chunk_lines(element: &ContentElement) -> Vec<ChunkLine> {
10272 match element {
10273 ContentElement::Paragraph(p) => chunk_lines_from_semantic_node(&p.base),
10274 ContentElement::Heading(h) => chunk_lines_from_semantic_node(&h.base.base),
10275 ContentElement::NumberHeading(nh) => chunk_lines_from_semantic_node(&nh.base.base.base),
10276 ContentElement::TextBlock(tb) => tb
10277 .text_lines
10278 .iter()
10279 .map(|line| ChunkLine {
10280 bbox: line.bbox.clone(),
10281 chunks: line.text_chunks.clone(),
10282 })
10283 .collect(),
10284 ContentElement::TextLine(tl) => vec![ChunkLine {
10285 bbox: tl.bbox.clone(),
10286 chunks: tl.text_chunks.clone(),
10287 }],
10288 _ => Vec::new(),
10289 }
10290}
10291
10292fn chunk_lines_from_semantic_node(node: &SemanticTextNode) -> Vec<ChunkLine> {
10293 let mut lines = Vec::new();
10294 for column in &node.columns {
10295 for block in &column.text_blocks {
10296 for line in &block.text_lines {
10297 lines.push(ChunkLine {
10298 bbox: line.bbox.clone(),
10299 chunks: line.text_chunks.clone(),
10300 });
10301 }
10302 }
10303 }
10304 lines
10305}
10306
10307fn split_line_into_slot_fragments(
10308 line: &ChunkLine,
10309 slot_ranges: &[(f64, f64)],
10310) -> Vec<SlotFragment> {
10311 let mut groups: Vec<(usize, Vec<TextChunk>, BoundingBox)> = Vec::new();
10312
10313 for chunk in line
10314 .chunks
10315 .iter()
10316 .filter(|chunk| !chunk.value.trim().is_empty())
10317 .cloned()
10318 {
10319 let slot_idx = assign_chunk_to_slot(&chunk.bbox, slot_ranges);
10320 if let Some((prev_slot, prev_chunks, prev_bbox)) = groups.last_mut() {
10321 let gap = chunk.bbox.left_x - prev_bbox.right_x;
10322 if *prev_slot == slot_idx && gap <= chunk.font_size.max(6.0) * 2.4 {
10323 *prev_bbox = prev_bbox.union(&chunk.bbox);
10324 prev_chunks.push(chunk);
10325 continue;
10326 }
10327 }
10328 groups.push((slot_idx, vec![chunk.clone()], chunk.bbox.clone()));
10329 }
10330
10331 groups
10332 .into_iter()
10333 .filter_map(|(slot_idx, chunks, bbox)| {
10334 let text = normalize_common_ocr_text(
10335 &crate::models::text::TextLine::concatenate_chunks(&chunks),
10336 );
10337 if text.trim().is_empty() {
10338 None
10339 } else {
10340 Some(SlotFragment {
10341 slot_idx,
10342 bbox,
10343 text,
10344 })
10345 }
10346 })
10347 .collect()
10348}
10349
10350fn assign_chunk_to_slot(bbox: &BoundingBox, slot_ranges: &[(f64, f64)]) -> usize {
10351 let mut best_idx = 0usize;
10352 let mut best_overlap = f64::NEG_INFINITY;
10353 let center_x = bbox.center_x();
10354
10355 for (idx, (left, right)) in slot_ranges.iter().enumerate() {
10356 let overlap = (bbox.right_x.min(*right) - bbox.left_x.max(*left)).max(0.0);
10357 let score = if overlap > 0.0 {
10358 overlap / bbox.width().max(1.0)
10359 } else {
10360 -((center_x - ((*left + *right) / 2.0)).abs())
10361 };
10362 if score > best_overlap {
10363 best_overlap = score;
10364 best_idx = idx;
10365 }
10366 }
10367
10368 best_idx
10369}
10370
/// Appends the trimmed `fragment` to `cell`.
///
/// A single space is inserted first when `cell` already holds any text.
/// Fragments that are blank after trimming leave `cell` untouched.
fn append_cell_text(cell: &mut String, fragment: &str) {
    match (cell.is_empty(), fragment.trim()) {
        (_, "") => {}
        (true, piece) => cell.push_str(piece),
        (false, piece) => {
            cell.push(' ');
            cell.push_str(piece);
        }
    }
}
10381
/// Moves a stub label from the second row into an empty header stub cell.
///
/// The move happens only when all of the following hold:
/// * there are at least two rows and neither of the first two is empty,
/// * the first cell of row 0 is blank and the first cell of row 1 is not,
/// * both rows have at least two non-blank cells after the first one.
///
/// On success the trimmed label is written to `rows[0][0]` and `rows[1][0]`
/// is cleared. Otherwise `rows` is left unchanged.
fn normalize_leading_stub_header(rows: &mut [Vec<String>]) {
    // Counts the non-blank cells that follow the stub column.
    fn filled_after_stub(row: &[String]) -> usize {
        row.iter()
            .skip(1)
            .filter(|cell| !cell.trim().is_empty())
            .count()
    }

    let [header, second, ..] = rows else {
        return;
    };
    let (Some(header_stub), Some(second_stub)) = (header.first(), second.first()) else {
        return;
    };
    if !header_stub.trim().is_empty() || second_stub.trim().is_empty() {
        return;
    }
    if filled_after_stub(header.as_slice()) < 2 || filled_after_stub(second.as_slice()) < 2 {
        return;
    }

    header[0] = second[0].trim().to_string();
    second[0].clear();
}
10408
/// Copies a short stub label from the first body row into an empty header
/// stub cell.
///
/// The first header row receives the trimmed first cell of the first body row
/// when:
/// * both rows exist and are non-empty,
/// * the header's first cell is blank,
/// * the label is non-blank, has at most three words and at most 24 bytes,
/// * every cell after the first is non-blank in both rows.
///
/// The body row itself is never modified.
fn promote_embedded_stub_header(header_rows: &mut [Vec<String>], table_rows: &[Vec<String>]) {
    // True when every cell to the right of the stub column has content.
    fn fully_populated(row: &[String]) -> bool {
        row.iter().skip(1).all(|cell| !cell.trim().is_empty())
    }

    let (Some(header), Some(body)) = (header_rows.first_mut(), table_rows.first()) else {
        return;
    };
    let (Some(header_stub), Some(body_stub)) = (header.first(), body.first()) else {
        return;
    };
    if !header_stub.trim().is_empty() {
        return;
    }

    let candidate = body_stub.trim();
    let too_long = candidate.split_whitespace().count() > 3 || candidate.len() > 24;
    if candidate.is_empty() || too_long {
        return;
    }

    if !(fully_populated(header.as_slice()) && fully_populated(body.as_slice())) {
        return;
    }

    header[0] = candidate.to_string();
}
10446
/// Reports whether `rows` should be merged as panel body rows: there must be
/// at least three rows, none of them empty, and no cell may be blank.
fn should_merge_panel_body_rows(rows: &[Vec<String>]) -> bool {
    if rows.len() < 3 {
        return false;
    }
    let has_gap = rows
        .iter()
        .any(|row| row.is_empty() || row.iter().any(|cell| cell.trim().is_empty()));
    !has_gap
}
10453
10454fn merge_panel_body_row(
10455 table_rows: &[Vec<String>],
10456 trailing_rows: &[Vec<String>],
10457 slot_count: usize,
10458) -> Vec<String> {
10459 let mut merged = vec![String::new(); slot_count];
10460 for row in table_rows {
10461 for (col_idx, cell) in row.iter().enumerate() {
10462 if col_idx + 1 >= slot_count {
10463 break;
10464 }
10465 append_cell_text(&mut merged[col_idx + 1], cell);
10466 }
10467 }
10468 for row in trailing_rows {
10469 for (col_idx, cell) in row.iter().enumerate() {
10470 if col_idx >= slot_count {
10471 break;
10472 }
10473 append_cell_text(&mut merged[col_idx], cell);
10474 }
10475 }
10476 merged
10477}
10478
/// Renders rows as a Markdown pipe table.
///
/// The column count is the length of the longest row; shorter rows are padded
/// with empty cells. The first row is treated as the header and is followed by
/// a `---` separator line. Cell text is trimmed. The output ends with a blank
/// line. An empty string is returned when there are no rows or no columns.
fn render_pipe_rows(rows: &[Vec<String>]) -> String {
    let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
    if num_cols == 0 {
        return String::new();
    }

    let separator = format!("|{}\n", " --- |".repeat(num_cols));

    let mut out = String::new();
    for (row_idx, row) in rows.iter().enumerate() {
        out.push('|');
        let padded_cells = row
            .iter()
            .map(String::as_str)
            .chain(std::iter::repeat(""))
            .take(num_cols);
        for cell in padded_cells {
            out.push(' ');
            out.push_str(cell.trim());
            out.push_str(" |");
        }
        out.push('\n');

        if row_idx == 0 {
            out.push_str(&separator);
        }
    }
    out.push('\n');
    out
}
10509
10510fn render_html_table(rows: &[Vec<String>]) -> String {
10511 if rows.is_empty() {
10512 return String::new();
10513 }
10514
10515 let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
10516 if num_cols == 0 {
10517 return String::new();
10518 }
10519
10520 let mut out = String::from("<table>\n");
10521 for (row_idx, row) in rows.iter().enumerate() {
10522 out.push_str("<tr>");
10523 for col_idx in 0..num_cols {
10524 let cell = escape_html_text(row.get(col_idx).map(String::as_str).unwrap_or("").trim());
10525 if row_idx == 0 {
10526 out.push_str("<th>");
10527 out.push_str(&cell);
10528 out.push_str("</th>");
10529 } else {
10530 out.push_str("<td>");
10531 out.push_str(&cell);
10532 out.push_str("</td>");
10533 }
10534 }
10535 out.push_str("</tr>\n");
10536 }
10537 out.push_str("</table>\n\n");
10538 out
10539}
10540
/// Escapes the characters that are significant in HTML text and attribute
/// values: `&`, `<`, `>`, `"` and `'`.
///
/// The input is scanned once, so an ampersand that is already part of the
/// input is escaped exactly one time and entities produced here are never
/// re-escaped.
fn escape_html_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
10548
/// Extracts the ASCII digits of `text` as a marker string.
///
/// Returns `Some(digits)` when the text contains one or two ASCII digits in
/// total (all other characters are ignored), and `None` otherwise.
fn normalized_numeric_marker(text: &str) -> Option<String> {
    let digits: String = text.chars().filter(char::is_ascii_digit).collect();
    match digits.len() {
        1 | 2 => Some(digits),
        _ => None,
    }
}
10556
10557fn render_infographic_card_rows(rows: &[Vec<String>]) -> Option<String> {
10558 if rows.is_empty() || !rows.iter().all(|row| row.len() == 2) {
10559 return None;
10560 }
10561
10562 let marker = normalized_numeric_marker(rows[0][0].trim())?;
10563 if rows[0][1].split_whitespace().count() < 4 {
10564 return None;
10565 }
10566 if rows
10567 .iter()
10568 .skip(1)
10569 .any(|row| normalized_numeric_marker(row[0].trim()).is_some())
10570 {
10571 return None;
10572 }
10573 if rows
10574 .iter()
10575 .skip(1)
10576 .any(|row| !row[0].trim().is_empty() && row[0].trim().len() > 2)
10577 {
10578 return None;
10579 }
10580
10581 let body = rows
10582 .iter()
10583 .filter_map(|row| row.get(1))
10584 .map(|cell| cell.trim())
10585 .filter(|cell| !cell.is_empty())
10586 .collect::<Vec<_>>()
10587 .join(" ");
10588 if body.split_whitespace().count() < 8 {
10589 return None;
10590 }
10591
10592 Some(format!("{marker}. {body}\n\n"))
10593}
10594
10595fn extract_element_text(element: &ContentElement) -> String {
10596 match element {
10597 ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
10598 ContentElement::Heading(h) => clean_paragraph_text(&h.base.base.value()),
10599 ContentElement::NumberHeading(nh) => clean_paragraph_text(&nh.base.base.base.value()),
10600 ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
10601 ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
10602 _ => String::new(),
10603 }
10604}
10605
10606fn collect_table_border_rows(table: &crate::models::table::TableBorder) -> Vec<Vec<String>> {
10608 let num_cols = table.num_columns.max(1);
10609 let mut rendered_rows: Vec<Vec<String>> = Vec::new();
10610 for row in &table.rows {
10611 let cell_texts: Vec<String> = (0..num_cols)
10612 .map(|col| {
10613 row.cells
10614 .iter()
10615 .find(|c| c.col_number == col)
10616 .map(cell_text_content)
10617 .unwrap_or_default()
10618 })
10619 .collect();
10620 if !cell_texts.iter().all(|t| t.trim().is_empty()) {
10621 rendered_rows.push(cell_texts);
10622 }
10623 }
10624 rendered_rows
10625}
10626
10627fn render_table_border(out: &mut String, table: &crate::models::table::TableBorder) {
10633 if table.rows.is_empty() {
10634 return;
10635 }
10636
10637 let mut rendered_rows = collect_table_border_rows(table);
10639
10640 if rendered_rows.is_empty() {
10641 return;
10642 }
10643
10644 if let Some(rendered) = render_infographic_card_rows(&rendered_rows) {
10645 out.push_str(&rendered);
10646 return;
10647 }
10648
10649 merge_continuation_rows(&mut rendered_rows);
10651 trim_leading_table_carryover_rows(&mut rendered_rows);
10652
10653 if is_toc_table(&rendered_rows) {
10655 render_toc_rows(out, &rendered_rows);
10656 return;
10657 }
10658
10659 out.push_str(&render_pipe_rows(&rendered_rows));
10660}
10661
/// Reports whether the trimmed text looks like a page number.
///
/// Accepted forms are up to five ASCII digits, or up to ten characters drawn
/// only from the roman-numeral letters `i v x l c d m` (case-insensitive).
/// The letters are not checked for forming a valid numeral. Blank text is
/// rejected.
fn is_page_number_like(text: &str) -> bool {
    let candidate = text.trim();
    if candidate.is_empty() {
        return false;
    }

    let is_arabic = candidate.len() <= 5 && candidate.bytes().all(|byte| byte.is_ascii_digit());
    let is_roman = candidate.len() <= 10
        && candidate.chars().all(|ch| {
            matches!(
                ch.to_ascii_lowercase(),
                'i' | 'v' | 'x' | 'l' | 'c' | 'd' | 'm'
            )
        });

    is_arabic || is_roman
}
10679
10680fn is_toc_table(rows: &[Vec<String>]) -> bool {
10683 if rows.is_empty() {
10684 return false;
10685 }
10686 if rows.len() < 2 {
10688 return false;
10689 }
10690 if !rows.iter().all(|r| r.len() == 2) {
10692 return false;
10693 }
10694
10695 let non_empty_right = rows.iter().filter(|r| !r[1].trim().is_empty()).count();
10696 if non_empty_right < 2 {
10697 return false;
10698 }
10699
10700 let page_like = rows.iter().filter(|r| is_page_number_like(&r[1])).count();
10701 page_like >= 2 && page_like * 10 >= non_empty_right * 9 && page_like * 2 >= rows.len()
10702}
10703
/// Appends table-of-contents rows to `out`, one `title page` line per row,
/// followed by a blank line.
///
/// Both cells are trimmed. Rows where both are blank are skipped; when only
/// one of them has text it is written alone, without a separating space.
///
/// Each row is indexed at positions 0 and 1, so rows with fewer than two
/// cells cause a panic.
fn render_toc_rows(out: &mut String, rows: &[Vec<String>]) {
    for row in rows {
        let (title, page) = (row[0].trim(), row[1].trim());
        if title.is_empty() && page.is_empty() {
            continue;
        }

        out.push_str(title);
        if !title.is_empty() && !page.is_empty() {
            out.push(' ');
        }
        out.push_str(page);
        out.push('\n');
    }
    out.push('\n');
}
10724
10725fn cell_text_content(cell: &crate::models::table::TableBorderCell) -> String {
10727 if !cell.content.is_empty() {
10731 let chunks: Vec<_> = cell.content.iter().map(|t| t.base.clone()).collect();
10732 return normalize_common_ocr_text(&crate::models::text::TextLine::concatenate_chunks(
10733 &chunks,
10734 ));
10735 }
10736 let mut text = String::new();
10738 for elem in &cell.contents {
10739 match elem {
10740 ContentElement::Paragraph(p) => text.push_str(&p.base.value()),
10741 ContentElement::TextBlock(tb) => text.push_str(&tb.value()),
10742 ContentElement::TextLine(tl) => text.push_str(&tl.value()),
10743 ContentElement::TextChunk(tc) => text.push_str(&tc.value),
10744 _ => {}
10745 }
10746 }
10747 normalize_common_ocr_text(&repair_fragmented_words(&text))
10748}
10749
/// Merges neighbouring Markdown pipe tables that appear to be pieces of one
/// table.
///
/// A table block is a pipe row followed by a separator row and any number of
/// further pipe rows. Two consecutive blocks are merged when the lines
/// between them are
/// * all blank, or
/// * (both sides having at least two columns) one or two short `#` headings,
///   or a single short text fragment,
/// and the second block does not carry a header row that is clearly different
/// from the first block's header.
///
/// In a merged group the later separators and blank gap lines are dropped,
/// non-blank gap lines become single-cell table rows, and every row is padded
/// to the widest column count of the group.
///
/// Input with fewer than four lines or fewer than two table blocks is
/// returned unchanged. Otherwise the text is re-emitted line by line, each
/// line terminated by `\n`, with separators written as `| --- |` cells.
fn merge_adjacent_pipe_tables(markdown: &str) -> String {
    // Raw text between the outer pipes of a line, or `None` when the trimmed
    // line does not both start and end with a pipe.
    fn interior_cells(line: &str) -> Option<Vec<&str>> {
        let trimmed = line.trim();
        if !(trimmed.starts_with('|') && trimmed.ends_with('|')) {
            return None;
        }
        let mut cells: Vec<&str> = trimmed.split('|').skip(1).collect();
        cells.pop();
        Some(cells)
    }

    fn count_pipe_cols(line: &str) -> usize {
        interior_cells(line).map_or(0, |cells| cells.len())
    }

    // A separator has at least one cell and only `-` / `:` inside every cell.
    fn is_separator(line: &str) -> bool {
        interior_cells(line).is_some_and(|cells| {
            !cells.is_empty()
                && cells.iter().all(|cell| {
                    let mark = cell.trim();
                    !mark.is_empty() && mark.chars().all(|ch| matches!(ch, '-' | ':'))
                })
        })
    }

    fn is_pipe_row(line: &str) -> bool {
        let trimmed = line.trim();
        trimmed.len() > 2 && trimmed.starts_with('|') && trimmed.ends_with('|')
    }

    // Header cells reduced to lowercase alphanumerics, for comparison.
    fn normalized_header_cells(line: &str) -> Vec<String> {
        if !is_pipe_row(line) {
            return Vec::new();
        }
        interior_cells(line)
            .unwrap_or_default()
            .into_iter()
            .map(|cell| {
                cell.trim()
                    .chars()
                    .filter(|ch| ch.is_alphanumeric())
                    .flat_map(char::to_lowercase)
                    .collect()
            })
            .collect()
    }

    // At least two non-blank cells, all short and containing a letter.
    fn looks_like_header_row(line: &str) -> bool {
        if !is_pipe_row(line) {
            return false;
        }
        let populated: Vec<&str> = interior_cells(line)
            .unwrap_or_default()
            .into_iter()
            .map(str::trim)
            .filter(|cell| !cell.is_empty())
            .collect();
        populated.len() >= 2
            && populated.iter().all(|cell| {
                cell.chars().any(char::is_alphabetic)
                    && cell.split_whitespace().count() <= 4
                    && cell.len() <= 28
            })
    }

    // Share of aligned cells (over the shorter row) that are non-empty and equal.
    fn header_overlap_ratio(left: &str, right: &str) -> f64 {
        let lhs = normalized_header_cells(left);
        let rhs = normalized_header_cells(right);
        let width = lhs.len().min(rhs.len());
        if width == 0 {
            return 0.0;
        }
        let agreeing = lhs
            .iter()
            .zip(&rhs)
            .filter(|(a, b)| !a.is_empty() && a == b)
            .count();
        agreeing as f64 / width as f64
    }

    // Same width (>= 2), and at least two aligned non-empty pairs, all equal.
    fn header_schema_matches(left: &str, right: &str) -> bool {
        let lhs = normalized_header_cells(left);
        let rhs = normalized_header_cells(right);
        if lhs.len() != rhs.len() || lhs.len() < 2 {
            return false;
        }
        let comparable: Vec<(&String, &String)> = lhs
            .iter()
            .zip(&rhs)
            .filter(|(a, b)| !a.is_empty() && !b.is_empty())
            .collect();
        comparable.len() >= 2 && comparable.iter().all(|(a, b)| a == b)
    }

    fn pad_pipe_row(line: &str, target_cols: usize) -> String {
        let trimmed = line.trim();
        let missing = target_cols.saturating_sub(count_pipe_cols(trimmed));
        format!("{trimmed}{}", " |".repeat(missing))
    }

    struct TableBlock {
        header: usize,
        separator: usize,
        last: usize,
        cols: usize,
    }

    // How a single input line is emitted; the payload is the column target.
    #[derive(Clone, Copy)]
    enum LineRole {
        Verbatim,
        Dropped,
        GapText(usize),
        TableLine(usize),
    }

    let lines: Vec<&str> = markdown.lines().collect();
    if lines.len() < 4 {
        return markdown.to_string();
    }

    // Pass 1: locate table blocks (header row, separator, body rows).
    let mut blocks: Vec<TableBlock> = Vec::new();
    let mut cursor = 0;
    while cursor < lines.len() {
        let starts_table = cursor + 1 < lines.len()
            && is_pipe_row(lines[cursor])
            && is_separator(lines[cursor + 1]);
        if !starts_table {
            cursor += 1;
            continue;
        }
        let separator = cursor + 1;
        let body_len = lines[separator + 1..]
            .iter()
            .take_while(|line| is_pipe_row(line) && !is_separator(line))
            .count();
        let last = separator + body_len;
        blocks.push(TableBlock {
            header: cursor,
            separator,
            last,
            cols: count_pipe_cols(lines[cursor]),
        });
        cursor = last + 1;
    }

    if blocks.len() < 2 {
        return markdown.to_string();
    }

    // Pass 2: decide which blocks join the group of their predecessor.
    let mut merge_leader: Vec<Option<usize>> = vec![None; blocks.len()];
    let mut group_cols: Vec<usize> = blocks.iter().map(|block| block.cols).collect();
    for bi in 1..blocks.len() {
        let (prev, curr) = (&blocks[bi - 1], &blocks[bi]);
        let leader_idx = merge_leader[bi - 1].unwrap_or(bi - 1);
        let both_multi_column = group_cols[leader_idx] >= 2 && curr.cols >= 2;

        let gap_text: Vec<&str> = lines[prev.last + 1..curr.header]
            .iter()
            .map(|line| line.trim())
            .filter(|line| !line.is_empty())
            .collect();
        let headings_only = gap_text.len() <= 2
            && gap_text
                .iter()
                .all(|text| text.starts_with('#') && text.len() < 100);
        let short_fragment = match gap_text.as_slice() {
            [text] => {
                text.len() < 30
                    && !text.starts_with('#')
                    && !text.starts_with('-')
                    && !text.starts_with('*')
                    && !text.contains(':')
                    && !text.contains("TABLE")
            }
            _ => false,
        };
        let mergeable_gap =
            gap_text.is_empty() || (both_multi_column && (headings_only || short_fragment));

        let prev_has_header = looks_like_header_row(lines[prev.header]);
        // The current block only counts as having a header when at least two
        // lines follow its separator.
        let curr_has_header =
            curr.last >= curr.separator + 2 && looks_like_header_row(lines[curr.header]);
        let curr_has_distinct_header = prev_has_header
            && curr_has_header
            && !header_schema_matches(lines[prev.header], lines[curr.header])
            && (curr.cols != prev.cols
                || header_overlap_ratio(lines[prev.header], lines[curr.header]) < 1.0);

        if mergeable_gap && prev.cols > 0 && curr.cols > 0 && !curr_has_distinct_header {
            merge_leader[bi] = Some(leader_idx);
            group_cols[leader_idx] = group_cols[leader_idx].max(curr.cols);
        }
    }

    let pad_target: Vec<usize> = (0..blocks.len())
        .map(|bi| group_cols[merge_leader[bi].unwrap_or(bi)])
        .collect();

    // Pass 3: assign an output role to every line.
    let mut roles = vec![LineRole::Verbatim; lines.len()];
    for (bi, block) in blocks.iter().enumerate() {
        roles[block.header..=block.last].fill(LineRole::TableLine(pad_target[bi]));
    }
    for (bi, block) in blocks.iter().enumerate() {
        if merge_leader[bi].is_none() {
            continue;
        }
        // A merged block always has a predecessor, so `bi >= 1` here.
        let gap_start = blocks[bi - 1].last + 1;
        for li in gap_start..block.header {
            roles[li] = if lines[li].trim().is_empty() {
                LineRole::Dropped
            } else {
                LineRole::GapText(pad_target[bi - 1])
            };
        }
        roles[block.separator] = LineRole::Dropped;
    }

    // Pass 4: emit.
    let mut result = String::new();
    for (line, role) in lines.iter().zip(&roles) {
        match *role {
            LineRole::Dropped => continue,
            LineRole::GapText(cols) => {
                let text = line.trim().trim_start_matches('#').trim();
                if cols > 0 && !text.is_empty() {
                    result.push_str("| ");
                    result.push_str(text);
                    result.push(' ');
                    result.push_str(&"| ".repeat(cols - 1));
                    result.push('|');
                } else {
                    result.push_str(line);
                }
            }
            LineRole::TableLine(cols) if cols > 0 && is_separator(line) => {
                result.push('|');
                result.push_str(&" --- |".repeat(cols));
            }
            LineRole::TableLine(cols) if cols > 0 && is_pipe_row(line) => {
                result.push_str(&pad_pipe_row(line, cols));
            }
            LineRole::TableLine(_) | LineRole::Verbatim => result.push_str(line),
        }
        result.push('\n');
    }

    result
}
11103
11104#[cfg(test)]
11105mod tests {
11106 use super::*;
11107 use crate::models::bbox::BoundingBox;
11108 use crate::models::chunks::TextChunk;
11109 use crate::models::content::ContentElement;
11110 use crate::models::enums::{PdfLayer, TextFormat, TextType};
11111 use crate::models::list::{ListBody, ListItem, ListLabel, PDFList};
11112 use crate::models::semantic::{SemanticHeading, SemanticParagraph, SemanticTextNode};
11113 use crate::models::table::{
11114 TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
11115 };
11116 use crate::models::text::{TextBlock, TextColumn, TextLine};
11117
11118 #[test]
11119 fn test_empty_doc() {
11120 let doc = PdfDocument::new("test.pdf".to_string());
11121 let md = to_markdown(&doc).unwrap();
11122 assert!(md.contains("No content extracted"));
11123 }
11124
11125 #[test]
11126 fn test_with_title() {
11127 let mut doc = PdfDocument::new("test.pdf".to_string());
11128 doc.title = Some("My Title".to_string());
11129 let md = to_markdown(&doc).unwrap();
11130 assert!(md.starts_with("# My Title\n"));
11131 }
11132
11133 #[test]
11134 fn test_empty_title_not_rendered() {
11135 let mut doc = PdfDocument::new("test.pdf".to_string());
11136 doc.title = Some(" ".to_string());
11137 let md = to_markdown(&doc).unwrap();
11138 assert!(
11139 !md.contains("# "),
11140 "Empty/whitespace title should not produce a heading"
11141 );
11142 }
11143
11144 #[test]
11145 fn test_repair_fragmented_words() {
11146 assert_eq!(
11147 repair_fragmented_words("Jurisdic tion Fore ign Req uire me nts"),
11148 "Jurisdiction Foreign Requirements"
11149 );
11150 }
11151
11152 #[test]
11153 fn test_normalize_common_ocr_text_repairs_units() {
11154 assert_eq!(
11155 normalize_common_ocr_text("10 ߤL at 37 C and -20 oC"),
11156 "10 μL at 37°C and -20°C"
11157 );
11158 }
11159
11160 #[cfg(not(target_arch = "wasm32"))]
11161 #[test]
11162 fn test_build_layout_anchor_rows_reconstructs_four_column_matrix() {
11163 let lines = vec![
11164 "Key Functions by Main Service Flow".to_string(),
11165 "".to_string(),
11166 " Service Stage Function Name Explanation Expected Benefit".to_string(),
11167 "".to_string(),
11168 " 1. Project creation Project creation and Select document type to automatically run project creation, Pipeline configuration with The intuitive UI environment allows the the person in charge to quickly proceed with".to_string(),
11169 "".to_string(),
11170 " management recommended Modelset and Endpoint deployment the entire process from project creation to deployment, improving work efficiency".to_string(),
11171 "".to_string(),
11172 " Conveniently manage raw data to be used for OCR Pack and actual date from live".to_string(),
11173 " 2. Data labeling and Data storage management Provides convenient functions for uploading raw data, viewer, and data management".to_string(),
11174 " (search using image metadata, sorting, filtering, hashtags settings on image data) service".to_string(),
11175 " fine-tuning".to_string(),
11176 " Image data bookmark for Qualitative Evaluation".to_string(),
11177 "".to_string(),
11178 " Create and manage Labeling Creating a Labeling Space to manage raw data annotation, managing labeling resources Labeling work can be outsourced within the pack. Labeled data is continuously".to_string(),
11179 " (Ontology, Characters to be Recognized), data set dump, data set version management supplied from which data sets can be created with ease. The Auto Labeling function".to_string(),
11180 " Space".to_string(),
11181 " 3 increases both efficiency and convenience.".to_string(),
11182 " Various basic models for each selected 5".to_string(),
11183 " document, information comparison between".to_string(),
11184 " Model training Providing a foundation for customers to implement, manage, and upgrade their own".to_string(),
11185 " models, basic model training, training pause function, re-training, cancel function, and OCR model specialized to the customers’ needs".to_string(),
11186 " configuration support for Characters to be Recognized and Ontology that is frequently".to_string(),
11187 " modified while developing specialized models".to_string(),
11188 ];
11189
11190 let header = find_layout_header_candidate(&lines).unwrap();
11191 let rows =
11192 build_layout_anchor_rows(&lines, &extract_layout_entries(&lines, &header)).unwrap();
11193
11194 assert_eq!(
11195 header.headers,
11196 vec![
11197 "Service Stage".to_string(),
11198 "Function Name".to_string(),
11199 "Explanation".to_string(),
11200 "Expected Benefit".to_string()
11201 ]
11202 );
11203 assert_eq!(rows.len(), 4);
11204 assert_eq!(rows[0][0], "1. Project creation");
11205 assert_eq!(rows[0][1], "Project creation and management");
11206 assert!(rows[1][0].contains("fine-tuning"));
11207 assert_eq!(rows[2][1], "Create and manage Labeling Space");
11208 assert_eq!(rows[3][1], "Model training");
11209 assert!(rows[3][2].contains("Various basic models for each selected document"));
11210 }
11211
11212 #[cfg(not(target_arch = "wasm32"))]
11213 #[test]
11214 fn test_build_layout_panel_stub_rows_reconstructs_left_stub_table() {
11215 let lines = vec![
11216 "AI Pack".to_string(),
11217 "Upstage offers 3 AI packs that process unstructured information and data".to_string(),
11218 "".to_string(),
11219 " OCR Recommendation Product semantic search".to_string(),
11220 "".to_string(),
11221 " A solution that recognizes characters in an A solution that recommends the best products and A solution that enables semantic search, analyzes and".to_string(),
11222 " image and extracts necessary information contents organizes key information in unstructured text data".to_string(),
11223 " Pack".to_string(),
11224 " into a standardized form (DB)".to_string(),
11225 "".to_string(),
11226 " Applicable to all fields that require text extraction Applicable to all fields that use any form of Applicable to all fields that deal with various types of".to_string(),
11227 " from standardized documents, such as receipts, recommendation including alternative products, unstructured data containing text information that".to_string(),
11228 "Application bills, credit cards, ID cards, certificates, and medical products and contents that are likely to be require semantic search and conversion into a DB".to_string(),
11229 " receipts purchased next".to_string(),
11230 "".to_string(),
11231 " Achieved 1st place in the OCR World Competition Team with specialists and technologies that Creation of the first natural language evaluation".to_string(),
11232 " The team includes specialists who have received Kaggle’s Gold Medal recommendation system in Korean (KLUE)".to_string(),
11233 " presented 14 papers in the world’s most (Education platform) World’s No.1 in Kaggle text embedding competition in".to_string(),
11234 " Highlight".to_string(),
11235 " renowned AI conferences Proven superior performance of more than 170% E-commerce subject (Shopee)".to_string(),
11236 " compared to other global top-tier recommendation".to_string(),
11237 " models".to_string(),
11238 ];
11239
11240 let header = find_layout_panel_header_candidate(&lines).unwrap();
11241 let rows = build_layout_panel_stub_rows(&lines, &header).unwrap();
11242
11243 assert_eq!(
11244 header.headers,
11245 vec![
11246 "OCR".to_string(),
11247 "Recommendation".to_string(),
11248 "Product semantic search".to_string()
11249 ]
11250 );
11251 assert_eq!(rows.len(), 3);
11252 assert_eq!(rows[0][0], "Pack");
11253 assert!(rows[0][1].contains("image and extracts necessary information"));
11254 assert_eq!(rows[1][0], "Application");
11255 assert!(rows[1][3].contains("require semantic search and conversion into a DB"));
11256 assert_eq!(rows[2][0], "Highlight");
11257 assert!(rows[2][2].contains("top-tier recommendation models"));
11258 }
11259
/// A table-of-contents entry whose title wraps onto a second line
/// ("... and Election 18" followed by "Campaign") must come back as one
/// entry that keeps the page number found on the first line.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_extract_layout_toc_entries_merges_wrapped_entry() {
    let lines: Vec<String> = [
        "Table of Contents",
        "",
        "Executive Summary 4",
        "Legal Framework 6",
        "Election Administration 11",
        "Civil Society Engagement 15",
        "Political Parties, Candidates Registration and Election 18",
        "Campaign",
        "Media Freedom and Access to Information 25",
        "Voter Education and Awareness 29",
        "Participation of Marginalized Sectors 31",
        "Recommendations 39",
    ]
    .iter()
    .map(|raw| raw.to_string())
    .collect();

    let (title, entries) = extract_layout_toc_entries(&lines).unwrap();
    assert_eq!(title, "Table of Contents");
    // Ten page-numbered/continuation lines collapse into nine entries.
    assert_eq!(entries.len(), 9);

    let first = &entries[0];
    assert_eq!(first.title, "Executive Summary");
    assert_eq!(first.page, "4");

    // The wrapped entry is the fifth one.
    let wrapped = &entries[4];
    assert_eq!(
        wrapped.title,
        "Political Parties, Candidates Registration and Election Campaign"
    );
    assert_eq!(wrapped.page, "18");
}
11289
11290 #[cfg(not(target_arch = "wasm32"))]
11291 fn make_bbox_layout_line(words: &[(&str, f64, f64)], bottom: f64, top: f64) -> BBoxLayoutLine {
11292 make_bbox_layout_line_in_block(0, words, bottom, top)
11293 }
11294
11295 #[cfg(not(target_arch = "wasm32"))]
11296 fn make_bbox_layout_line_in_block(
11297 block_id: usize,
11298 words: &[(&str, f64, f64)],
11299 bottom: f64,
11300 top: f64,
11301 ) -> BBoxLayoutLine {
11302 BBoxLayoutLine {
11303 block_id,
11304 bbox: BoundingBox::new(
11305 Some(1),
11306 words.first().map(|(_, left, _)| *left).unwrap_or(72.0),
11307 bottom,
11308 words.last().map(|(_, _, right)| *right).unwrap_or(320.0),
11309 top,
11310 ),
11311 words: words
11312 .iter()
11313 .map(|(text, left, right)| BBoxLayoutWord {
11314 bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
11315 text: (*text).to_string(),
11316 })
11317 .collect(),
11318 }
11319 }
11320
/// Feeds `detect_layout_open_plate` a heading, four species rows (three of
/// them split into separate left/right line fragments, one given as a single
/// line) and a "Table 6.1" caption, then checks the recovered heading,
/// header row, row count, second row and caption prefix.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_detect_layout_open_plate_recovers_two_column_species_rows() {
    let page_width = 576.0;

    let heading_line = make_bbox_layout_line(
        &[
            ("Fish", 60.0, 76.0),
            ("species", 78.0, 107.0),
            ("on", 109.0, 119.0),
            ("IUCN", 121.0, 142.0),
            ("Red", 144.0, 159.0),
            ("List", 161.0, 176.0),
        ],
        649.0,
        660.0,
    );

    // Row 1: common name and scientific name arrive as two fragments.
    let potosi_common = make_bbox_layout_line(
        &[("Potosi", 60.0, 84.0), ("Pupfish", 86.0, 114.0)],
        632.0,
        643.0,
    );
    let potosi_scientific = make_bbox_layout_line(
        &[("Cyprinodon", 132.0, 176.0), ("alvarezi", 178.0, 207.0)],
        632.0,
        643.0,
    );

    // Row 2: both columns arrive on a single line.
    let la_palma_row = make_bbox_layout_line(
        &[
            ("La", 60.0, 69.0),
            ("Palma", 71.0, 94.0),
            ("Pupfish", 96.0, 124.0),
            ("Cyprinodon", 132.0, 176.0),
            ("longidorsalis", 178.0, 224.0),
        ],
        616.0,
        627.0,
    );

    // Row 3.
    let splitfin_common = make_bbox_layout_line(
        &[("Butterfly", 60.0, 94.0), ("Splitfin", 96.0, 123.0)],
        600.0,
        611.0,
    );
    let splitfin_scientific = make_bbox_layout_line(
        &[("Ameca", 132.0, 156.0), ("splendens", 158.0, 194.0)],
        600.0,
        611.0,
    );

    // Row 4.
    let skiffia_common = make_bbox_layout_line(
        &[("Golden", 60.0, 88.0), ("Skiffia", 90.0, 113.0)],
        584.0,
        595.0,
    );
    let skiffia_scientific = make_bbox_layout_line(
        &[("Skiffia", 132.0, 155.0), ("francesae", 158.0, 193.0)],
        584.0,
        595.0,
    );

    let caption_line = make_bbox_layout_line(
        &[
            ("Table", 56.0, 74.0),
            ("6.1:", 76.0, 87.0),
            ("Four", 89.0, 105.0),
            ("fish", 107.0, 119.0),
            ("species", 121.0, 145.0),
            ("on", 147.0, 155.0),
            ("IUCN", 157.0, 176.0),
            ("Red", 178.0, 190.0),
            ("List", 192.0, 205.0),
            ("held", 279.0, 293.0),
            ("in", 295.0, 302.0),
            ("public", 304.0, 325.0),
            ("aquariums.", 327.0, 365.0),
        ],
        556.0,
        566.0,
    );

    let layout_lines = vec![
        heading_line,
        potosi_common,
        potosi_scientific,
        la_palma_row,
        splitfin_common,
        splitfin_scientific,
        skiffia_common,
        skiffia_scientific,
        caption_line,
    ];

    let detected = detect_layout_open_plate(page_width, &layout_lines).unwrap();

    assert_eq!(detected.heading, "Fish species on IUCN Red List");

    let expected_header = vec![
        "Fish species on IUCN Red List".to_string(),
        "Scientific name".to_string(),
    ];
    assert_eq!(detected.header_row, expected_header);

    assert_eq!(detected.rows.len(), 4);
    let expected_second_row = vec![
        "La Palma Pupfish".to_string(),
        "Cyprinodon longidorsalis".to_string(),
    ];
    assert_eq!(detected.rows[1], expected_second_row);

    assert!(detected
        .caption
        .starts_with("Table 6.1: Four fish species on IUCN Red List"));
}
11420
/// Runs `extract_layout_narrative_bridge` over synthetic lines placed below
/// an open-plate table and checks that the bridge paragraph contains the
/// leading prose and that two captions are deferred, the first being the
/// "Figure 6.3" caption assembled from its two fragments.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_extract_layout_narrative_bridge_recovers_left_prose_and_defers_captions() {
    let page_width = 576.0;
    let plate = OpenPlateCandidate {
        heading: "Fish species on IUCN Red List".to_string(),
        header_row: vec![
            "Fish species on IUCN Red List".to_string(),
            "Scientific name".to_string(),
        ],
        rows: vec![],
        caption: "Table 6.1".to_string(),
        cutoff_top_y: 560.0,
    };

    // First prose line, delivered as two fragments sharing the same y-range.
    let prose_one_head = make_bbox_layout_line(
        &[
            ("Public", 56.0, 83.0),
            ("aquariums,", 88.0, 135.0),
            ("because", 140.0, 174.0),
        ],
        509.0,
        521.0,
    );
    let prose_one_tail = make_bbox_layout_line(
        &[
            ("of", 180.0, 188.0),
            ("their", 194.0, 214.0),
            ("in-", 220.0, 233.0),
        ],
        509.0,
        521.0,
    );

    // Second prose line, again in two fragments.
    let prose_two_head = make_bbox_layout_line(
        &[
            ("house", 56.0, 82.0),
            ("expertise,", 84.0, 125.0),
            ("can", 128.0, 143.0),
        ],
        495.0,
        507.0,
    );
    let prose_two_tail = make_bbox_layout_line(
        &[("act", 146.0, 159.0), ("quickly", 161.0, 191.0)],
        495.0,
        507.0,
    );

    // "Figure 6.3" caption: two fragments that both belong to block 1.
    let figure_63_head = make_bbox_layout_line_in_block(
        1,
        &[
            ("Figure", 242.0, 265.0),
            ("6.3:", 267.0, 280.0),
            ("Photo", 282.0, 303.0),
        ],
        355.0,
        366.0,
    );
    let figure_63_tail = make_bbox_layout_line_in_block(
        1,
        &[
            ("of", 305.0, 312.0),
            ("the", 314.0, 325.0),
            ("species.", 327.0, 360.0),
        ],
        355.0,
        366.0,
    );

    // Later prose: a short fragment followed by one spanning most of the page.
    let later_prose_head = make_bbox_layout_line(
        &[
            ("The", 56.0, 73.0),
            ("breeding", 77.0, 114.0),
            ("colonies", 118.0, 153.0),
        ],
        330.0,
        342.0,
    );
    let later_prose_tail = make_bbox_layout_line(
        &[
            ("of", 157.0, 165.0),
            ("the", 169.0, 183.0),
            ("Butterfly", 187.0, 224.0),
            ("Splitfin", 228.0, 258.0),
            ("at", 314.0, 323.0),
            ("the", 327.0, 341.0),
            ("London", 345.0, 377.0),
            ("Zoo", 381.0, 397.0),
            ("and", 401.0, 416.0),
            ("elsewhere", 420.0, 463.0),
            ("serve", 467.0, 489.0),
            ("as", 493.0, 502.0),
            ("ark", 506.0, 519.0),
        ],
        330.0,
        342.0,
    );

    let figure_64 = make_bbox_layout_line(
        &[
            ("Figure", 56.0, 79.0),
            ("6.4:", 81.0, 94.0),
            ("Lake", 96.0, 116.0),
            ("Sturgeon", 118.0, 158.0),
        ],
        104.0,
        116.0,
    );

    let layout_lines = vec![
        prose_one_head,
        prose_one_tail,
        prose_two_head,
        prose_two_tail,
        figure_63_head,
        figure_63_tail,
        later_prose_head,
        later_prose_tail,
        figure_64,
    ];

    let bridge = extract_layout_narrative_bridge(page_width, &layout_lines, &plate).unwrap();

    let paragraph = bridge.bridge_paragraph.as_deref();
    assert!(paragraph.is_some_and(|text| {
        ["Public aquariums", "expertise"]
            .iter()
            .all(|needle| text.contains(*needle))
    }));

    assert_eq!(bridge.deferred_captions.len(), 2);
    let first_caption = &bridge.deferred_captions[0];
    assert!(first_caption.contains("Figure 6.3:"));
    assert!(first_caption.contains("species."));
}
11536
/// Runs `detect_layout_ocr_benchmark_dashboard` on the bbox layout lines of
/// benchmark PDF `01030000000199.pdf` and checks the title, the left-hand
/// columns and first row, selected right-hand rows, and that both note
/// lists are populated.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_detect_layout_ocr_benchmark_dashboard_on_real_pdf() {
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000199.pdf");
    let (page_width, lines) = read_pdftotext_bbox_layout_lines(&path).unwrap();
    let dashboard = detect_layout_ocr_benchmark_dashboard(page_width, &lines).unwrap();

    assert_eq!(
        dashboard.title,
        "Base Model Performance Evaluation of Upstage OCR Pack"
    );

    // Left-hand table.
    assert_eq!(dashboard.left_columns.len(), 2);
    assert_eq!(
        dashboard.left_columns[0],
        "Scene (Photographed document image)"
    );
    let expected_left_first: Vec<String> = ["Company A²", "70.23", "80.41"]
        .iter()
        .map(|cell| cell.to_string())
        .collect();
    assert_eq!(dashboard.left_rows[0], expected_left_first);

    // Right-hand table: whole first row, then the fourth row cell by cell.
    let expected_right_first: Vec<String> = ["OCR-Recall³", "73.2", "94.2", "94.1"]
        .iter()
        .map(|cell| cell.to_string())
        .collect();
    assert_eq!(dashboard.right_rows[0], expected_right_first);

    let parsing_row = &dashboard.right_rows[3];
    for (column, expected) in ["Parsing-F¹", "68.0", "82.65", "82.65"].iter().enumerate() {
        assert_eq!(parsing_row[column], *expected);
    }

    assert!(!dashboard.definition_notes.is_empty());
    assert!(!dashboard.source_notes.is_empty());
}
11578
/// `split_layout_line_spans` must split a line containing multi-byte curly
/// quotes into three spans, keeping the closing quote attached to the
/// middle span.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_split_layout_line_spans_handles_unicode_boundaries() {
    let input = "Title “Podcast #EP32: SDGs dan Anak Muda” 2024";
    let spans = split_layout_line_spans(input);
    assert_eq!(spans.len(), 3);

    // Only the text component (`.1`) of each span is inspected.
    let leading = &spans[0].1;
    let quoted = &spans[1].1;
    let trailing = &spans[2].1;

    assert_eq!(*leading, "Title");
    assert!(quoted.contains("Podcast #EP32: SDGs dan Anak Muda"));
    assert!(quoted.ends_with('”'));
    assert!(trailing.ends_with("24"));
}
11590
/// Converts benchmark PDF `01030000000037.pdf`, wraps the converted content
/// in a one-page `PdfDocument`, and checks that
/// `render_layout_single_caption_chart_document` emits the headings, body
/// text and figure caption, and does not emit a `| Lockdown Period |` table
/// cell.
///
/// Every assertion prints the rendered markdown on failure, matching the
/// sibling `to_markdown` tests.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_render_layout_single_caption_chart_document_on_real_pdf() {
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000037.pdf");
    let doc = PdfDocument {
        title: None,
        source_path: Some(path.to_string_lossy().to_string()),
        number_of_pages: 1,
        kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
            .unwrap()
            .kids,
        ..PdfDocument::new("01030000000037.pdf".to_string())
    };
    let rendered = render_layout_single_caption_chart_document(&doc).unwrap();

    for expected in [
        "# 3. Impact on Business Operations",
        "## 3.1. Status of Business Operations",
        "As shown in Figure 3.1.1, the number of MSMEs",
        "Figure 3.1.1: Status of operations during each survey phase (%)",
        "lockdown period. In the handicraft/textile sector, 30% of MSMEs",
    ] {
        assert!(
            rendered.contains(expected),
            "missing {expected:?} in:\n{rendered}"
        );
    }
    assert!(!rendered.contains("| Lockdown Period |"), "{rendered}");
}
11617
/// Converts benchmark PDF `01030000000072.pdf`, renders it with
/// `to_markdown`, and checks for the diagram heading, the bold chart title,
/// the body sentence, and the figure label with its italic caption.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_captioned_media_document_on_real_pdf_72() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000072.pdf");
    let converted =
        crate::convert(&pdf_path, &crate::api::config::ProcessingConfig::default()).unwrap();
    let doc = PdfDocument {
        title: None,
        source_path: Some(pdf_path.to_string_lossy().to_string()),
        number_of_pages: 1,
        kids: converted.kids,
        ..PdfDocument::new("01030000000072.pdf".to_string())
    };
    let md = to_markdown(&doc).unwrap();

    // Checked in order; the full markdown is printed on the first miss.
    for needle in [
        "## Diagram 5",
        "**Distribution of Komnas HAM’s YouTube Content (2019-2020)**",
        "As of 1 December 2021, the Komnas HAM’s YouTube channel has 2,290 subscribers",
        "**Figure 4**",
        "*Komnas HAM’s YouTube channel as of 1 December 2021*",
    ] {
        assert!(md.contains(needle), "{md}");
    }
}
11650
/// Converts benchmark PDF `01030000000073.pdf`, renders it with
/// `to_markdown`, and checks the opening heading plus the image marker,
/// figure label, bold captions, footnote label and footnote URL.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_captioned_media_document_on_real_pdf_73() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000073.pdf");
    let converted =
        crate::convert(&pdf_path, &crate::api::config::ProcessingConfig::default()).unwrap();
    let doc = PdfDocument {
        title: None,
        source_path: Some(pdf_path.to_string_lossy().to_string()),
        number_of_pages: 1,
        kids: converted.kids,
        ..PdfDocument::new("01030000000073.pdf".to_string())
    };
    let md = to_markdown(&doc).unwrap();

    assert!(
        md.starts_with("# In this content, DPN Argentina provides a brief explanation"),
        "{md}"
    );

    // Checked in order; the full markdown is printed on the first miss.
    for needle in [
        "Examples of such greetings are as follows:",
        "*Image*",
        "**Figure 6**",
        "**DPN Argentina**",
        "**Content: World Health Day Celebration (7 April 2021).**^98",
        "**Footnote:**",
        "https://twitter.com/DPNArgentina/status/1379765916259483648.",
    ] {
        assert!(md.contains(needle), "{md}");
    }
}
11687
/// `render_layout_captioned_media_document` must return `None` for the
/// converted content of benchmark PDF `01030000000014.pdf`.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_render_layout_captioned_media_document_does_not_fire_on_real_pdf_14() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
    let converted =
        crate::convert(&pdf_path, &crate::api::config::ProcessingConfig::default()).unwrap();
    let doc = PdfDocument {
        title: None,
        source_path: Some(pdf_path.to_string_lossy().to_string()),
        number_of_pages: 1,
        kids: converted.kids,
        ..PdfDocument::new("01030000000014.pdf".to_string())
    };

    let rendered = render_layout_captioned_media_document(&doc);
    assert!(rendered.is_none());
}
11704
/// `to_markdown` on benchmark PDF `01030000000014.pdf` must still contain a
/// known sentence from the body text.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_real_pdf_14_preserves_body_paragraphs() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
    let converted =
        crate::convert(&pdf_path, &crate::api::config::ProcessingConfig::default()).unwrap();
    let doc = PdfDocument {
        title: None,
        source_path: Some(pdf_path.to_string_lossy().to_string()),
        number_of_pages: 1,
        kids: converted.kids,
        ..PdfDocument::new("01030000000014.pdf".to_string())
    };
    let md = to_markdown(&doc).unwrap();

    let body_sentence = "These images also show that different areas are used by men and by women";
    assert!(md.contains(body_sentence), "{md}");
}
11725
/// Renders benchmark PDF `01030000000183.pdf` through
/// `render_layout_recommendation_infographic_document`, using a one-page
/// `PdfDocument` with no `kids` and only `source_path` set, and checks the
/// headings, table rows and trailing note.
///
/// Every assertion prints the rendered markdown on failure, matching the
/// sibling `to_markdown` tests.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_render_layout_recommendation_infographic_on_real_pdf() {
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000183.pdf");
    let doc = PdfDocument {
        title: None,
        source_path: Some(path.to_string_lossy().to_string()),
        number_of_pages: 1,
        kids: Vec::new(),
        ..PdfDocument::new("01030000000183.pdf".to_string())
    };
    let rendered = render_layout_recommendation_infographic_document(&doc).unwrap();

    for expected in [
        "# Recommendation Pack: Track Record",
        "## Comparison with Beauty Commerce Recommendation Models",
        "| Graph-RecSys | 0.4048 |",
        "| Current Service Recommendation Algorithm | 0.159 |",
        "## Education Content Platform PoC Case",
        "| DKT Model | 0.882 |",
        "Compared to regular model",
    ] {
        assert!(
            rendered.contains(expected),
            "missing {expected:?} in:\n{rendered}"
        );
    }
}
11747
/// Renders benchmark PDF `01030000000038.pdf` through
/// `render_layout_stacked_bar_report_document` and checks the figure
/// heading, one table row and the section heading.
///
/// When rendering returns `None`, the intermediate detection state is
/// written to stderr before the final `unwrap` panics.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_render_layout_stacked_bar_report_on_real_pdf() {
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000038.pdf");
    let doc = PdfDocument {
        title: None,
        source_path: Some(path.to_string_lossy().to_string()),
        number_of_pages: 1,
        kids: Vec::new(),
        ..PdfDocument::new("01030000000038.pdf".to_string())
    };

    let rendered = render_layout_stacked_bar_report_document(&doc);
    if rendered.is_none() {
        // Re-run the individual detection steps and report what each found.
        let (page_width, lines) = read_pdftotext_bbox_layout_lines(&path).unwrap();
        let blocks = collect_bbox_layout_blocks(&lines);
        let figures = collect_layout_figure_captions(&blocks);
        let narrative = detect_layout_stacked_bar_narrative(&blocks);

        eprintln!("page_width={page_width} figures={}", figures.len());
        // At most the first two figure captions are printed.
        for (label, figure) in ["figure1", "figure2"].iter().zip(figures.iter()) {
            eprintln!("{label}={}", bbox_layout_block_text(figure));
        }

        eprintln!("narrative={}", narrative.is_some());
        if let Some(found) = narrative.as_ref() {
            eprintln!("heading={}", found.heading);
            eprintln!("paragraphs={}", found.paragraphs.len());
            eprintln!("footnote={:?}", found.footnote);
        }

        // Print only blocks whose text matches one of the markers below.
        for block in blocks.iter() {
            let text = bbox_layout_block_text(block);
            let mentions_marker = ["July", "October", "January", "Will ", "Don’t"]
                .iter()
                .any(|marker| text.contains(*marker));
            let has_section_prefix = ["6.2.", "5."]
                .iter()
                .any(|prefix| text.starts_with(*prefix));
            if !(mentions_marker || has_section_prefix) {
                continue;
            }
            let bbox = &block.bbox;
            eprintln!(
                "block top={:.1} bottom={:.1} left={:.1} right={:.1} text={}",
                bbox.top_y, bbox.bottom_y, bbox.left_x, bbox.right_x, text
            );
        }

        // The per-figure detectors need two captions: the second caption's
        // top is passed to the first figure's detector.
        if figures.len() > 1 {
            let figure_one = detect_layout_three_month_stacked_figure(
                &blocks,
                &lines,
                page_width,
                figures[0].clone(),
                figures[1].bbox.top_y,
            );
            eprintln!("figure_one_ok={}", figure_one.is_some());

            if let Some(found) = narrative.as_ref() {
                let figure_two = detect_layout_sector_bar_figure(
                    &blocks,
                    &lines,
                    page_width,
                    figures[1].clone(),
                    found.top_y,
                );
                eprintln!("figure_two_ok={}", figure_two.is_some());
            }
        }
    }

    let rendered = rendered.unwrap();
    assert!(rendered.contains("# Figure 6.1.1:"));
    assert!(rendered.contains("| Will not terminate employment | 51 | 81 | 73 |"));
    assert!(rendered.contains("# 6.2. Expectations for Re-Hiring Employees"));
}
11825
/// Renders benchmark PDF `01030000000076.pdf` through
/// `render_layout_multi_figure_chart_document`, using a one-page
/// `PdfDocument` with no `kids` and only `source_path` set, and checks the
/// document heading, both figure headings, sample table rows and the source
/// note.
///
/// Every assertion prints the rendered markdown on failure, matching the
/// sibling `to_markdown` tests.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_render_layout_multi_figure_chart_document_on_real_pdf() {
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000076.pdf");
    let doc = PdfDocument {
        title: None,
        source_path: Some(path.to_string_lossy().to_string()),
        number_of_pages: 1,
        kids: Vec::new(),
        ..PdfDocument::new("01030000000076.pdf".to_string())
    };
    let rendered = render_layout_multi_figure_chart_document(&doc).unwrap();

    for expected in [
        "# Figures from the Document",
        "## Figure 1.7. Non-citizen population in Malaysia (in thousands)",
        "| 2016 | 3,230 |",
        "| 2021 | 2,693 |",
        "## Figure 1.8. Singapore foreign workforce stock (in thousands)",
        "| 2016 (Dec) | 1,393 |",
        "| 2021 (Dec) | 1,200 |",
        "Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.",
    ] {
        assert!(
            rendered.contains(expected),
            "missing {expected:?} in:\n{rendered}"
        );
    }
}
11854
/// Converts benchmark PDF `01030000000132.pdf` and checks that
/// `render_layout_open_plate_document` emits the plate heading, two species
/// rows, the italic table caption, a `---` separator and the bridging
/// prose.
///
/// Every assertion prints the rendered markdown on failure, matching the
/// sibling `to_markdown` open-plate test.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_render_layout_open_plate_document_on_real_pdf() {
    let path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
    let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
    let rendered = render_layout_open_plate_document(&doc).unwrap();

    for expected in [
        "# Fish species on IUCN Red List",
        "| Potosi Pupfish | Cyprinodon alvarezi |",
        "| Golden Skiffia | Skiffia francesae |",
        "*Table 6.1: Four fish species on IUCN Red List",
        "---",
        "Public aquariums, because of their inhouse expertise",
    ] {
        assert!(
            rendered.contains(expected),
            "missing {expected:?} in:\n{rendered}"
        );
    }
}
11869
/// Converts benchmark PDF `01030000000132.pdf`, renders it with
/// `to_markdown`, and checks the plate heading, two species rows, the
/// italic table caption and the later prose sentence.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_open_plate_document_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    // Checked in order; the full markdown is printed on the first miss.
    for needle in [
        "# Fish species on IUCN Red List",
        "| Potosi Pupfish | Cyprinodon alvarezi |",
        "| Golden Skiffia | Skiffia francesae |",
        "*Table 6.1: Four fish species on IUCN Red List",
        "The breeding colonies of the Butterfly Splitfin",
    ] {
        assert!(md.contains(needle), "{md}");
    }
}
11896
/// `to_markdown` on benchmark PDF `01030000000036.pdf` must keep the section
/// heading and body sentences, and must not open with a heading built from
/// the "Business characteristics" paragraph.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_does_not_misclassify_open_plate_pdf_36() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000036.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    for needle in [
        "# 2. General Profile of MSMEs",
        "In July 2020, the survey established a general profile",
        "The tourism sub-sectors interviewed included lodging, restaurants and bars",
    ] {
        assert!(md.contains(needle), "{md}");
    }

    let misclassified_opening = "# Business characteristics. Business size was";
    assert!(!md.starts_with(misclassified_opening), "{md}");
}
11921
/// `to_markdown` on benchmark PDF `01030000000040.pdf` must keep the body
/// sentence, the figure caption and the running title, and must not open
/// with a heading built from the "Thailand, Philippines and Indonesia"
/// sentence.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_does_not_misclassify_open_plate_pdf_40() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000040.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    for needle in [
        "Thailand, Philippines and Indonesia in particular, identifying known experts",
        "Figure 1: Age by gender of respondents",
        "Gender Analysis of Violent Extremism",
    ] {
        assert!(md.contains(needle), "{md}");
    }

    let misclassified_opening = "# Thailand, Philippines and Indonesia in";
    assert!(!md.starts_with(misclassified_opening), "{md}");
}
11946
/// `to_markdown` on benchmark PDF `01030000000064.pdf` must keep the prose
/// and the MANILA table row, and must not open with a `# CAGAYAN DE ORO`
/// heading.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_does_not_misclassify_open_plate_pdf_64() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000064.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    for needle in [
        "estuarine influenced areas.",
        "| MANILA | 2454 | 6,125 |",
        "The port of Manila has been documented",
    ] {
        assert!(md.contains(needle), "{md}");
    }

    let misclassified_opening = "# CAGAYAN DE ORO";
    assert!(!md.starts_with(misclassified_opening), "{md}");
}
11963
/// `detect_footnote_citation_regions` on benchmark PDF `01030000000008.pdf`
/// must return at least one region, including one rendered table holding
/// footnotes 25 and 29 and one holding footnotes 30 and 33.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_detect_footnote_citation_regions_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let regions = detect_footnote_citation_regions(&doc);

    // True when a single region's rendering is a table with both cells.
    let has_table_citing = |first_cell: &str, last_cell: &str| {
        regions.iter().any(|region| {
            let html = &region.rendered;
            html.contains("<table>") && html.contains(first_cell) && html.contains(last_cell)
        })
    };

    assert!(!regions.is_empty(), "{regions:?}");
    assert!(
        has_table_citing("<td>25</td>", "<td>29</td>"),
        "{regions:#?}"
    );
    assert!(
        has_table_citing("<td>30</td>", "<td>33</td>"),
        "{regions:#?}"
    );
}
11989
/// `to_markdown` on benchmark PDF `01030000000008.pdf` must render the
/// footnote citations as an HTML table with a Footnote/Citation header and
/// the expected rows.
#[cfg(not(target_arch = "wasm32"))]
#[test]
fn test_to_markdown_renders_footnote_citation_tables_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    // Checked in order; the full markdown is printed on the first miss.
    for needle in [
        "<table>",
        "<th>Footnote</th><th>Citation</th>",
        "<td>25</td><td>Wiliam Beckford",
        "<td>29</td><td>Pope, The Rape of the Lock, 69.</td>",
        "<td>30</td><td>Beawes, Lex Mercatoria Rediviva, 791.</td>",
        "<td>32</td><td>Beawes, Lex Mercatoria Rediviva, 792.</td>",
        "<td>33</td><td>M.M., Pharmacopoia Reformata:",
    ] {
        assert!(md.contains(needle), "{md}");
    }
}
12018
/// `to_markdown` on benchmark PDF `01030000000128.pdf` must emit the
/// document heading, the A–E table header and a sample row, the bold figure
/// caption, the template link and the italic page footer.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_projection_sheet_document_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000128.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    // Checked in order; the full markdown is printed on the first miss.
    for needle in [
        "# Table and Figure from the Document",
        "| A | B | C | D | E |",
        "| 10 | 8 | 19.73214458 | 17.99 | 21.47 |",
        "**Figure 13.3. Graph of Projection Estimates**",
        "[Open Template in Microsoft Excel](#)",
        "*298 | Ch. 13. Homogeneous Investment Types*",
    ] {
        assert!(md.contains(needle), "{md}");
    }
}
12043
/// `to_markdown` on benchmark PDF `01030000000082.pdf` must emit the
/// "Appendices" heading, the headings, header rows and sample rows of
/// tables 28 and 29, and the two italic notes.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_appendix_tables_document_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000082.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    // Checked in order; the full markdown is printed on the first miss.
    for needle in [
        "# Appendices",
        "## TABLE 28: BREAKDOWN OF IMPRISONMENT CLAUSES IN STATE LAWS",
        "| Imprisonment terms | Number of clauses | Percentage of all states | Percentage of total |",
        "| Less than 3 months | 4,448 | 21.3% | 17.0% |",
        "## TABLE 29: STATES WITH MORE THAN 1,000 IMPRISONMENT CLAUSES",
        "| State | Number of clauses | GSDP (In Rs lakh crore) | GSDP (In $ billion) |",
        "| Gujarat | 1469 | 15.6 | 200.4 |",
        "*Sources: TeamLease Regtech, and Reserve Bank of India for GSDPs*",
        "*Exchange rate: Rs 75 to USD*",
    ] {
        assert!(md.contains(needle), "{md}");
    }
}
12079
/// `to_markdown` on benchmark PDF `01030000000084.pdf` must open with the
/// document title and contain the headings and sample rows of tables 38 and
/// 39 plus the italic data note.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_titled_dual_table_document_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000084.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    assert!(md.starts_with("# Jailed for Doing Business"), "{md}");

    // Checked in order; the full markdown is printed on the first miss.
    for needle in [
        "## TABLE 38: THREE CASE STUDIES ON NBFC COMPLIANCES*",
        "| Percentage of imprisonment clauses | 20% | 30% | 37% |",
        "## TABLE 39: BREAKDOWN OF IMPRISONMENT CLAUSES IN NBFC CASE STUDIES*",
        "| 5 years to 10 years | 19 | 19 | 19 |",
        "*These are real data from three NBFCs*",
    ] {
        assert!(md.contains(needle), "{md}");
    }
}
12110
/// `to_markdown` on benchmark PDF `01030000000047.pdf` must open with the
/// report title, contain the complete row 14 and the totals row, and must
/// not contain a stray `| | Democracy Party |` row.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_registration_report_document_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000047.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    assert!(
        md.starts_with("# ANFREL Pre-Election Assessment Mission Report"),
        "{md}"
    );

    for needle in [
        "| 14 | Cambodian Indigeneous Peoples Democracy Party | 19 | 194 | 19 | 202 | +8 |",
        "| | Total | | 84,208 | | 86,092 | +1,884 |",
    ] {
        assert!(md.contains(needle), "{md}");
    }

    let orphaned_row = "| | Democracy Party |";
    assert!(!md.contains(orphaned_row), "{md}");
}
12135
/// `to_markdown` on benchmark PDF `01030000000190.pdf` must open with the
/// Table 6 heading, contain the Table 6 caption and the Table 7 heading,
/// and must contain neither a doubled "Table 6" caption prefix nor a
/// `| Merge v1` table cell.
#[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
#[test]
fn test_to_markdown_dual_table_article_document_on_real_pdf() {
    let pdf_path =
        Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000190.pdf");
    let config = crate::api::config::ProcessingConfig::default();
    let doc = crate::convert(&pdf_path, &config).unwrap();
    let md = to_markdown(&doc).unwrap();

    assert!(
        md.starts_with("# Table 6: Performance comparison amongst the merge candidates"),
        "{md}"
    );

    for needle in [
        "*Table 6*: Performance comparison amongst the merge candidates.",
        "# Table 7: Ablation studies on the different merge methods used for obtaining the final model",
    ] {
        assert!(md.contains(needle), "{md}");
    }

    for forbidden in ["*Table 6*: Table 6:", "| Merge v1"] {
        assert!(!md.contains(forbidden), "{md}");
    }
}
12156
/// `normalize_list_text` must drop a leading "• " from list text, and
/// `is_pure_bullet_marker` must accept a lone "•".
#[test]
fn test_normalize_list_text_strips_redundant_bullets() {
    let cleaned = normalize_list_text("• Collected via surveys");
    assert_eq!(cleaned, "Collected via surveys");

    let lone_bullet = "•";
    assert!(is_pure_bullet_marker(lone_bullet));
}
12165
/// `should_merge_paragraph_text` must report that an "arXiv preprint ..."
/// line is to be merged with the reference title that precedes it.
#[test]
fn test_reference_continuation_detected() {
    let previous = "Scaling laws for transfer.";
    let continuation = "arXiv preprint arXiv:2102.01293.";
    assert!(should_merge_paragraph_text(previous, continuation));
}
12173
/// `starts_with_enumerated_marker` must accept roman-numeral, `1)` and
/// lettered markers, and must reject a figure caption and ordinary prose.
#[test]
fn test_enumerated_markers_are_detected() {
    for enumerated in ["iii. Third item", "1) First item", "a. Lettered item"] {
        assert!(
            starts_with_enumerated_marker(enumerated),
            "expected an enumerated marker in {enumerated:?}"
        );
    }

    for plain in ["Figure 1. Caption", "Natural dispersal"] {
        assert!(
            !starts_with_enumerated_marker(plain),
            "unexpected enumerated marker in {plain:?}"
        );
    }
}
12182
12183 fn make_heading(text: &str) -> ContentElement {
12184 let bbox = BoundingBox::new(Some(1), 72.0, 700.0, 300.0, 712.0);
12185 let chunk = TextChunk {
12186 value: text.to_string(),
12187 bbox: bbox.clone(),
12188 font_name: "Lato-Bold".to_string(),
12189 font_size: 12.0,
12190 font_weight: 700.0,
12191 italic_angle: 0.0,
12192 font_color: "#000000".to_string(),
12193 contrast_ratio: 21.0,
12194 symbol_ends: vec![],
12195 text_format: TextFormat::Normal,
12196 text_type: TextType::Regular,
12197 pdf_layer: PdfLayer::Main,
12198 ocg_visible: true,
12199 index: None,
12200 page_number: Some(1),
12201 level: None,
12202 mcid: None,
12203 };
12204 let line = TextLine {
12205 bbox: bbox.clone(),
12206 index: None,
12207 level: None,
12208 font_size: 12.0,
12209 base_line: 702.0,
12210 slant_degree: 0.0,
12211 is_hidden_text: false,
12212 text_chunks: vec![chunk],
12213 is_line_start: true,
12214 is_line_end: true,
12215 is_list_line: false,
12216 connected_line_art_label: None,
12217 };
12218 let block = TextBlock {
12219 bbox: bbox.clone(),
12220 index: None,
12221 level: None,
12222 font_size: 12.0,
12223 base_line: 702.0,
12224 slant_degree: 0.0,
12225 is_hidden_text: false,
12226 text_lines: vec![line],
12227 has_start_line: true,
12228 has_end_line: true,
12229 text_alignment: None,
12230 };
12231 let column = TextColumn {
12232 bbox: bbox.clone(),
12233 index: None,
12234 level: None,
12235 font_size: 12.0,
12236 base_line: 702.0,
12237 slant_degree: 0.0,
12238 is_hidden_text: false,
12239 text_blocks: vec![block],
12240 };
12241 ContentElement::Heading(SemanticHeading {
12242 base: SemanticParagraph {
12243 base: SemanticTextNode {
12244 bbox,
12245 index: None,
12246 level: None,
12247 semantic_type: crate::models::enums::SemanticType::Heading,
12248 correct_semantic_score: None,
12249 columns: vec![column],
12250 font_weight: Some(700.0),
12251 font_size: Some(12.0),
12252 text_color: None,
12253 italic_angle: None,
12254 font_name: Some("Lato-Bold".to_string()),
12255 text_format: None,
12256 max_font_size: Some(12.0),
12257 background_color: None,
12258 is_hidden_text: false,
12259 },
12260 enclosed_top: false,
12261 enclosed_bottom: false,
12262 indentation: 0,
12263 },
12264 heading_level: Some(1),
12265 })
12266 }
12267
12268 fn make_heading_at(left: f64, bottom: f64, right: f64, top: f64, text: &str) -> ContentElement {
12269 let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
12270 let chunk = TextChunk {
12271 value: text.to_string(),
12272 bbox: bbox.clone(),
12273 font_name: "Lato-Bold".to_string(),
12274 font_size: top - bottom,
12275 font_weight: 700.0,
12276 italic_angle: 0.0,
12277 font_color: "#000000".to_string(),
12278 contrast_ratio: 21.0,
12279 symbol_ends: vec![],
12280 text_format: TextFormat::Normal,
12281 text_type: TextType::Regular,
12282 pdf_layer: PdfLayer::Main,
12283 ocg_visible: true,
12284 index: None,
12285 page_number: Some(1),
12286 level: None,
12287 mcid: None,
12288 };
12289 let line = TextLine {
12290 bbox: bbox.clone(),
12291 index: None,
12292 level: None,
12293 font_size: top - bottom,
12294 base_line: bottom + 2.0,
12295 slant_degree: 0.0,
12296 is_hidden_text: false,
12297 text_chunks: vec![chunk],
12298 is_line_start: true,
12299 is_line_end: true,
12300 is_list_line: false,
12301 connected_line_art_label: None,
12302 };
12303 let block = TextBlock {
12304 bbox: bbox.clone(),
12305 index: None,
12306 level: None,
12307 font_size: top - bottom,
12308 base_line: bottom + 2.0,
12309 slant_degree: 0.0,
12310 is_hidden_text: false,
12311 text_lines: vec![line],
12312 has_start_line: true,
12313 has_end_line: true,
12314 text_alignment: None,
12315 };
12316 let column = TextColumn {
12317 bbox: bbox.clone(),
12318 index: None,
12319 level: None,
12320 font_size: top - bottom,
12321 base_line: bottom + 2.0,
12322 slant_degree: 0.0,
12323 is_hidden_text: false,
12324 text_blocks: vec![block],
12325 };
12326 ContentElement::Heading(SemanticHeading {
12327 base: SemanticParagraph {
12328 base: SemanticTextNode {
12329 bbox,
12330 index: None,
12331 level: None,
12332 semantic_type: crate::models::enums::SemanticType::Heading,
12333 correct_semantic_score: None,
12334 columns: vec![column],
12335 font_weight: Some(700.0),
12336 font_size: Some(top - bottom),
12337 text_color: None,
12338 italic_angle: None,
12339 font_name: Some("Lato-Bold".to_string()),
12340 text_format: None,
12341 max_font_size: Some(top - bottom),
12342 background_color: None,
12343 is_hidden_text: false,
12344 },
12345 enclosed_top: false,
12346 enclosed_bottom: false,
12347 indentation: 0,
12348 },
12349 heading_level: None,
12350 })
12351 }
12352
12353 fn make_paragraph(text: &str, bottom: f64, top: f64) -> ContentElement {
12354 make_paragraph_at(72.0, bottom, 300.0, top, text)
12355 }
12356
12357 fn make_paragraph_at(
12358 left: f64,
12359 bottom: f64,
12360 right: f64,
12361 top: f64,
12362 text: &str,
12363 ) -> ContentElement {
12364 let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
12365 let chunk = TextChunk {
12366 value: text.to_string(),
12367 bbox: bbox.clone(),
12368 font_name: "Lato-Regular".to_string(),
12369 font_size: (top - bottom).max(1.0),
12370 font_weight: 400.0,
12371 italic_angle: 0.0,
12372 font_color: "#000000".to_string(),
12373 contrast_ratio: 21.0,
12374 symbol_ends: vec![],
12375 text_format: TextFormat::Normal,
12376 text_type: TextType::Regular,
12377 pdf_layer: PdfLayer::Main,
12378 ocg_visible: true,
12379 index: None,
12380 page_number: Some(1),
12381 level: None,
12382 mcid: None,
12383 };
12384 let line = TextLine {
12385 bbox: bbox.clone(),
12386 index: None,
12387 level: None,
12388 font_size: chunk.font_size,
12389 base_line: bottom + 2.0,
12390 slant_degree: 0.0,
12391 is_hidden_text: false,
12392 text_chunks: vec![chunk],
12393 is_line_start: true,
12394 is_line_end: true,
12395 is_list_line: false,
12396 connected_line_art_label: None,
12397 };
12398 let block = TextBlock {
12399 bbox: bbox.clone(),
12400 index: None,
12401 level: None,
12402 font_size: line.font_size,
12403 base_line: line.base_line,
12404 slant_degree: 0.0,
12405 is_hidden_text: false,
12406 text_lines: vec![line],
12407 has_start_line: true,
12408 has_end_line: true,
12409 text_alignment: None,
12410 };
12411 let column = TextColumn {
12412 bbox: bbox.clone(),
12413 index: None,
12414 level: None,
12415 font_size: block.font_size,
12416 base_line: block.base_line,
12417 slant_degree: 0.0,
12418 is_hidden_text: false,
12419 text_blocks: vec![block],
12420 };
12421 ContentElement::Paragraph(SemanticParagraph {
12422 base: SemanticTextNode {
12423 bbox,
12424 index: None,
12425 level: None,
12426 semantic_type: crate::models::enums::SemanticType::Paragraph,
12427 correct_semantic_score: None,
12428 columns: vec![column],
12429 font_weight: Some(400.0),
12430 font_size: Some(top - bottom),
12431 text_color: None,
12432 italic_angle: None,
12433 font_name: Some("Lato-Regular".to_string()),
12434 text_format: None,
12435 max_font_size: Some(top - bottom),
12436 background_color: None,
12437 is_hidden_text: false,
12438 },
12439 enclosed_top: false,
12440 enclosed_bottom: false,
12441 indentation: 0,
12442 })
12443 }
12444
12445 fn make_fallback_list(items: &[&str]) -> ContentElement {
12446 let mut list_items = Vec::new();
12447 for (idx, text) in items.iter().enumerate() {
12448 let top = 700.0 - idx as f64 * 18.0;
12449 let bottom = top - 12.0;
12450 let bbox = BoundingBox::new(Some(1), 72.0, bottom, 320.0, top);
12451 list_items.push(ListItem {
12452 bbox: bbox.clone(),
12453 index: None,
12454 level: None,
12455 label: ListLabel {
12456 bbox: bbox.clone(),
12457 content: vec![],
12458 semantic_type: None,
12459 },
12460 body: ListBody {
12461 bbox: bbox.clone(),
12462 content: vec![],
12463 semantic_type: None,
12464 },
12465 label_length: 0,
12466 contents: vec![make_paragraph_at(72.0, bottom, 320.0, top, text)],
12467 semantic_type: None,
12468 });
12469 }
12470
12471 ContentElement::List(PDFList {
12472 bbox: BoundingBox::new(
12473 Some(1),
12474 72.0,
12475 700.0 - items.len() as f64 * 18.0,
12476 320.0,
12477 700.0,
12478 ),
12479 index: None,
12480 level: None,
12481 list_items,
12482 numbering_style: Some("bullets".to_string()),
12483 common_prefix: None,
12484 previous_list_id: None,
12485 next_list_id: None,
12486 })
12487 }
12488
12489 fn make_toc_table(rows: &[(&str, &str)]) -> ContentElement {
12490 let mut table_rows = Vec::new();
12491 for (ri, (title, page)) in rows.iter().enumerate() {
12492 let top = 680.0 - ri as f64 * 18.0;
12493 let bottom = top - 12.0;
12494 let left_bbox = BoundingBox::new(Some(1), 72.0, bottom, 280.0, top);
12495 let right_bbox = BoundingBox::new(Some(1), 320.0, bottom, 360.0, top);
12496 table_rows.push(TableBorderRow {
12497 bbox: BoundingBox::new(Some(1), 72.0, bottom, 360.0, top),
12498 index: None,
12499 level: None,
12500 row_number: ri,
12501 cells: vec![
12502 TableBorderCell {
12503 bbox: left_bbox.clone(),
12504 index: None,
12505 level: None,
12506 row_number: ri,
12507 col_number: 0,
12508 row_span: 1,
12509 col_span: 1,
12510 content: vec![TableToken {
12511 base: TextChunk {
12512 value: (*title).to_string(),
12513 bbox: left_bbox,
12514 font_name: "Lato-Regular".to_string(),
12515 font_size: 10.0,
12516 font_weight: 400.0,
12517 italic_angle: 0.0,
12518 font_color: "#000000".to_string(),
12519 contrast_ratio: 21.0,
12520 symbol_ends: vec![],
12521 text_format: TextFormat::Normal,
12522 text_type: TextType::Regular,
12523 pdf_layer: PdfLayer::Main,
12524 ocg_visible: true,
12525 index: None,
12526 page_number: Some(1),
12527 level: None,
12528 mcid: None,
12529 },
12530 token_type: TableTokenType::Text,
12531 }],
12532 contents: vec![],
12533 semantic_type: None,
12534 },
12535 TableBorderCell {
12536 bbox: right_bbox.clone(),
12537 index: None,
12538 level: None,
12539 row_number: ri,
12540 col_number: 1,
12541 row_span: 1,
12542 col_span: 1,
12543 content: vec![TableToken {
12544 base: TextChunk {
12545 value: (*page).to_string(),
12546 bbox: right_bbox,
12547 font_name: "Lato-Regular".to_string(),
12548 font_size: 10.0,
12549 font_weight: 400.0,
12550 italic_angle: 0.0,
12551 font_color: "#000000".to_string(),
12552 contrast_ratio: 21.0,
12553 symbol_ends: vec![],
12554 text_format: TextFormat::Normal,
12555 text_type: TextType::Regular,
12556 pdf_layer: PdfLayer::Main,
12557 ocg_visible: true,
12558 index: None,
12559 page_number: Some(1),
12560 level: None,
12561 mcid: None,
12562 },
12563 token_type: TableTokenType::Text,
12564 }],
12565 contents: vec![],
12566 semantic_type: None,
12567 },
12568 ],
12569 semantic_type: None,
12570 });
12571 }
12572
12573 ContentElement::TableBorder(TableBorder {
12574 bbox: BoundingBox::new(Some(1), 72.0, 620.0, 360.0, 680.0),
12575 index: None,
12576 level: Some("1".to_string()),
12577 x_coordinates: vec![72.0, 320.0, 360.0],
12578 x_widths: vec![0.0, 0.0, 0.0],
12579 y_coordinates: vec![680.0, 662.0, 644.0, 626.0],
12580 y_widths: vec![0.0, 0.0, 0.0, 0.0],
12581 rows: table_rows,
12582 num_rows: rows.len(),
12583 num_columns: 2,
12584 is_bad_table: false,
12585 is_table_transformer: false,
12586 previous_table: None,
12587 next_table: None,
12588 })
12589 }
12590
/// A "CONTENTS" heading followed by a title/page table must render as a
/// heading plus "- <title> <page>" list lines.
#[test]
fn test_contents_document_renders_toc_table_rows() {
    let toc_entries = [
        ("Experiment #1: Hydrostatic Pressure", "3"),
        ("Experiment #2: Bernoulli's Theorem Demonstration", "13"),
        ("Experiment #3: Energy Loss in Pipe Fittings", "24"),
        ("Experiment #4: Energy Loss in Pipes", "33"),
        ("Experiment #5: Impact of a Jet", "43"),
        ("Experiment #6: Orifice and Free Jet Flow", "50"),
        ("Experiment #7: Osborne Reynolds' Demonstration", "59"),
        ("References", "101"),
    ];

    let mut doc = PdfDocument::new("contents.pdf".to_string());
    doc.kids.push(make_heading("CONTENTS"));
    doc.kids.push(make_toc_table(&toc_entries));

    let md = to_markdown(&doc).unwrap();

    assert!(md.starts_with("# CONTENTS\n\n"));
    let expected_lines = [
        "- Experiment #1: Hydrostatic Pressure 3\n",
        "- Experiment #2: Bernoulli's Theorem Demonstration 13\n",
        "- Experiment #7: Osborne Reynolds' Demonstration 59\n",
        "- References 101\n",
    ];
    for expected in expected_lines {
        assert!(md.contains(expected));
    }
}
12613
/// Two consecutive paragraphs tagged `SemanticType::TableOfContent` must be
/// emitted on adjacent lines, separated by a single newline.
#[test]
fn test_toc_semantic_paragraphs_render_without_blank_lines() {
    // Paragraph fixture re-tagged as a table-of-contents entry.
    let toc_paragraph = |text: &str, bottom: f64, top: f64| -> ContentElement {
        let mut element = make_paragraph(text, bottom, top);
        if let ContentElement::Paragraph(paragraph) = &mut element {
            paragraph.base.semantic_type = SemanticType::TableOfContent;
        }
        element
    };

    let mut doc = PdfDocument::new("toc-semantic.pdf".to_string());
    doc.kids.push(toc_paragraph(
        "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
        700.0,
        712.0,
    ));
    doc.kids
        .push(toc_paragraph("Section 5.1: The Linear Model 35", 684.0, 696.0));

    let md = to_markdown(&doc).unwrap();
    let expected = "Part V. Chapter Five - Comparing Associations Between Multiple Variables\nSection 5.1: The Linear Model 35\n";
    assert!(md.contains(expected));
}
12637
/// Renders eight plain paragraphs that look like a compact table of contents.
///
/// Note that, despite the test name, the assertions expect the "Part ..."
/// lines as `#` headings and the "Section ..." lines as `##` headings,
/// separated by blank lines.
#[test]
fn test_compact_toc_document_renders_without_blank_lines() {
    // (text, bottom, top), listed top-to-bottom on the page.
    let entries = [
        (
            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
            700.0,
            712.0,
        ),
        ("Section 5.1: The Linear Model 35", 684.0, 696.0),
        (
            "Part VI. Chapter Six - Comparing Three or More Group Means",
            668.0,
            680.0,
        ),
        (
            "Section 6.1: Between Versus Within Group Analyses 49",
            652.0,
            664.0,
        ),
        (
            "Part VII. Chapter Seven - Moderation and Mediation Analyses",
            636.0,
            648.0,
        ),
        (
            "Section 7.1: Mediation and Moderation Models 64",
            620.0,
            632.0,
        ),
        ("References 101", 604.0, 616.0),
        ("Section 8.1: Factor Analysis Definitions 75", 588.0, 600.0),
    ];

    let mut doc = PdfDocument::new("compact-toc.pdf".to_string());
    for (text, bottom, top) in entries {
        doc.kids.push(make_paragraph(text, bottom, top));
    }

    let md = to_markdown(&doc).unwrap();

    let expected_fragments = [
        "# Part V. Chapter Five - Comparing Associations Between Multiple Variables\n\n## Section 5.1: The Linear Model",
        "# Part VI. Chapter Six - Comparing Three or More Group Means\n\n## Section 6.1: Between Versus Within Group Analyses",
        "References 101\n\n## Section 8.1: Factor Analysis Definitions",
    ];
    for fragment in expected_fragments {
        assert!(md.contains(fragment));
    }
}
12688
/// A single paragraph that fuses a figure caption with the following body
/// sentence must be split with a blank line after the caption's closing
/// "USGS)".
#[test]
fn test_merged_caption_and_body_paragraph_renders_as_two_paragraphs() {
    let fused_text = "Figure 1. This image shows the Western hemisphere as viewed from space 35,400 kilometers above Earth. (credit: modification of work by R. Stockli, NASA/ GSFC/ NOAA/ USGS) Our nearest astronomical neighbor is Earth's satellite, commonly called the Moon.";

    let mut doc = PdfDocument::new("caption-body.pdf".to_string());
    doc.kids.push(make_paragraph(fused_text, 500.0, 540.0));

    let md = to_markdown(&doc).unwrap();
    let split_point = "USGS)\n\nOur nearest astronomical neighbor";
    assert!(md.contains(split_point));
}
12701
/// A short "Diagram 5" label followed by a paragraph holding the caption tail
/// plus body text must render as label, caption tail on the next line, then a
/// blank line before the body.
#[test]
fn test_short_caption_label_merges_with_following_tail_and_body() {
    let label = make_paragraph("Diagram 5", 540.0, 552.0);
    let tail_and_body = make_paragraph(
        "Distribution of Komnas HAM's YouTube Content (2019- 2020) As of 1 December 2021, the channel has 2,290 subscribers and 185,676 total views.",
        520.0,
        532.0,
    );

    let mut doc = PdfDocument::new("diagram-caption.pdf".to_string());
    doc.kids.push(label);
    doc.kids.push(tail_and_body);

    let md = to_markdown(&doc).unwrap();
    let expected = "Diagram 5\nDistribution of Komnas HAM's YouTube Content (2019- 2020)\n\nAs of 1 December 2021, the channel has 2,290 subscribers";
    assert!(md.contains(expected));
}
12717
/// A "Figure 4" label, its caption tail and a trailing "2021" paragraph must
/// render on three consecutive lines, with no blank line before the year.
#[test]
fn test_short_caption_label_merges_with_tail_and_year() {
    let pieces = [
        ("Figure 4", 540.0, 552.0),
        (
            "Komnas HAM's YouTube channel as of 1 December",
            520.0,
            532.0,
        ),
        ("2021", 500.0, 512.0),
    ];

    let mut doc = PdfDocument::new("figure-caption.pdf".to_string());
    for (text, bottom, top) in pieces {
        doc.kids.push(make_paragraph(text, bottom, top));
    }

    let md = to_markdown(&doc).unwrap();
    let merged_caption = "Figure 4\nKomnas HAM's YouTube channel as of 1 December\n2021";
    assert!(md.contains(merged_caption));
    assert!(!md.contains("\n\n2021"));
}
12733
/// A numeric paragraph in the middle of the page ("100") must survive
/// rendering, while the numeric paragraph near the bottom edge ("36") must not
/// appear as its own line.
#[test]
fn test_mid_page_numeric_labels_are_not_dropped_as_page_numbers() {
    let paragraphs = [
        ("Figure 1", 760.0, 772.0),
        ("100", 520.0, 528.0),
        ("Body text continues here.", 400.0, 412.0),
        ("36", 20.0, 28.0),
    ];

    let mut doc = PdfDocument::new("chart.pdf".to_string());
    for (text, bottom, top) in paragraphs {
        doc.kids.push(make_paragraph(text, bottom, top));
    }

    let md = to_markdown(&doc).unwrap();
    assert!(md.contains("100"));

    let has_standalone_36 = md.lines().any(|line| line.trim() == "36");
    assert!(!has_standalone_36);
}
12747
/// Two complete-sentence paragraphs must stay separate in the output, joined
/// by a blank line.
#[test]
fn test_semantic_paragraphs_are_not_remerged_in_markdown() {
    let first = make_paragraph("First semantic paragraph ends here.", 520.0, 532.0);
    let second = make_paragraph("Second semantic paragraph starts here.", 500.0, 512.0);

    let mut doc = PdfDocument::new("paragraphs.pdf".to_string());
    doc.kids.push(first);
    doc.kids.push(second);

    let md = to_markdown(&doc).unwrap();
    let expected = "First semantic paragraph ends here.\n\nSecond semantic paragraph starts here.";
    assert!(md.contains(expected));
}
12767
/// A paragraph without terminal punctuation followed by a lowercase
/// continuation ("of interest.") must be joined into one sentence with a
/// single space.
#[test]
fn test_lowercase_semantic_paragraph_continuation_is_merged() {
    let opening = make_paragraph(
        "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference",
        520.0,
        532.0,
    );
    let continuation = make_paragraph("of interest.", 500.0, 512.0);

    let mut doc = PdfDocument::new("continuation.pdf".to_string());
    doc.kids.push(opening);
    doc.kids.push(continuation);

    let md = to_markdown(&doc).unwrap();
    let joined = "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference of interest.";
    assert!(md.contains(joined));
}
12783
/// Paragraphs that begin with roman-numeral markers ("iii.", "iv.") must stay
/// separate paragraphs, divided by a blank line.
#[test]
fn test_semantic_enumerated_paragraphs_are_not_merged() {
    let third_point = make_paragraph(
        "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
        520.0,
        532.0,
    );
    let fourth_point = make_paragraph(
        "iv. This business model will be operating cost-oriented not capital cost-oriented.",
        500.0,
        512.0,
    );

    let mut doc = PdfDocument::new("enumerated-paragraphs.pdf".to_string());
    doc.kids.push(third_point);
    doc.kids.push(fourth_point);

    let md = to_markdown(&doc).unwrap();
    let expected = "iii. Looking at cost items, the cost of raw woods procurement will be highest share.\n\niv. This business model will be operating cost-oriented not capital cost-oriented.";
    assert!(md.contains(expected));
}
12803
/// A figure caption that precedes the first numbered heading on a one-page
/// document must be dropped, so the output starts with the heading itself.
#[test]
fn test_leading_figure_carryover_is_skipped_before_first_numbered_heading() {
    let carried_caption = make_paragraph_at(
        72.0,
        742.0,
        540.0,
        756.0,
        "Figure 6. Mytella strigata biofouling green mussel farms in Bacoor City, Cavite, Manila Bay",
    );
    let numbered_heading = make_heading_at(72.0, 680.0, 260.0, 696.0, "5. Natural dispersal");
    let body = make_paragraph_at(
        72.0,
        640.0,
        540.0,
        654.0,
        "Dispersal by purely natural means is not included as a pathway of biological invasions.",
    );

    let mut doc = PdfDocument::new("leading-figure-carryover.pdf".to_string());
    doc.number_of_pages = 1;
    doc.kids.push(carried_caption);
    doc.kids.push(numbered_heading);
    doc.kids.push(body);

    let md = to_markdown(&doc).unwrap();
    assert!(md.starts_with("# 5. Natural dispersal"));
    assert!(!md.contains("Figure 6. Mytella strigata"));
}
12834
/// List items that already start with "•" must not get a doubled bullet, and
/// the bullet-only and "133" items must not be emitted as list entries.
#[test]
fn test_list_renderer_strips_duplicate_bullets_and_skips_bullet_only_items() {
    let raw_items = ["• First item", "•", "• Second item", "133"];

    let mut doc = PdfDocument::new("bullets.pdf".to_string());
    doc.kids.push(make_fallback_list(&raw_items));

    let md = to_markdown(&doc).unwrap();

    for present in ["- First item", "- Second item"] {
        assert!(md.contains(present));
    }
    for absent in ["- • First item", "\n- •\n", "\n- 133\n"] {
        assert!(!md.contains(absent));
    }
}
12852
/// A list item that is really the wrapped continuation of the previous item
/// ("and down a couple of times ...") must be folded into that item rather
/// than rendered as its own bullet.
#[test]
fn test_list_renderer_merges_wrapped_continuation_items() {
    let raw_items = [
        "Use a micropipette to add 2 μL of loading dye",
        "and down a couple of times to mix the loading dye with the digested DNA.",
        "Use a fresh pipet tip for each reaction tube.",
    ];

    let mut doc = PdfDocument::new("wrapped-list.pdf".to_string());
    doc.kids.push(make_fallback_list(&raw_items));

    let md = to_markdown(&doc).unwrap();

    let merged_item = "- Use a micropipette to add 2 μL of loading dye and down a couple of times to mix the loading dye with the digested DNA.";
    assert!(md.contains(merged_item));
    assert!(md.contains("- Use a fresh pipet tip for each reaction tube."));
    assert!(!md.contains("\n- and down"));
}
12869
/// List items that carry their own roman-numeral markers must be emitted one
/// per line without an added "- " bullet prefix.
#[test]
fn test_list_renderer_keeps_enumerated_items_separate() {
    let raw_items = [
        "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
        "iv. This business model will be operating cost-oriented not capital cost-oriented.",
        "v. Assumed selling price of wood pellet is $100 per tonne and appropriate.",
    ];

    let mut doc = PdfDocument::new("enumerated-list.pdf".to_string());
    doc.kids.push(make_fallback_list(&raw_items));

    let md = to_markdown(&doc).unwrap();

    let expected = "iii. Looking at cost items, the cost of raw woods procurement will be highest share.\niv. This business model will be operating cost-oriented not capital cost-oriented.\nv. Assumed selling price of wood pellet is $100 per tonne and appropriate.";
    assert!(md.contains(expected));
    assert!(!md.contains("- iii."));
}
12883
/// `drop_isolated_noise_lines` removes the stand-alone "1" and "o" lines while
/// keeping the surrounding prose.
#[test]
fn test_postprocess_drops_isolated_single_char_noise_lines() {
    let noisy = "# The Data Journey\n\n1\n\nTo get started.\n\no\n\nNOTE: Keep going.\n";

    let cleaned = drop_isolated_noise_lines(noisy);

    for noise in ["\n1\n", "\no\n"] {
        assert!(!cleaned.contains(noise));
    }
    for kept in ["To get started.", "NOTE: Keep going."] {
        assert!(cleaned.contains(kept));
    }
}
12893
12894 fn make_two_column_table(rows: &[(&str, &str)]) -> ContentElement {
12895 let mut table_rows = Vec::new();
12896 for (row_number, (left, right)) in rows.iter().enumerate() {
12897 let top = 656.0 - row_number as f64 * 18.0;
12898 let bottom = top - 16.0;
12899 let mut cells = Vec::new();
12900 for (col_number, (text, left_x, right_x)) in
12901 [(*left, 72.0, 220.0), (*right, 220.0, 420.0)]
12902 .into_iter()
12903 .enumerate()
12904 {
12905 let content = if text.is_empty() {
12906 Vec::new()
12907 } else {
12908 vec![TableToken {
12909 base: TextChunk {
12910 value: text.to_string(),
12911 bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
12912 font_name: "Test".to_string(),
12913 font_size: 11.0,
12914 font_weight: 400.0,
12915 italic_angle: 0.0,
12916 font_color: "[0.0]".to_string(),
12917 contrast_ratio: 21.0,
12918 symbol_ends: Vec::new(),
12919 text_format: TextFormat::Normal,
12920 text_type: TextType::Regular,
12921 pdf_layer: PdfLayer::Main,
12922 ocg_visible: true,
12923 index: None,
12924 page_number: Some(1),
12925 level: None,
12926 mcid: None,
12927 },
12928 token_type: TableTokenType::Text,
12929 }]
12930 };
12931 cells.push(TableBorderCell {
12932 bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
12933 index: None,
12934 level: None,
12935 row_number,
12936 col_number,
12937 row_span: 1,
12938 col_span: 1,
12939 content,
12940 contents: vec![],
12941 semantic_type: None,
12942 });
12943 }
12944
12945 table_rows.push(TableBorderRow {
12946 bbox: BoundingBox::new(Some(1), 72.0, bottom, 420.0, top),
12947 index: None,
12948 level: None,
12949 row_number,
12950 cells,
12951 semantic_type: None,
12952 });
12953 }
12954
12955 ContentElement::TableBorder(TableBorder {
12956 bbox: BoundingBox::new(
12957 Some(1),
12958 72.0,
12959 656.0 - rows.len() as f64 * 18.0 - 16.0,
12960 420.0,
12961 656.0,
12962 ),
12963 index: None,
12964 level: Some("1".to_string()),
12965 x_coordinates: vec![72.0, 220.0, 420.0],
12966 x_widths: vec![0.0; 3],
12967 y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
12968 y_widths: vec![0.0; rows.len() + 1],
12969 rows: table_rows,
12970 num_rows: rows.len(),
12971 num_columns: 2,
12972 is_bad_table: false,
12973 is_table_transformer: false,
12974 previous_table: None,
12975 next_table: None,
12976 })
12977 }
12978
12979 fn make_chunked_paragraph_line(
12980 segments: &[(&str, f64, f64)],
12981 bottom: f64,
12982 top: f64,
12983 ) -> ContentElement {
12984 let bbox = BoundingBox::new(
12985 Some(1),
12986 segments.first().map(|(_, left, _)| *left).unwrap_or(72.0),
12987 bottom,
12988 segments.last().map(|(_, _, right)| *right).unwrap_or(320.0),
12989 top,
12990 );
12991
12992 let chunks = segments
12993 .iter()
12994 .map(|(text, left, right)| TextChunk {
12995 value: (*text).to_string(),
12996 bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
12997 font_name: "Lato-Regular".to_string(),
12998 font_size: top - bottom,
12999 font_weight: 400.0,
13000 italic_angle: 0.0,
13001 font_color: "#000000".to_string(),
13002 contrast_ratio: 21.0,
13003 symbol_ends: vec![],
13004 text_format: TextFormat::Normal,
13005 text_type: TextType::Regular,
13006 pdf_layer: PdfLayer::Main,
13007 ocg_visible: true,
13008 index: None,
13009 page_number: Some(1),
13010 level: None,
13011 mcid: None,
13012 })
13013 .collect::<Vec<_>>();
13014
13015 let line = TextLine {
13016 bbox: bbox.clone(),
13017 index: None,
13018 level: None,
13019 font_size: top - bottom,
13020 base_line: bottom + 2.0,
13021 slant_degree: 0.0,
13022 is_hidden_text: false,
13023 text_chunks: chunks,
13024 is_line_start: true,
13025 is_line_end: true,
13026 is_list_line: false,
13027 connected_line_art_label: None,
13028 };
13029 let block = TextBlock {
13030 bbox: bbox.clone(),
13031 index: None,
13032 level: None,
13033 font_size: line.font_size,
13034 base_line: line.base_line,
13035 slant_degree: 0.0,
13036 is_hidden_text: false,
13037 text_lines: vec![line],
13038 has_start_line: true,
13039 has_end_line: true,
13040 text_alignment: None,
13041 };
13042 let column = TextColumn {
13043 bbox: bbox.clone(),
13044 index: None,
13045 level: None,
13046 font_size: block.font_size,
13047 base_line: block.base_line,
13048 slant_degree: 0.0,
13049 is_hidden_text: false,
13050 text_blocks: vec![block],
13051 };
13052
13053 ContentElement::Paragraph(SemanticParagraph {
13054 base: SemanticTextNode {
13055 bbox,
13056 index: None,
13057 level: None,
13058 semantic_type: SemanticType::Paragraph,
13059 correct_semantic_score: None,
13060 columns: vec![column],
13061 font_weight: Some(400.0),
13062 font_size: Some(top - bottom),
13063 text_color: None,
13064 italic_angle: None,
13065 font_name: Some("Lato-Regular".to_string()),
13066 text_format: None,
13067 max_font_size: Some(top - bottom),
13068 background_color: None,
13069 is_hidden_text: false,
13070 },
13071 enclosed_top: false,
13072 enclosed_bottom: false,
13073 indentation: 0,
13074 })
13075 }
13076
13077 fn make_n_column_table(rows: &[Vec<&str>], column_bounds: &[(f64, f64)]) -> ContentElement {
13078 let mut table_rows = Vec::new();
13079 for (row_number, row_values) in rows.iter().enumerate() {
13080 let top = 656.0 - row_number as f64 * 18.0;
13081 let bottom = top - 16.0;
13082 let mut cells = Vec::new();
13083 for (col_number, (left_x, right_x)) in column_bounds.iter().enumerate() {
13084 let text = row_values.get(col_number).copied().unwrap_or("");
13085 let content = if text.is_empty() {
13086 Vec::new()
13087 } else {
13088 vec![TableToken {
13089 base: TextChunk {
13090 value: text.to_string(),
13091 bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
13092 font_name: "Test".to_string(),
13093 font_size: 11.0,
13094 font_weight: 400.0,
13095 italic_angle: 0.0,
13096 font_color: "[0.0]".to_string(),
13097 contrast_ratio: 21.0,
13098 symbol_ends: Vec::new(),
13099 text_format: TextFormat::Normal,
13100 text_type: TextType::Regular,
13101 pdf_layer: PdfLayer::Main,
13102 ocg_visible: true,
13103 index: None,
13104 page_number: Some(1),
13105 level: None,
13106 mcid: None,
13107 },
13108 token_type: TableTokenType::Text,
13109 }]
13110 };
13111 cells.push(TableBorderCell {
13112 bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
13113 index: None,
13114 level: None,
13115 row_number,
13116 col_number,
13117 row_span: 1,
13118 col_span: 1,
13119 content,
13120 contents: vec![],
13121 semantic_type: None,
13122 });
13123 }
13124
13125 table_rows.push(TableBorderRow {
13126 bbox: BoundingBox::new(
13127 Some(1),
13128 column_bounds.first().map(|(left, _)| *left).unwrap_or(72.0),
13129 bottom,
13130 column_bounds
13131 .last()
13132 .map(|(_, right)| *right)
13133 .unwrap_or(420.0),
13134 top,
13135 ),
13136 index: None,
13137 level: None,
13138 row_number,
13139 cells,
13140 semantic_type: None,
13141 });
13142 }
13143
13144 let left = column_bounds
13145 .first()
13146 .map(|(value, _)| *value)
13147 .unwrap_or(72.0);
13148 let right = column_bounds
13149 .last()
13150 .map(|(_, value)| *value)
13151 .unwrap_or(420.0);
13152 let x_coordinates = std::iter::once(left)
13153 .chain(column_bounds.iter().map(|(_, right)| *right))
13154 .collect::<Vec<_>>();
13155
13156 ContentElement::TableBorder(TableBorder {
13157 bbox: BoundingBox::new(
13158 Some(1),
13159 left,
13160 656.0 - rows.len() as f64 * 18.0 - 16.0,
13161 right,
13162 656.0,
13163 ),
13164 index: None,
13165 level: Some("1".to_string()),
13166 x_coordinates,
13167 x_widths: vec![0.0; column_bounds.len() + 1],
13168 y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
13169 y_widths: vec![0.0; rows.len() + 1],
13170 rows: table_rows,
13171 num_rows: rows.len(),
13172 num_columns: column_bounds.len(),
13173 is_bad_table: false,
13174 is_table_transformer: false,
13175 previous_table: None,
13176 next_table: None,
13177 })
13178 }
13179
/// A two-column table whose right column holds numbers must still render as a
/// pipe table (separator row plus data rows).
#[test]
fn test_numeric_two_column_table_is_not_misrendered_as_toc() {
    let table_rows = [
        ("Mineral or colloid type", "CEC of pure colloid"),
        ("", "cmolc/kg"),
        ("kaolinite", "10"),
        ("illite", "30"),
    ];

    let mut doc = PdfDocument::new("cec-table.pdf".to_string());
    doc.number_of_pages = 1;
    doc.kids.push(make_two_column_table(&table_rows));

    let md = to_markdown(&doc).unwrap();
    for expected in ["| --- | --- |", "| kaolinite | 10 |"] {
        assert!(md.contains(expected));
    }
}
13195
/// `render_layout_single_caption_chart_document` must return `None` for a
/// document that contains ten body paragraphs, one figure caption and a
/// populated two-column table.
#[test]
fn test_single_caption_chart_renderer_skips_documents_with_populated_tables() {
    let mut doc = PdfDocument::new("table-with-caption.pdf".to_string());
    doc.number_of_pages = 1;

    // Ten identical body paragraphs stacked downward from y = 720.0.
    let body_bottoms = (0..10).map(|idx| 720.0 - idx as f64 * 18.0);
    for bottom in body_bottoms {
        doc.kids.push(make_paragraph(
            "Explanatory body text that should remain outside the chart-only renderer.",
            bottom,
            bottom + 10.0,
        ));
    }

    let caption = make_paragraph(
        "Figure 7.2: Kinematic Viscosity of Water at Atmospheric Pressure.",
        150.0,
        162.0,
    );
    doc.kids.push(caption);

    let table_rows = [
        ("Temperature", "Viscosity"),
        ("20", "1.004"),
        ("25", "0.893"),
    ];
    doc.kids.push(make_two_column_table(&table_rows));

    let rendered = render_layout_single_caption_chart_document(&doc);
    assert!(rendered.is_none());
}
13221
/// A two-column table whose right column is empty below the header must still
/// render as a pipe table with blank right-hand cells.
#[test]
fn test_blank_right_column_table_is_not_misrendered_as_toc() {
    let table_rows = [
        (
            "Added cation",
            "Relative Size & Settling Rates of Floccules",
        ),
        ("K+", ""),
        ("Na+", ""),
        ("Ca2+", ""),
    ];

    let mut doc = PdfDocument::new("flocculation-table.pdf".to_string());
    doc.number_of_pages = 1;
    doc.kids.push(make_two_column_table(&table_rows));

    let md = to_markdown(&doc).unwrap();
    let header_row = "| Added cation | Relative Size & Settling Rates of Floccules |";
    assert!(md.contains(header_row));
    assert!(md.contains("| K+ | |"));
}
13240
/// A two-row table with "1" in the first cell and prose in the right column
/// must render as a single numbered item ("1. ...") instead of a pipe table.
#[test]
fn test_infographic_card_table_renders_as_numbered_item() {
    let card_rows = [
        (
            "1",
            "We're all both consumers and creators of creative work.",
        ),
        (
            "",
            "As consumers, we watch movies, listen to music, read books, and more.",
        ),
    ];

    let mut doc = PdfDocument::new("infographic-card.pdf".to_string());
    doc.number_of_pages = 1;
    doc.kids.push(make_two_column_table(&card_rows));

    let md = to_markdown(&doc).unwrap();
    let numbered_item = "1. We're all both consumers and creators of creative work. As consumers, we watch movies, listen to music, read books, and more.";
    assert!(md.contains(numbered_item));
    assert!(!md.contains("| 1 |"));
}
13262
/// A seven-column table with a sparse group-header row above a sub-header
/// row must keep both rows: the group labels are repeated across their
/// columns and are not concatenated with the sub-header labels.
#[test]
fn test_grouped_header_rows_are_preserved_without_flattening() {
    let rows = [
        vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
        vec![
            "",
            "Alpaca-GPT4",
            "OpenOrca",
            "Synth. Math-Instruct",
            "Orca DPO Pairs",
            "Ultrafeedback Cleaned",
            "Synth. Math-Alignment",
        ],
        vec![
            "Total # Samples",
            "52K",
            "2.91M",
            "126K",
            "12.9K",
            "60.8K",
            "126K",
        ],
    ];
    // One pair per column; presumably each column's horizontal extent
    // (confirm against `make_n_column_table`).
    let column_spans = [
        (72.0, 120.0),
        (120.0, 170.0),
        (170.0, 220.0),
        (220.0, 280.0),
        (280.0, 340.0),
        (340.0, 410.0),
        (410.0, 470.0),
    ];

    let mut doc = PdfDocument::new("grouped-header.pdf".to_string());
    doc.number_of_pages = 1;
    doc.kids.push(make_n_column_table(&rows, &column_spans));

    let rendered = to_markdown(&doc).unwrap();

    let group_header_row = "| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |";
    let sub_header_row = "| | Alpaca-GPT4 | OpenOrca | Synth. Math-Instruct | Orca DPO Pairs | Ultrafeedback Cleaned | Synth. Math-Alignment |";
    assert!(rendered.contains(group_header_row));
    assert!(rendered.contains(sub_header_row));

    // Group and sub-header labels must not have been merged into one cell.
    assert!(!rendered.contains("Instruction OpenOrca"));
    assert!(!rendered.contains("Alignment Ultrafeedback"));
}
13310
/// Page layout: two short paragraphs, a seven-column table, a two-part
/// "Table 1" caption, then an unrelated article paragraph. The rendered
/// markdown must contain the caption and the grouped header row but not the
/// trailing article paragraph.
#[test]
fn test_top_table_plate_renderer_stops_before_article_body() {
    let mut doc = PdfDocument::new("table-plate.pdf".to_string());
    doc.number_of_pages = 1;

    // Tuple fields are forwarded to `make_paragraph_at` in their original
    // order; the x/y names are a guess at the helper's argument roles.
    for (x0, y0, x1, y1, text) in [
        (72.0, 724.0, 200.0, 736.0, "SOLAR 10.7B"),
        (72.0, 704.0, 220.0, 716.0, "Training datasets"),
    ] {
        doc.kids.push(make_paragraph_at(x0, y0, x1, y1, text));
    }

    let rows = [
        vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
        vec![
            "",
            "Alpaca-GPT4",
            "OpenOrca",
            "Synth. Math-Instruct",
            "Orca DPO Pairs",
            "Ultrafeedback Cleaned",
            "Synth. Math-Alignment",
        ],
        vec![
            "Total # Samples",
            "52K",
            "2.91M",
            "126K",
            "12.9K",
            "60.8K",
            "126K",
        ],
        vec![
            "Maximum # Samples Used",
            "52K",
            "100K",
            "52K",
            "12.9K",
            "60.8K",
            "20.1K",
        ],
        vec!["Open Source", "O", "O", "✗", "O", "O", "✗"],
    ];
    let column_spans = [
        (78.0, 125.0),
        (125.0, 175.0),
        (175.0, 225.0),
        (225.0, 285.0),
        (285.0, 345.0),
        (345.0, 415.0),
        (415.0, 490.0),
    ];
    doc.kids.push(make_n_column_table(&rows, &column_spans));

    // The two caption fragments, then the article paragraph that must be cut.
    for (x0, y0, x1, y1, text) in [
        (
            72.0,
            500.0,
            310.0,
            514.0,
            "Table 1: Training datasets used for the instruction and alignment tuning stages, respectively.",
        ),
        (
            286.0,
            484.0,
            526.0,
            498.0,
            "Open source indicates whether the dataset is open-sourced.",
        ),
        (
            72.0,
            360.0,
            290.0,
            388.0,
            "Comparison to other up-scaling methods. Unlike Komatsuzaki et al. (2022)...",
        ),
    ] {
        doc.kids.push(make_paragraph_at(x0, y0, x1, y1, text));
    }

    let rendered = to_markdown(&doc).unwrap();
    assert!(rendered.contains("Table 1: Training datasets used for the instruction"));
    assert!(rendered.contains("| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |"));
    assert!(!rendered.contains("Comparison to other up-scaling methods"));
}
13393
/// Page layout: five equation lead-in paragraphs, a "7.2." section heading,
/// three body fragments and a running footer. The output must start at the
/// heading, join the three body fragments into one sentence, and contain
/// neither the first lead-in paragraph nor the footer.
#[test]
fn test_late_section_boundary_renderer_drops_equation_carryover() {
    let mut doc = PdfDocument::new("late-section.pdf".to_string());
    doc.number_of_pages = 1;

    // Tuple fields are forwarded to `make_paragraph_at` in their original
    // order; the x/y names are a guess at the helper's argument roles.
    let lead_in_paragraphs = [
        (
            72.0,
            700.0,
            540.0,
            714.0,
            "The horizontal distance traveled by the jet is equal to:",
        ),
        (
            72.0,
            640.0,
            540.0,
            654.0,
            "The vertical position of the jet may be calculated as:",
        ),
        (72.0, 580.0, 260.0, 594.0, "Rearranging Equation (8) gives:"),
        (
            72.0,
            520.0,
            420.0,
            534.0,
            "Substitution into Equation 7 results in:",
        ),
        (
            72.0,
            460.0,
            280.0,
            474.0,
            "Equations (10) can be rearranged to find Cv:",
        ),
    ];
    for (x0, y0, x1, y1, text) in lead_in_paragraphs {
        doc.kids.push(make_paragraph_at(x0, y0, x1, y1, text));
    }

    let section_heading = make_heading_at(
        72.0,
        350.0,
        420.0,
        366.0,
        "7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE",
    );
    doc.kids.push(section_heading);

    // Three body fragments followed by the running footer.
    let trailing_paragraphs = [
        (
            72.0,
            326.0,
            380.0,
            340.0,
            "If C_d is assumed to be constant, then a graph of Q plotted against",
        ),
        (400.0, 326.0, 540.0, 340.0, "(Equation 6) will be linear, and"),
        (72.0, 310.0, 240.0, 324.0, "the slope of this graph will be:"),
        (
            360.0,
            36.0,
            550.0,
            48.0,
            "EXPERIMENT #6: ORIFICE AND FREE JET FLOW 53",
        ),
    ];
    for (x0, y0, x1, y1, text) in trailing_paragraphs {
        doc.kids.push(make_paragraph_at(x0, y0, x1, y1, text));
    }

    let rendered = to_markdown(&doc).unwrap();
    let joined_body = "If C_d is assumed to be constant, then a graph of Q plotted against (Equation 6) will be linear, and the slope of this graph will be:";
    assert!(rendered.starts_with("# 7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE"));
    assert!(rendered.contains(joined_body));
    assert!(!rendered.contains("The horizontal distance traveled by the jet"));
    assert!(!rendered.contains("EXPERIMENT #6"));
}
13477
/// A five-column table whose first data row holds only a text fragment in
/// one cell must render without that fragment while keeping the regular
/// data rows.
#[test]
fn test_leading_table_carryover_row_is_trimmed_from_general_renderer() {
    let rows = [
        vec![
            "Jurisdiction",
            "GATS XVII Reservation (1994)",
            "Foreign Ownership Permitted",
            "Restrictions on Foreign Ownership",
            "Foreign Ownership Reporting Requirements",
        ],
        // Row with a single populated cell: the fragment expected to be dropped.
        vec![
            "",
            "",
            "",
            "right required to acquire desert lands and continue the prior page",
            "",
        ],
        vec!["Finland", "N", "Y", "Prior approval may be required.", ""],
        vec!["France", "N", "Y", "None.", ""],
    ];
    let column_spans = [
        (72.0, 150.0),
        (150.0, 235.0),
        (235.0, 330.0),
        (330.0, 500.0),
        (500.0, 560.0),
    ];

    let mut doc = PdfDocument::new("carryover-table.pdf".to_string());
    doc.number_of_pages = 1;
    doc.kids.push(make_n_column_table(&rows, &column_spans));

    let rendered = to_markdown(&doc).unwrap();
    assert!(!rendered.contains("right required to acquire desert lands"));
    assert!(rendered.contains("| Finland | N | Y | Prior approval may be required. | |"));
}
13514
/// Page layout: a title paragraph, a five-column table that opens with a
/// fragment-only row, and a footer paragraph. The output must start with the
/// title as a `#` heading, omit both the fragment row and the footer, and
/// keep the Finland data row.
#[test]
fn test_single_table_report_renderer_promotes_title_and_skips_footer() {
    let rows = [
        vec![
            "Jurisdiction",
            "GATS XVII Reservation (1994)",
            "Foreign Ownership Permitted",
            "Restrictions on Foreign Ownership",
            "Foreign Ownership Reporting Requirements",
        ],
        vec![
            "",
            "",
            "",
            "right required to acquire desert lands and continue the prior page",
            "",
        ],
        vec![
            "Finland",
            "N",
            "Y",
            "Prior approval from the Government of Aland may be required.",
            "",
        ],
        vec!["France", "N", "Y", "None.", ""],
    ];
    let column_spans = [
        (72.0, 150.0),
        (150.0, 235.0),
        (235.0, 330.0),
        (330.0, 500.0),
        (500.0, 560.0),
    ];

    let mut doc = PdfDocument::new("single-table-report.pdf".to_string());
    doc.number_of_pages = 1;

    let title = make_paragraph_at(
        140.0,
        674.0,
        474.0,
        688.0,
        "Restrictions on Land Ownership by Foreigners in Selected Jurisdictions",
    );
    doc.kids.push(title);
    doc.kids.push(make_n_column_table(&rows, &column_spans));
    let footer = make_paragraph_at(350.0, 36.0, 548.0, 48.0, "The Law Library of Congress 7");
    doc.kids.push(footer);

    let rendered = to_markdown(&doc).unwrap();
    let expected_heading =
        "# Restrictions on Land Ownership by Foreigners in Selected Jurisdictions";
    let expected_finland_row =
        "| Finland | N | Y | Prior approval from the Government of Aland may be required. | |";
    assert!(rendered.starts_with(expected_heading));
    assert!(!rendered.contains("right required to acquire desert lands"));
    assert!(!rendered.contains("The Law Library of Congress 7"));
    assert!(rendered.contains(expected_finland_row));
}
13577
/// A paragraph ending in a hyphenated line break ("Eco-") is followed by a
/// short continuation line and a single-column table; the continuation line
/// must appear in the output as a `#` heading.
#[test]
fn test_hyphenated_table_title_continuation_renders_as_heading() {
    let mut doc = PdfDocument::new("hyphenated-table-title.pdf".to_string());
    doc.number_of_pages = 1;

    // Tuple fields are forwarded to `make_paragraph_at` in their original
    // order; the x/y names are a guess at the helper's argument roles.
    for (x0, y0, x1, y1, text) in [
        (
            72.0,
            724.0,
            520.0,
            738.0,
            "With this in mind, here we have the 7 key competence areas selected to form a part of Eco-",
        ),
        (72.0, 704.0, 260.0, 718.0, "Circle's Competence Framework:"),
    ] {
        doc.kids.push(make_paragraph_at(x0, y0, x1, y1, text));
    }

    let rows = [
        vec!["Eco-Circle Competence Framework"],
        vec!["#1: The 3 Rs: Recycle-Reuse-Reduce"],
        vec!["#2: Lifecycle of Circular Economy"],
    ];
    let column_spans = [(140.0, 460.0)];
    doc.kids.push(make_n_column_table(&rows, &column_spans));

    let md = to_markdown(&doc).unwrap();
    // On failure the whole rendered markdown is printed.
    assert!(md.contains("# Circle's Competence Framework:"), "{md}");
}
13608
/// A heading whose text repeats the column labels of the table above it must
/// still appear in the output, but not behind a `#` heading marker.
#[test]
fn test_duplicate_table_header_heading_is_demoted() {
    const REPEATED_LABELS: &str = "Saccharometer DI Water Glucose Solution Yeast Suspension";

    let mut doc = PdfDocument::new("duplicate-table-header-heading.pdf".to_string());
    doc.number_of_pages = 1;
    doc.kids
        .push(make_heading("MOHAVE COMMUNITY COLLEGE BIO181"));

    let rows = [
        vec![
            "",
            "Saccharometer",
            "DI Water",
            "Glucose Solution",
            "Yeast Suspension",
        ],
        vec!["1", "", "8 ml", "6 ml", "0 ml"],
        vec!["2", "", "12 ml", "0 ml", "2 ml"],
        vec!["3", "", "6 ml", "6 ml", "2 ml"],
    ];
    let column_spans = [
        (72.0, 110.0),
        (110.0, 210.0),
        (210.0, 300.0),
        (300.0, 430.0),
        (430.0, 540.0),
    ];
    doc.kids.push(make_n_column_table(&rows, &column_spans));

    // The heading that repeats the table's column labels.
    doc.kids
        .push(make_heading_at(72.0, 92.0, 390.0, 108.0, REPEATED_LABELS));

    // Tuple fields are forwarded to `make_paragraph_at` in their original
    // order; the x/y names are a guess at the helper's argument roles.
    for (x0, y0, x1, y1, text) in [
        (72.0, 72.0, 120.0, 88.0, "below"),
        (72.0, 56.0, 240.0, 72.0, "1 16 ml 12 ml"),
        (296.0, 56.0, 340.0, 72.0, "0 ml"),
    ] {
        doc.kids.push(make_paragraph_at(x0, y0, x1, y1, text));
    }

    let md = to_markdown(&doc).unwrap();
    // Both assertions print the whole rendered markdown on failure.
    assert!(
        md.contains("Saccharometer DI Water Glucose Solution Yeast Suspension"),
        "{md}"
    );
    assert!(
        !md.contains("# Saccharometer DI Water Glucose Solution Yeast Suspension"),
        "{md}"
    );
}
13660
/// Two bands of single-chunk lines sit above a three-column table and one
/// stray chunk sits below it. The rendered markdown must contain a header
/// row built from the first band plus "Pack", a row built from the second
/// band's descriptions, and the trailing "models" chunk joined onto table text.
#[test]
fn test_geometric_panel_headers_are_promoted_into_table() {
    let mut doc = PdfDocument::new("ai-pack-panel.pdf".to_string());

    // First band: three label chunks sharing the 720.0/732.0 pair.
    for chunk in [
        ("OCR", 220.0, 250.0),
        ("Recommendation", 430.0, 540.0),
        ("Product semantic search", 660.0, 860.0),
    ] {
        doc.kids
            .push(make_chunked_paragraph_line(&[chunk], 720.0, 732.0));
    }

    // Second band: "Pack" plus three descriptions sharing the 684.0/696.0 pair.
    for chunk in [
        ("Pack", 72.0, 110.0),
        ("A solution that recognizes characters", 140.0, 340.0),
        ("A solution that recommends the best products", 390.0, 620.0),
        ("A solution that enables semantic search", 650.0, 900.0),
    ] {
        doc.kids
            .push(make_chunked_paragraph_line(&[chunk], 684.0, 696.0));
    }

    let rows = [
        vec![
            "Achieved 1st place in the OCR World Competition",
            "Team with specialists and technologies",
            "Creation of the first natural language evaluation",
        ],
        vec![
            "The team includes specialists who have",
            "received Kaggle's Gold Medal recommendation",
            "system in Korean (KLUE)",
        ],
        vec![
            "presented 14 papers in renowned AI conferences",
            "top-tier recommendation",
            "Shopee subject",
        ],
    ];
    let column_spans = [(120.0, 360.0), (360.0, 630.0), (630.0, 910.0)];
    doc.kids.push(make_n_column_table(&rows, &column_spans));

    // Stray chunk that follows the table.
    let trailing_chunk = [("models", 430.0, 490.0)];
    doc.kids
        .push(make_chunked_paragraph_line(&trailing_chunk, 552.0, 564.0));

    let rendered = to_markdown(&doc).unwrap();
    assert!(rendered.contains("| Pack | OCR | Recommendation | Product semantic search |"));
    assert!(rendered.contains("| A solution that recognizes characters | A solution that recommends the best products | A solution that enables semantic search |"));
    assert!(rendered.contains(
        "received Kaggle's Gold Medal recommendation top-tier recommendation models"
    ));
}
13732
/// Three label chunks sit above a four-column table whose first column holds
/// the row labels "Pack", "Application" and "Highlight". The output must have
/// a header row of "Pack" plus the three labels, keep the remaining rows, and
/// must not leave the labels as separate paragraphs.
#[test]
fn test_embedded_stub_header_is_promoted_from_first_table_column() {
    let mut doc = PdfDocument::new("embedded-stub-header.pdf".to_string());

    // The three label chunks share the 720.0/732.0 pair.
    for chunk in [
        ("OCR", 220.0, 250.0),
        ("Recommendation", 430.0, 540.0),
        ("Product semantic search", 660.0, 860.0),
    ] {
        doc.kids
            .push(make_chunked_paragraph_line(&[chunk], 720.0, 732.0));
    }

    let rows = [
        vec![
            "Pack",
            "A solution that recognizes characters in an image and extracts necessary information",
            "A solution that recommends the best products and contents",
            "A solution that enables semantic search and organizes key information",
        ],
        vec![
            "Application",
            "Applicable to all fields that require text extraction",
            "Applicable to all fields that use any form of recommendation",
            "Applicable to all fields that deal with unstructured data",
        ],
        vec![
            "Highlight",
            "Achieved 1st place in the OCR World Competition",
            "Received Kaggle's Gold Medal recommendation",
            "Creation of the first natural language evaluation system in Korean",
        ],
    ];
    let column_spans = [
        (72.0, 120.0),
        (120.0, 360.0),
        (360.0, 630.0),
        (630.0, 910.0),
    ];
    doc.kids.push(make_n_column_table(&rows, &column_spans));

    let rendered = to_markdown(&doc).unwrap();
    assert!(rendered.contains("| Pack | OCR | Recommendation | Product semantic search |"));
    assert!(
        rendered.contains("| Application | Applicable to all fields that require text extraction |")
    );
    assert!(rendered.contains("| Highlight | Achieved 1st place in the OCR World Competition |"));
    // The labels must not remain as three standalone paragraphs.
    assert!(!rendered.contains("OCR\n\nRecommendation\n\nProduct semantic search"));
}
13788
/// Splits a three-chunk line against seven slots and checks that exactly
/// three fragments come back, assigned to slots 0, 1 and 4 with the chunk
/// texts unchanged.
#[test]
fn test_geometric_chunk_alignment_splits_header_line_into_columns() {
    let header_chunks = [
        ("Properties", 72.0, 145.0),
        ("Instruction", 180.0, 255.0),
        ("Alignment", 480.0, 545.0),
    ];
    let slots = [
        (72.0, 170.0),
        (170.0, 280.0),
        (280.0, 380.0),
        (380.0, 480.0),
        (480.0, 600.0),
        (600.0, 720.0),
        (720.0, 850.0),
    ];

    let line = make_chunked_paragraph_line(&header_chunks, 720.0, 732.0);
    let chunk_lines = extract_chunk_lines(&line);
    let fragments = split_line_into_slot_fragments(&chunk_lines[0], &slots);

    // Check the length first so the zip below cannot hide a missing fragment.
    assert_eq!(fragments.len(), 3);
    let expected = [(0, "Properties"), (1, "Instruction"), (4, "Alignment")];
    for (fragment, (slot_idx, text)) in fragments.iter().zip(expected) {
        assert_eq!(fragment.slot_idx, slot_idx);
        assert_eq!(fragment.text, text);
    }
}
13822
/// Two pipe tables separated only by a `#` heading: after
/// `merge_adjacent_pipe_tables` the heading text must appear as a pipe row,
/// the `#` marker must be gone, and the second table's content must survive.
#[test]
fn test_merge_tables_across_heading() {
    let input = concat!(
        "some text\n\n",
        "| Area | Competence |\n",
        "| --- | --- |\n",
        "| Row1 | Val1 |\n",
        "| Row2 | Val2 |\n",
        "\n",
        "# Heading Between\n",
        "\n",
        "| Row3 | Val3 |\n",
        "| --- | --- |\n",
        "\n",
        "more text\n",
    );

    let merged = merge_adjacent_pipe_tables(input);

    let heading_is_pipe_row = merged.contains("| Heading Between |");
    assert!(
        heading_is_pipe_row,
        "Heading should be in pipe row: {}",
        merged
    );

    let heading_marker_present = merged.contains("# Heading Between");
    assert!(
        !heading_marker_present,
        "Heading marker should be removed: {}",
        merged
    );

    // Any occurrence of "Row3" satisfies this check (a "| Row3 |" cell is
    // just one such occurrence).
    let row3_present = merged.contains("Row3");
    assert!(row3_present, "Row3 should exist: {}", merged);
}
13857
/// Two pipe tables with different header rows, separated by a "Table 6"
/// caption line, must stay separate: the caption and both headers remain and
/// the caption is not turned into a pipe row.
#[test]
fn test_merge_tables_does_not_cross_distinct_headers() {
    let input = concat!(
        "| Model | Score |\n",
        "| --- | --- |\n",
        "| A | 1 |\n",
        "\n",
        "Table 6: Performance comparison amongst the merge candidates.\n",
        "\n",
        "| Model | Method | Score |\n",
        "| --- | --- | --- |\n",
        "| B | Avg | 2 |\n",
    );

    let merged = merge_adjacent_pipe_tables(input);

    assert!(merged.contains("Table 6: Performance comparison amongst the merge candidates."));
    assert!(merged.contains("| Model | Score |"));
    assert!(merged.contains("| Model | Method | Score |"));
    assert!(
        !merged.contains("| Table 6: Performance comparison amongst the merge candidates. |")
    );
}
13878
/// A figure caption fused with bar values and axis ticks, followed by a line
/// of years plus a source note and a running footer: the normalized output
/// must have a `##` caption heading, year/value table rows, an italic source
/// line, and no footer.
#[test]
fn test_normalize_chart_like_markdown_extracts_series_tables() {
    let input = concat!(
        "Figure 1.7. Non-citizen population in Malaysia (in thousands) 3,323 3,500 3,288 3,230 3,140 2,907 3,000 2,693 2,500 2,000 1,500 1,000 500 0\n\n",
        "2016 2017 2018 2019 2020 2021 Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.\n\n",
        "ASEAN Migration Outlook 19\n",
    );

    let output = normalize_chart_like_markdown(input);

    assert!(
        output.contains("## Figure 1.7. Non-citizen population in Malaysia (in thousands)")
    );
    assert!(output.contains("| 2016 | 3,323 |"));
    assert!(output.contains("| 2021 | 2,693 |"));
    assert!(output.contains(
        "*Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.*"
    ));
    assert!(!output.contains("ASEAN Migration Outlook 19"));
}
13896
/// A figure caption split across two paragraphs must be joined and promoted
/// to a `##` heading, while the following body paragraph is kept.
#[test]
fn test_normalize_chart_like_markdown_promotes_structural_captions() {
    let input = concat!(
        "Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or\n\n",
        "The Wonderful Lamp.\n\n",
        "Body paragraph.\n",
    );

    let output = normalize_chart_like_markdown(input);

    let joined_caption_heading =
        "## Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or The Wonderful Lamp";
    assert!(output.contains(joined_caption_heading));
    assert!(output.contains("Body paragraph."));
}
13909
/// A figure caption followed by a header-only pipe table whose cells pair a
/// value with a year ("126 2014", ...): the output must contain the caption
/// as a heading and a "Year" table with one row per year, and must not keep
/// the original garbled header row.
#[test]
fn test_normalize_chart_like_markdown_reconstructs_header_pair_chart_table() {
    let input = concat!(
        "Figure 4.8. Domestic Wood Pellets Production\n\n",
        "| 8 | 800 200 | 126 2014 | 120 2015 | 120 2016 | 127 2017 | 131 2018 | 147 2019 |\n",
        "| --- | --- | --- | --- | --- | --- | --- | --- |\n\n",
        "Source: Forestry Agency, Ministry of Agriculture, Forestry and Fishery (MAFF), 2020.\n",
    );

    let output = normalize_chart_like_markdown(input);

    assert!(output.contains("# Figure 4.8. Domestic Wood Pellets Production"));
    assert!(output.contains("| Year | Domestic Wood Pellets Production |"));
    assert!(output.contains("| 2014 | 126 |"));
    assert!(output.contains("| 2019 | 147 |"));
    assert!(!output.contains("| 8 | 800 200 |"));
}
13924
/// A single-cell pipe table made of chart numbers and axis labels must be
/// removed (no `| --- |` separator remains), while the title line after it
/// is kept.
#[test]
fn test_normalize_chart_like_markdown_drops_numeric_axis_artifact_table() {
    let input = concat!(
        "| 31 1 0 2 23 2 2 2 0 5 10 15 20 25 30 35 Event Celebration Information Videograph 2019 2020 |\n",
        "| --- |\n\n",
        "Distribution of Komnas HAM's YouTube Content (2019-2020)\n",
    );

    let output = normalize_chart_like_markdown(input);

    assert!(!output.contains("| --- |"));
    assert!(output.contains("Distribution of Komnas HAM's YouTube Content (2019-2020)"));
}
13935
/// A single-cell pipe table holding only a URL path fragment must be removed
/// from the output, while the footnote line after it is kept.
#[test]
fn test_normalize_chart_like_markdown_drops_url_fragment_table() {
    let input = concat!(
        "## Figure 6 DPN Argentina Content: World Health Day Celebration\n\n",
        "| na/status/1379765916259483648 |\n",
        "| --- |\n\n",
        "98 DPN Argentina, accessed on 5 December 2021.\n",
    );

    let output = normalize_chart_like_markdown(input);

    assert!(!output.contains("/status/1379765916259483648 |"));
    assert!(output.contains("98 DPN Argentina, accessed on 5 December 2021."));
}
13947
/// A mostly-empty eight-column pipe table sitting directly before a
/// "Figure 8.6" caption must be removed from the output, while the caption
/// is kept.
#[test]
fn test_normalize_chart_like_markdown_drops_sparse_table_before_caption() {
    let input = concat!(
        "What’s unique about the growth of Alligator Gars is their fast growth.\n\n",
        "| in | cm | | Length | of | Gar | Fish | Age |\n",
        "| --- | --- | --- | --- | --- | --- | --- | --- |\n",
        "| 120) | 300 | | | | | | |\n",
        "| 100+ | 250 | | | | | | |\n",
        "| 80+ | 200 | | | | | | |\n",
        "| 20. | 50 | G | | | | | Vi |\n",
        "| 0 | 0 | | | | | | |\n",
        "| | 0 | 10 | 30 | | 40 | 50 | 60 |\n\n",
        "Figure 8.6: Growth in length of Alligator Gar in Texas.\n",
    );

    let output = normalize_chart_like_markdown(input);

    assert!(!output.contains("| in | cm |"));
    assert!(output.contains("Figure 8.6: Growth in length of Alligator Gar in Texas."));
}
13965
/// An eight-column table with eight data rows at the very top, followed by a
/// "Table 2" caption, a section heading and prose: the output must begin
/// with the table header row and contain none of the caption, heading or
/// prose.
#[test]
fn test_normalize_chart_like_markdown_trims_large_top_table_plate() {
    let input = concat!(
        "| A | B | C | D | E | F | G | H |\n",
        "| --- | --- | --- | --- | --- | --- | --- | --- |\n",
        // Eight identical data rows; the last one is followed by a blank line.
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n",
        "| 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\n",
        "Table 2: Evaluation results for SOLAR 10.7B and SOLAR 10.7B-Instruct along with other top-performing models in the paper.\n\n",
        "# 4.2 Main Results\n\n",
        "The surrounding prose should be dropped.\n",
    );

    let output = normalize_chart_like_markdown(input);

    assert!(output.starts_with("| A | B | C | D | E | F | G | H |"));
    assert!(!output.contains("Table 2:"));
    assert!(!output.contains("4.2 Main Results"));
    assert!(!output.contains("surrounding prose"));
}
13988}