1#[cfg(not(target_arch = "wasm32"))]
4use regex::Regex;
5use std::collections::{HashMap, HashSet};
6#[cfg(not(target_arch = "wasm32"))]
7use std::path::Path;
8#[cfg(not(target_arch = "wasm32"))]
9use std::process::Command;
10
11use crate::models::bbox::BoundingBox;
12use crate::models::chunks::TextChunk;
13use crate::models::content::ContentElement;
14use crate::models::document::PdfDocument;
15use crate::models::enums::SemanticType;
16use crate::models::semantic::SemanticTextNode;
17use crate::models::table::TableTokenRow;
18use crate::EdgePdfError;
19
/// Bounding-box layout data held by [`LayoutSourceCache`] after a successful read.
#[cfg(not(target_arch = "wasm32"))]
struct CachedBBoxLayout {
    /// Page width as returned by `read_pdftotext_bbox_layout_lines`.
    page_width: f64,
    /// Layout lines returned by the same read.
    lines: Vec<BBoxLayoutLine>,
    /// Blocks derived from `lines` via `collect_bbox_layout_blocks`.
    blocks: Vec<BBoxLayoutBlock>,
}
26
/// Memoises the layout reads performed against a document's source file.
///
/// Each slot is a two-level `Option`: the outer level is `None` until the first
/// load attempt, and the inner level records whether that attempt produced data.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Default)]
struct LayoutSourceCache {
    /// Result of the bbox-layout read; `Some(None)` means the read was tried and yielded nothing.
    bbox_layout: Option<Option<CachedBBoxLayout>>,
    /// Result of the plain layout-lines read, with the same two-level meaning.
    layout_lines: Option<Option<Vec<String>>>,
}
33
34#[cfg(not(target_arch = "wasm32"))]
35impl LayoutSourceCache {
36 fn bbox_layout(&mut self, doc: &PdfDocument) -> Option<&CachedBBoxLayout> {
37 if self.bbox_layout.is_none() {
38 let loaded = doc.source_path.as_deref().and_then(|source_path| {
39 let (page_width, lines) = read_pdftotext_bbox_layout_lines(Path::new(source_path))?;
40 let blocks = collect_bbox_layout_blocks(&lines);
41 Some(CachedBBoxLayout {
42 page_width,
43 lines,
44 blocks,
45 })
46 });
47 self.bbox_layout = Some(loaded);
48 }
49 self.bbox_layout.as_ref().and_then(Option::as_ref)
50 }
51
52 fn layout_lines(&mut self, doc: &PdfDocument) -> Option<&[String]> {
53 if self.layout_lines.is_none() {
54 let loaded = doc
55 .source_path
56 .as_deref()
57 .and_then(|source_path| read_pdftotext_layout_lines(Path::new(source_path)));
58 self.layout_lines = Some(loaded);
59 }
60 self.layout_lines
61 .as_ref()
62 .and_then(Option::as_ref)
63 .map(Vec::as_slice)
64 }
65}
66
67pub fn to_markdown(doc: &PdfDocument) -> Result<String, EdgePdfError> {
72 #[cfg(not(target_arch = "wasm32"))]
73 let mut layout_cache = LayoutSourceCache::default();
74 #[cfg(not(target_arch = "wasm32"))]
75 if let Some(rendered) = render_layout_open_plate_document_cached(doc, &mut layout_cache) {
76 return Ok(rendered);
77 }
78 #[cfg(not(target_arch = "wasm32"))]
79 if let Some(rendered) =
80 render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
81 {
82 return Ok(rendered);
83 }
84 #[cfg(not(target_arch = "wasm32"))]
85 if let Some(rendered) = render_layout_captioned_media_document_cached(doc, &mut layout_cache) {
86 return Ok(rendered);
87 }
88 #[cfg(not(target_arch = "wasm32"))]
89 if let Some(rendered) =
90 render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
91 {
92 return Ok(rendered);
93 }
94 #[cfg(not(target_arch = "wasm32"))]
95 if let Some(rendered) = render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
96 {
97 return Ok(rendered);
98 }
99 #[cfg(not(target_arch = "wasm32"))]
100 if let Some(rendered) = render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
101 {
102 return Ok(rendered);
103 }
104 #[cfg(not(target_arch = "wasm32"))]
105 if let Some(rendered) =
106 render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
107 {
108 return Ok(rendered);
109 }
110 #[cfg(not(target_arch = "wasm32"))]
111 if let Some(rendered) = render_layout_toc_document_cached(doc, &mut layout_cache) {
112 return Ok(rendered);
113 }
114 if looks_like_contents_document(doc) {
115 return Ok(render_contents_document(doc));
116 }
117 if looks_like_compact_toc_document(doc) {
118 return Ok(render_compact_toc_document(doc));
119 }
120 #[cfg(not(target_arch = "wasm32"))]
121 if let Some(rendered) = render_layout_projection_sheet_document_cached(doc, &mut layout_cache) {
122 return Ok(rendered);
123 }
124 #[cfg(not(target_arch = "wasm32"))]
125 if let Some(rendered) = render_layout_appendix_tables_document_cached(doc, &mut layout_cache) {
126 return Ok(rendered);
127 }
128 #[cfg(not(target_arch = "wasm32"))]
129 if let Some(rendered) = render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
130 {
131 return Ok(rendered);
132 }
133 #[cfg(not(target_arch = "wasm32"))]
134 if let Some(rendered) = render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
135 {
136 return Ok(rendered);
137 }
138 #[cfg(not(target_arch = "wasm32"))]
139 if let Some(rendered) =
140 render_layout_registration_report_document_cached(doc, &mut layout_cache)
141 {
142 return Ok(rendered);
143 }
144 if let Some(rendered) = render_top_table_plate_document(doc) {
145 return Ok(rendered);
146 }
147 if let Some(rendered) = render_single_table_report_document(doc) {
148 return Ok(rendered);
149 }
150 if let Some(rendered) = render_late_section_boundary_document(doc) {
151 return Ok(rendered);
152 }
153 #[cfg(not(target_arch = "wasm32"))]
154 if let Some(rendered) = render_layout_matrix_document_cached(doc, &mut layout_cache) {
155 return Ok(rendered);
156 }
157 #[cfg(not(target_arch = "wasm32"))]
158 if let Some(rendered) = render_layout_panel_stub_document_cached(doc, &mut layout_cache) {
159 return Ok(rendered);
160 }
161
162 Ok(render_markdown_core(doc))
163}
164
165fn render_markdown_core(doc: &PdfDocument) -> String {
166 let mut output = String::new();
167
168 if let Some(ref title) = doc.title {
170 let trimmed = title.trim();
171 if !trimmed.is_empty() && !should_skip_document_title(doc, trimmed) {
172 if should_render_document_title_as_plaintext(doc, trimmed) {
173 output.push_str(trimmed);
174 output.push_str("\n\n");
175 } else {
176 output.push_str(&format!("# {}\n\n", trimmed));
177 }
178 }
179 }
180
181 if doc.kids.is_empty() {
182 output.push_str("*No content extracted.*\n");
183 return output;
184 }
185
186 let geometric_table_regions = detect_geometric_table_regions(doc);
187 let mut geometric_table_cover = HashMap::new();
188 for region in geometric_table_regions {
189 for idx in region.start_idx..=region.end_idx {
190 geometric_table_cover.insert(idx, region.clone());
191 }
192 }
193
194 let mut i = 0usize;
195 while i < doc.kids.len() {
196 if let Some(region) = geometric_table_cover.get(&i) {
197 output.push_str(®ion.rendered);
198 i = region.end_idx + 1;
199 continue;
200 }
201
202 match &doc.kids[i] {
203 ContentElement::Heading(h) => {
204 let text = h.base.base.value();
205 let trimmed = text.trim();
206 if trimmed.is_empty() || should_skip_heading_text(trimmed) {
207 i += 1;
208 continue;
209 }
210
211 if looks_like_bottom_margin_heading(doc, i) {
214 output.push_str(&escape_md_line_start(trimmed));
215 output.push_str("\n\n");
216 i += 1;
217 continue;
218 }
219
220 if should_demote_period_heading(trimmed) {
223 output.push_str(&escape_md_line_start(trimmed));
224 output.push_str("\n\n");
225 i += 1;
226 continue;
227 }
228
229 if should_demote_comma_heading(trimmed) {
231 output.push_str(&escape_md_line_start(trimmed));
232 output.push_str("\n\n");
233 i += 1;
234 continue;
235 }
236
237 if should_demote_math_heading(trimmed) {
239 output.push_str(&escape_md_line_start(trimmed));
240 output.push_str("\n\n");
241 i += 1;
242 continue;
243 }
244
245 if should_demote_percentage_heading(trimmed) {
247 output.push_str(&escape_md_line_start(trimmed));
248 output.push_str("\n\n");
249 i += 1;
250 continue;
251 }
252
253 if starts_with_caption_prefix(trimmed) {
257 output.push_str(&escape_md_line_start(trimmed));
258 output.push_str("\n\n");
259 i += 1;
260 continue;
261 }
262
263 if should_demote_bibliography_heading(trimmed) {
266 output.push_str(&escape_md_line_start(trimmed));
267 output.push_str("\n\n");
268 i += 1;
269 continue;
270 }
271
272 if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
273 if should_demote_heading_to_paragraph(trimmed, &next_text) {
274 let mut merged = trimmed.to_string();
275 merge_paragraph_text(&mut merged, &next_text);
276 output.push_str(&escape_md_line_start(merged.trim()));
277 output.push_str("\n\n");
278 i += 2;
279 continue;
280 }
281 }
282
283 let mut merged_heading = trimmed.to_string();
287 while let Some(ContentElement::Heading(next_h)) = doc.kids.get(i + 1) {
288 let next_text = next_h.base.base.value();
289 let next_trimmed = next_text.trim();
290 if next_trimmed.is_empty() || should_skip_heading_text(next_trimmed) {
291 i += 1;
292 continue;
293 }
294 if merged_heading.len() + 1 + next_trimmed.len() > 200 {
296 break;
297 }
298 merge_paragraph_text(&mut merged_heading, next_trimmed);
299 i += 1;
300 }
301
302 let cleaned_heading = strip_trailing_page_number(merged_heading.trim());
303
304 if let Some(split_pos) = find_merged_subsection_split(cleaned_heading) {
306 let first = cleaned_heading[..split_pos].trim();
307 let second = cleaned_heading[split_pos..].trim();
308 output.push_str(&format!("# {}\n\n", first));
309 output.push_str(&format!("# {}\n\n", second));
310 } else {
311 output.push_str(&format!("# {}\n\n", cleaned_heading));
312 }
313 }
314 ContentElement::NumberHeading(nh) => {
315 let text = nh.base.base.base.value();
316 let trimmed = text.trim();
317 if trimmed.is_empty() || should_skip_heading_text(trimmed) {
318 i += 1;
319 continue;
320 }
321
322 if should_demote_comma_heading(trimmed) {
324 output.push_str(&escape_md_line_start(trimmed));
325 output.push_str("\n\n");
326 i += 1;
327 continue;
328 }
329
330 if should_demote_math_heading(trimmed) {
332 output.push_str(&escape_md_line_start(trimmed));
333 output.push_str("\n\n");
334 i += 1;
335 continue;
336 }
337
338 if should_demote_percentage_heading(trimmed) {
340 output.push_str(&escape_md_line_start(trimmed));
341 output.push_str("\n\n");
342 i += 1;
343 continue;
344 }
345
346 if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
347 if should_demote_heading_to_paragraph(trimmed, &next_text) {
348 let mut merged = trimmed.to_string();
349 merge_paragraph_text(&mut merged, &next_text);
350 output.push_str(&escape_md_line_start(merged.trim()));
351 output.push_str("\n\n");
352 i += 2;
353 continue;
354 }
355 }
356
357 let cleaned = strip_trailing_page_number(trimmed);
358
359 if let Some(split_pos) = find_merged_subsection_split(cleaned) {
361 let first = cleaned[..split_pos].trim();
362 let second = cleaned[split_pos..].trim();
363 output.push_str(&format!("# {}\n\n", first));
364 output.push_str(&format!("# {}\n\n", second));
365 } else {
366 output.push_str(&format!("# {}\n\n", cleaned));
367 }
368 }
369 ContentElement::Paragraph(_)
370 | ContentElement::TextBlock(_)
371 | ContentElement::TextLine(_) => {
372 let element = &doc.kids[i];
373 let text = match &doc.kids[i] {
374 ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
375 ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
376 ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
377 _ => unreachable!(),
378 };
379 let trimmed = text.trim();
380 if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
381 i += 1;
382 continue;
383 }
384 if should_skip_leading_figure_carryover(doc, i, trimmed) {
385 i += 1;
386 continue;
387 }
388
389 if should_render_paragraph_as_heading(doc, i, trimmed, doc.kids.get(i + 1)) {
390 let cleaned = strip_trailing_page_number(trimmed);
391 if let Some(split_pos) = find_merged_subsection_split(cleaned) {
393 let first = cleaned[..split_pos].trim();
394 let second = cleaned[split_pos..].trim();
395 output.push_str(&format!("# {}\n\n", first));
396 output.push_str(&format!("# {}\n\n", second));
397 } else {
398 output.push_str(&format!("# {}\n\n", cleaned));
399 }
400 i += 1;
401 continue;
402 }
403
404 if matches!(element, ContentElement::Paragraph(p) if p.base.semantic_type == SemanticType::TableOfContent)
405 {
406 output.push_str(&escape_md_line_start(trimmed));
407 output.push('\n');
408 i += 1;
409 continue;
410 }
411
412 if is_short_caption_label(trimmed) {
413 if let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
414 if let Some((caption_tail, body)) =
415 split_following_caption_tail_and_body(&next_text)
416 {
417 let mut caption = trimmed.to_string();
418 caption.push('\n');
419 caption.push_str(caption_tail);
420 output.push_str(&escape_md_line_start(caption.trim()));
421 output.push_str("\n\n");
422 output.push_str(&escape_md_line_start(body));
423 output.push_str("\n\n");
424 i += 2;
425 continue;
426 }
427
428 if looks_like_caption_tail(&next_text) {
429 let mut caption = trimmed.to_string();
430 caption.push('\n');
431 caption.push_str(next_text.trim());
432
433 if let Some(year_text) =
434 next_mergeable_paragraph_text(doc.kids.get(i + 2))
435 {
436 if looks_like_caption_year(&year_text) {
437 caption.push('\n');
438 caption.push_str(year_text.trim());
439 i += 1;
440 }
441 }
442
443 output.push_str(&escape_md_line_start(caption.trim()));
444 output.push_str("\n\n");
445 i += 2;
446 continue;
447 }
448 }
449 }
450
451 if let Some((caption, body)) = split_leading_caption_and_body(trimmed) {
452 output.push_str(&escape_md_line_start(caption));
453 output.push_str("\n\n");
454 output.push_str(&escape_md_line_start(body));
455 output.push_str("\n\n");
456 i += 1;
457 continue;
458 }
459
460 let mut merged = trimmed.to_string();
461 while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
462 let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
463 should_merge_adjacent_semantic_paragraphs(&merged, &next_text)
464 } else {
465 should_merge_paragraph_text(&merged, &next_text)
466 };
467 if !can_merge {
468 break;
469 }
470 merge_paragraph_text(&mut merged, &next_text);
471 i += 1;
472 }
473
474 output.push_str(&escape_md_line_start(merged.trim()));
475 output.push_str("\n\n");
476 }
477 other => render_element(&mut output, other),
478 }
479 i += 1;
480 }
481
482 let output = merge_adjacent_pipe_tables(&output);
486 let output = normalize_chart_like_markdown(&output);
487 drop_isolated_noise_lines(&output)
488}
489
490fn cmp_banded_reading_order(
491 left: &BoundingBox,
492 right: &BoundingBox,
493 band_height: f64,
494) -> std::cmp::Ordering {
495 let safe_band = band_height.max(1.0);
496 let left_band = (left.top_y / safe_band).round() as i64;
497 let right_band = (right.top_y / safe_band).round() as i64;
498 right_band
499 .cmp(&left_band)
500 .then_with(|| {
501 left.left_x
502 .partial_cmp(&right.left_x)
503 .unwrap_or(std::cmp::Ordering::Equal)
504 })
505 .then_with(|| {
506 right
507 .top_y
508 .partial_cmp(&left.top_y)
509 .unwrap_or(std::cmp::Ordering::Equal)
510 })
511 .then_with(|| {
512 right
513 .bottom_y
514 .partial_cmp(&left.bottom_y)
515 .unwrap_or(std::cmp::Ordering::Equal)
516 })
517 .then_with(|| {
518 left.right_x
519 .partial_cmp(&right.right_x)
520 .unwrap_or(std::cmp::Ordering::Equal)
521 })
522}
523
524fn should_skip_document_title(doc: &PdfDocument, title: &str) -> bool {
525 first_heading_like_text(doc)
526 .filter(|first| !equivalent_heading_text(first, title))
527 .is_some()
528}
529
530fn should_render_document_title_as_plaintext(doc: &PdfDocument, title: &str) -> bool {
531 if title.split_whitespace().count() > 6 {
532 return false;
533 }
534
535 let mut early = doc.kids.iter().take(6);
536 let has_explicit_heading = early.clone().any(|element| {
537 matches!(
538 element,
539 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
540 )
541 });
542 let has_tableish_content = early.any(|element| {
543 matches!(
544 element,
545 ContentElement::List(_) | ContentElement::Table(_) | ContentElement::TableBorder(_)
546 )
547 });
548
549 has_tableish_content && !has_explicit_heading
550}
551
552fn render_top_table_plate_document(doc: &PdfDocument) -> Option<String> {
553 if doc.number_of_pages != 1 {
554 return None;
555 }
556
557 let (table_idx, table) =
558 doc.kids.iter().enumerate().find_map(|(idx, element)| {
559 table_border_from_element(element).map(|table| (idx, table))
560 })?;
561 if table.num_columns < 5 || table.rows.len() < 4 {
562 return None;
563 }
564
565 let mut header_probe = collect_table_border_rows(table);
566 if header_probe.len() < 3 || !preserve_grouped_header_rows(&mut header_probe) {
567 return None;
568 }
569
570 let table_top = table.bbox.top_y;
571 let table_bottom = table.bbox.bottom_y;
572 let table_height = table.bbox.height().max(1.0);
573 let page_top = doc
574 .kids
575 .iter()
576 .map(|element| element.bbox().top_y)
577 .fold(f64::NEG_INFINITY, f64::max);
578 if !page_top.is_finite() || page_top - table_top > table_height * 3.0 {
579 return None;
580 }
581
582 let caption_gap_limit = (table_height * 2.2).clamp(48.0, 132.0);
583 let mut caption_indices = Vec::new();
584 for idx in table_idx + 1..doc.kids.len() {
585 let element = &doc.kids[idx];
586 if !is_geometric_text_candidate(element) {
587 if table_bottom - element.bbox().top_y > caption_gap_limit {
588 break;
589 }
590 continue;
591 }
592
593 let text = extract_element_text(element);
594 if text.trim().is_empty() || looks_like_margin_page_number(doc, element, &text) {
595 continue;
596 }
597
598 let gap = table_bottom - element.bbox().top_y;
599 if gap < -6.0 {
600 break;
601 }
602 if gap > caption_gap_limit {
603 break;
604 }
605 caption_indices.push(idx);
606 }
607 if caption_indices.is_empty() {
608 return None;
609 }
610
611 let has_body_below = doc
612 .kids
613 .iter()
614 .enumerate()
615 .skip(caption_indices.last().copied()? + 1)
616 .any(|(_, element)| {
617 is_geometric_text_candidate(element)
618 && !extract_element_text(element).trim().is_empty()
619 && table_bottom - element.bbox().top_y > caption_gap_limit
620 });
621 if !has_body_below {
622 return None;
623 }
624
625 let mut output = String::new();
626 render_table_border(&mut output, table);
627
628 let mut caption = String::new();
629 for idx in &caption_indices {
630 let text = extract_element_text(&doc.kids[*idx]);
631 if text.trim().is_empty() {
632 continue;
633 }
634 merge_paragraph_text(&mut caption, &text);
635 }
636 let trimmed = caption.trim();
637 if trimmed.is_empty() {
638 return None;
639 }
640 output.push_str(&escape_md_line_start(trimmed));
641 output.push_str("\n\n");
642 Some(output)
643}
644
645fn render_single_table_report_document(doc: &PdfDocument) -> Option<String> {
646 if doc.number_of_pages != 1 || !(2..=4).contains(&doc.kids.len()) {
647 return None;
648 }
649
650 let title = &doc.kids[0];
651 if !is_geometric_text_candidate(title) {
652 return None;
653 }
654 let title_text = extract_element_text(title);
655 if title_text.trim().is_empty() || title_text.split_whitespace().count() < 4 {
656 return None;
657 }
658
659 let table = table_border_from_element(&doc.kids[1])?;
660 if table.num_columns < 4 || table.rows.len() < 4 {
661 return None;
662 }
663
664 let page_top = doc
665 .kids
666 .iter()
667 .map(|element| element.bbox().top_y)
668 .fold(f64::NEG_INFINITY, f64::max);
669 if !page_top.is_finite() {
670 return None;
671 }
672
673 let title_bbox = title.bbox();
674 let table_bbox = &table.bbox;
675 if page_top - title_bbox.top_y > 24.0 {
676 return None;
677 }
678
679 let vertical_gap = title_bbox.bottom_y - table_bbox.top_y;
680 if !(8.0..=40.0).contains(&vertical_gap) {
681 return None;
682 }
683
684 if (title_bbox.center_x() - table_bbox.center_x()).abs() > table_bbox.width() * 0.12 {
685 return None;
686 }
687
688 if doc.kids.iter().skip(2).any(|element| {
689 let text = extract_element_text(element);
690 let trimmed = text.trim();
691 !trimmed.is_empty()
692 && !looks_like_footer_banner(trimmed)
693 && !looks_like_margin_page_number(doc, element, trimmed)
694 }) {
695 return None;
696 }
697
698 let mut rows = collect_table_border_rows(table);
699 if rows.is_empty() {
700 return None;
701 }
702 merge_continuation_rows(&mut rows);
703 trim_leading_table_carryover_rows(&mut rows);
704 if rows.len() < 2 {
705 return None;
706 }
707
708 let mut output = String::new();
709 output.push_str("# ");
710 output.push_str(title_text.trim());
711 output.push_str("\n\n");
712 output.push_str(&render_pipe_rows(&rows));
713 Some(output)
714}
715
716fn render_late_section_boundary_document(doc: &PdfDocument) -> Option<String> {
717 if doc.number_of_pages != 1 || doc.kids.len() < 8 {
718 return None;
719 }
720
721 let page_top = doc
722 .kids
723 .iter()
724 .map(|element| element.bbox().top_y)
725 .fold(f64::NEG_INFINITY, f64::max);
726 if !page_top.is_finite() {
727 return None;
728 }
729
730 let heading_idx = doc.kids.iter().position(|element| {
731 matches!(
732 element,
733 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
734 )
735 })?;
736 if heading_idx < 5 {
737 return None;
738 }
739
740 let heading = &doc.kids[heading_idx];
741 let heading_text = extract_element_text(heading);
742 if heading_text.trim().is_empty() {
743 return None;
744 }
745
746 let heading_top = heading.bbox().top_y;
747 if page_top - heading_top < 240.0 {
748 return None;
749 }
750
751 let leading_text_indices = (0..heading_idx)
752 .filter(|idx| is_geometric_text_candidate(&doc.kids[*idx]))
753 .collect::<Vec<_>>();
754 if leading_text_indices.len() < 5 {
755 return None;
756 }
757
758 let colon_ended = leading_text_indices
759 .iter()
760 .filter(|idx| {
761 extract_element_text(&doc.kids[**idx])
762 .trim_end()
763 .ends_with(':')
764 })
765 .count();
766 if colon_ended * 2 < leading_text_indices.len() {
767 return None;
768 }
769
770 let trailing_indices = (heading_idx + 1..doc.kids.len())
771 .filter(|idx| is_geometric_text_candidate(&doc.kids[*idx]))
772 .filter(|idx| {
773 let text = extract_element_text(&doc.kids[*idx]);
774 !text.trim().is_empty() && !looks_like_margin_page_number(doc, &doc.kids[*idx], &text)
775 })
776 .collect::<Vec<_>>();
777 if trailing_indices.is_empty() || trailing_indices.len() > 5 {
778 return None;
779 }
780
781 let mut footer_count = 0usize;
782 let content_indices = trailing_indices
783 .into_iter()
784 .filter(|idx| {
785 let text = extract_element_text(&doc.kids[*idx]);
786 let is_footerish =
787 doc.kids[*idx].bbox().top_y < 96.0 && text.split_whitespace().count() >= 4;
788 footer_count += usize::from(is_footerish);
789 !is_footerish
790 })
791 .collect::<Vec<_>>();
792 if content_indices.is_empty() || footer_count == 0 {
793 return None;
794 }
795
796 let mut fragments = content_indices
797 .iter()
798 .map(|idx| (*idx, &doc.kids[*idx]))
799 .collect::<Vec<_>>();
800 fragments.sort_by(|left, right| cmp_banded_reading_order(left.1.bbox(), right.1.bbox(), 6.0));
801
802 let mut paragraph = String::new();
803 for (_, element) in fragments {
804 let text = extract_element_text(element);
805 if text.trim().is_empty() {
806 continue;
807 }
808 merge_paragraph_text(&mut paragraph, &text);
809 }
810 let trimmed_paragraph = paragraph.trim();
811 if trimmed_paragraph.is_empty() {
812 return None;
813 }
814
815 let mut output = String::new();
816 output.push_str("# ");
817 output.push_str(heading_text.trim());
818 output.push_str("\n\n");
819 output.push_str(&escape_md_line_start(trimmed_paragraph));
820 output.push_str("\n\n");
821 Some(output)
822}
823
/// A layout line considered as a table header row.
///
/// NOTE(review): construction and use are outside this excerpt; the field
/// notes are inferred from names and types and should be confirmed.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutHeaderCandidate {
    /// Presumably the index of the line within the layout lines.
    line_idx: usize,
    /// Header cell texts.
    headers: Vec<String>,
    /// Presumably the starting offset of each header within the line — TODO confirm unit.
    starts: Vec<usize>,
}
831
/// Cell texts associated with one layout line.
///
/// NOTE(review): usage is outside this excerpt; confirm field meanings there.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutEntry {
    /// Presumably the index of the source line within the layout lines.
    line_idx: usize,
    /// Cell texts for that line.
    cells: Vec<String>,
}
838
/// A table row tracked by anchor indices.
///
/// NOTE(review): usage is outside this excerpt; what the indices refer to
/// (lines or entries) is not visible here and should be confirmed.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutAnchorRow {
    /// Index of the row's anchor.
    anchor_idx: usize,
    /// Index of the last anchor attributed to this row.
    last_anchor_idx: usize,
    /// Cell texts of the row.
    cells: Vec<String>,
}
846
/// A layout line considered as a panel header row; same shape as
/// `LayoutHeaderCandidate`.
///
/// NOTE(review): usage is outside this excerpt; confirm field meanings there.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutPanelHeaderCandidate {
    /// Presumably the index of the line within the layout lines.
    line_idx: usize,
    /// Header cell texts.
    headers: Vec<String>,
    /// Presumably the starting offset of each header within the line — TODO confirm unit.
    starts: Vec<usize>,
}
854
/// One table-of-contents entry taken from the layout text.
///
/// NOTE(review): usage is outside this excerpt; confirm field meanings there.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutTocEntry {
    /// Entry title text.
    title: String,
    /// Page reference, kept as text.
    page: String,
    /// Presumably the offset at which the title starts within its line — TODO confirm.
    title_start: usize,
}
862
/// A single word of the bbox layout, as stored in `BBoxLayoutLine::words`.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct BBoxLayoutWord {
    /// Bounding box of the word.
    bbox: BoundingBox,
    /// Text of the word.
    text: String,
}
869
/// A line of the bbox layout, as returned by `read_pdftotext_bbox_layout_lines`.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct BBoxLayoutLine {
    /// Presumably the id of the block this line belongs to — confirm against the reader.
    block_id: usize,
    /// Bounding box of the line.
    bbox: BoundingBox,
    /// Words that make up the line.
    words: Vec<BBoxLayoutWord>,
}
877
/// A piece of text together with its bounding box.
///
/// NOTE(review): usage is outside this excerpt.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutTextFragment {
    /// Bounding box of the fragment.
    bbox: BoundingBox,
    /// Text of the fragment.
    text: String,
}
884
/// Parts of a candidate "open plate": a heading, a table, and a caption.
///
/// NOTE(review): construction and use are outside this excerpt; confirm field
/// meanings against the open-plate renderer.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct OpenPlateCandidate {
    /// Heading text.
    heading: String,
    /// Header cells of the table.
    header_row: Vec<String>,
    /// Body rows of the table, one `Vec` of cells per row.
    rows: Vec<Vec<String>>,
    /// Caption text.
    caption: String,
    /// A `top_y` threshold; presumably separates the plate from following content — TODO confirm.
    cutoff_top_y: f64,
}
894
/// Text found between a layout-rendered region and the narrative body.
///
/// NOTE(review): usage is outside this excerpt; confirm field meanings there.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutNarrativeBridge {
    /// Optional bridging paragraph.
    bridge_paragraph: Option<String>,
    /// Captions held back for later output.
    deferred_captions: Vec<String>,
    /// `top_y` at which the body starts, when known.
    body_start_top_y: Option<f64>,
}
901
/// A block of bbox-layout lines, produced by `collect_bbox_layout_blocks`.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct BBoxLayoutBlock {
    /// Identifier used to tell blocks apart when pairing caption labels with titles.
    block_id: usize,
    /// Bounding box of the block.
    bbox: BoundingBox,
    /// Lines contained in the block.
    lines: Vec<BBoxLayoutLine>,
}
909
/// Extracted parts of an OCR benchmark dashboard page: a title, a left and a
/// right table, and notes.
///
/// NOTE(review): construction and rendering are outside this excerpt; confirm
/// field meanings against the dashboard renderer.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutOcrDashboard {
    /// Optional small line above the title.
    eyebrow: Option<String>,
    /// Dashboard title.
    title: String,
    /// Heading of the left table.
    left_heading: String,
    /// Column labels of the left table.
    left_columns: Vec<String>,
    /// Rows of the left table.
    left_rows: Vec<Vec<String>>,
    /// Heading of the right table.
    right_heading: String,
    /// Rows of the right table.
    right_rows: Vec<Vec<String>>,
    /// Definition notes.
    definition_notes: Vec<String>,
    /// Source notes.
    source_notes: Vec<String>,
}
922
/// One panel of a `LayoutRecommendationInfographic`.
///
/// NOTE(review): construction and rendering are outside this excerpt.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutRecommendationPanel {
    /// Panel heading.
    heading: String,
    /// Panel subtitle.
    subtitle: String,
    /// Header cells of the panel table.
    header: Vec<String>,
    /// Rows of the panel table.
    rows: Vec<Vec<String>>,
    /// Notes attached to the panel.
    notes: Vec<String>,
}
931
/// Extracted parts of a recommendation infographic page.
///
/// NOTE(review): construction and rendering are outside this excerpt.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutRecommendationInfographic {
    /// Optional small line above the title.
    eyebrow: Option<String>,
    /// Infographic title.
    title: String,
    /// Panels of the infographic.
    panels: Vec<LayoutRecommendationPanel>,
}
938
/// A numeric token with its position, used for bar-chart extraction.
///
/// NOTE(review): usage is outside this excerpt; the relation between `value`
/// and `text` is presumed to be parsed number vs. original text — confirm.
#[cfg(not(target_arch = "wasm32"))]
#[derive(Clone)]
struct LayoutBarToken {
    /// Bounding box of the token.
    bbox: BoundingBox,
    /// Integer value of the token.
    value: i64,
    /// Token text.
    text: String,
}
946
/// Data extracted for one stacked-bar figure.
///
/// NOTE(review): usage is outside this excerpt; `#[allow(dead_code)]` suggests
/// not every field is read — confirm before relying on any of them.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
struct LayoutStackedBarFigure {
    /// Figure caption.
    caption: String,
    /// Month labels.
    months: Vec<String>,
    /// Row labels.
    row_labels: Vec<String>,
    /// Cell values, one `Vec` per row.
    rows: Vec<Vec<String>>,
}
955
/// Data extracted for one stacked-bar figure broken down by sector.
///
/// NOTE(review): usage is outside this excerpt; `#[allow(dead_code)]` suggests
/// not every field is read — confirm before relying on any of them.
#[cfg(not(target_arch = "wasm32"))]
#[allow(dead_code)]
struct LayoutStackedBarSectorFigure {
    /// Figure caption.
    caption: String,
    /// Month labels.
    months: Vec<String>,
    /// Sector labels.
    sectors: Vec<String>,
    /// Cell values, one `Vec` per row.
    rows: Vec<Vec<String>>,
}
964
/// Narrative text that accompanies a stacked-bar report.
///
/// NOTE(review): construction and rendering are outside this excerpt.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutStackedBarNarrative {
    /// Section heading.
    heading: String,
    /// Body paragraphs.
    paragraphs: Vec<String>,
    /// Optional footnote text.
    footnote: Option<String>,
    /// A `top_y` position; presumably where the narrative starts — TODO confirm.
    top_y: f64,
}
972
/// Data extracted for a figure that shows one series of labelled values.
///
/// NOTE(review): usage is outside this excerpt; `labels` and `values` are
/// presumably parallel — confirm.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutSeriesFigure {
    /// Figure caption.
    caption: String,
    /// Labels of the series entries.
    labels: Vec<String>,
    /// Values of the series entries, kept as text.
    values: Vec<String>,
    /// Optional source line.
    source: Option<String>,
}
980
/// A caption found in the bbox layout: a short label paired with a title.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutCaptionSection {
    /// Caption label; callers test it with `starts_with("Figure ")`.
    label: String,
    /// Caption title; compared against paragraph text to drop prose that repeats it.
    title: String,
    /// Footnote number attached to the caption, if any; its presence counts as an
    /// anchored footnote.
    footnote_number: Option<String>,
    /// Vertical position used to order the caption among prose (larger values are
    /// emitted first).
    top_y: f64,
}
988
/// One item in the position-ordered output stream of the captioned-media
/// renderer.
#[cfg(not(target_arch = "wasm32"))]
enum LayoutCaptionedMediaEvent {
    /// A caption section, rendered with `render_layout_caption_section`.
    Caption(LayoutCaptionSection),
    /// A prose paragraph, rendered as escaped plain text.
    Paragraph(String),
}
994
/// Everything the captioned-media renderer needs, assembled by
/// `build_layout_captioned_media_profile`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutCaptionedMediaProfile {
    /// Caption sections detected in the bbox-layout blocks.
    sections: Vec<LayoutCaptionSection>,
    /// Prose paragraphs as `(top_y, text)` pairs, sorted by descending `top_y`;
    /// the builder never returns more than two.
    prose: Vec<(f64, String)>,
    /// Footnote detected from the bbox-layout lines, if any.
    footnote: Option<String>,
    /// Number of image, figure, and picture elements in the document.
    image_count: usize,
}
1002
1003#[cfg(not(target_arch = "wasm32"))]
1004#[allow(dead_code)]
1005fn render_layout_captioned_media_document(doc: &PdfDocument) -> Option<String> {
1006 let mut layout_cache = LayoutSourceCache::default();
1007 render_layout_captioned_media_document_cached(doc, &mut layout_cache)
1008}
1009
1010#[cfg(not(target_arch = "wasm32"))]
1011fn render_layout_captioned_media_document_cached(
1012 doc: &PdfDocument,
1013 layout_cache: &mut LayoutSourceCache,
1014) -> Option<String> {
1015 if doc.number_of_pages != 1 {
1016 return None;
1017 }
1018 let paragraph_count = doc
1019 .kids
1020 .iter()
1021 .filter(|element| matches!(element, ContentElement::Paragraph(_)))
1022 .count();
1023 let image_count = doc
1024 .kids
1025 .iter()
1026 .filter(|element| {
1027 matches!(
1028 element,
1029 ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
1030 )
1031 })
1032 .count();
1033 if paragraph_count == 0 || image_count == 0 {
1034 return None;
1035 }
1036 let has_explicit_structure = doc.kids.iter().any(|element| {
1037 matches!(
1038 element,
1039 ContentElement::Caption(_)
1040 | ContentElement::Heading(_)
1041 | ContentElement::NumberHeading(_)
1042 | ContentElement::Table(_)
1043 | ContentElement::List(_)
1044 )
1045 });
1046 if has_explicit_structure {
1047 return None;
1048 }
1049
1050 let profile = build_layout_captioned_media_profile(doc, layout_cache)?;
1051 if profile.sections.is_empty() || (profile.sections.len() == 1 && profile.footnote.is_none()) {
1052 return None;
1053 }
1054 let has_non_figure_label = profile
1055 .sections
1056 .iter()
1057 .any(|section| !section.label.starts_with("Figure "));
1058 let has_anchored_footnote = profile.footnote.is_some()
1059 || profile
1060 .sections
1061 .iter()
1062 .any(|section| section.footnote_number.is_some());
1063 if !has_non_figure_label && !has_anchored_footnote {
1064 return None;
1065 }
1066
1067 if let Some(rendered) = render_layout_captioned_media_explainer(&profile) {
1068 return Some(rendered);
1069 }
1070
1071 let mut events = profile
1072 .sections
1073 .into_iter()
1074 .map(|section| (section.top_y, LayoutCaptionedMediaEvent::Caption(section)))
1075 .collect::<Vec<_>>();
1076 for (top_y, paragraph) in profile.prose {
1077 events.push((top_y, LayoutCaptionedMediaEvent::Paragraph(paragraph)));
1078 }
1079 events.sort_by(|left, right| {
1080 right
1081 .0
1082 .partial_cmp(&left.0)
1083 .unwrap_or(std::cmp::Ordering::Equal)
1084 });
1085
1086 let mut output = String::new();
1087 for (_, event) in events {
1088 match event {
1089 LayoutCaptionedMediaEvent::Caption(section) => {
1090 output.push_str(&render_layout_caption_section(§ion));
1091 }
1092 LayoutCaptionedMediaEvent::Paragraph(paragraph) => {
1093 output.push_str(&escape_md_line_start(paragraph.trim()));
1094 output.push_str("\n\n");
1095 }
1096 }
1097 }
1098
1099 if let Some(footnote_text) = profile.footnote {
1100 output.push_str("---\n\n");
1101 output.push_str("**Footnote:**\n");
1102 output.push_str(&escape_md_line_start(footnote_text.trim()));
1103 output.push('\n');
1104 }
1105
1106 Some(output.trim_end().to_string() + "\n")
1107}
1108
1109#[cfg(not(target_arch = "wasm32"))]
1110fn build_layout_captioned_media_profile(
1111 doc: &PdfDocument,
1112 layout_cache: &mut LayoutSourceCache,
1113) -> Option<LayoutCaptionedMediaProfile> {
1114 let layout = layout_cache.bbox_layout(doc)?;
1115 let sections = detect_layout_caption_sections(&layout.blocks);
1116 let footnote = detect_layout_bottom_footnote(&layout.lines);
1117
1118 let mut prose = doc
1119 .kids
1120 .iter()
1121 .filter_map(|element| match element {
1122 ContentElement::Paragraph(_)
1123 | ContentElement::TextBlock(_)
1124 | ContentElement::TextLine(_) => {
1125 let text = clean_paragraph_text(&extract_element_text(element));
1126 let trimmed = text.trim();
1127 (!trimmed.is_empty()
1128 && trimmed.split_whitespace().count() >= 8
1129 && !starts_with_caption_prefix(trimmed)
1130 && !trimmed
1131 .chars()
1132 .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
1133 && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
1134 && !looks_like_footer_banner(trimmed))
1135 .then_some((element.bbox().top_y, trimmed.to_string()))
1136 }
1137 _ => None,
1138 })
1139 .filter(|(top_y, paragraph)| {
1140 !sections.iter().any(|section| {
1141 (*top_y - section.top_y).abs() <= 36.0
1142 || section.title.contains(paragraph)
1143 || paragraph.contains(§ion.title)
1144 })
1145 })
1146 .collect::<Vec<_>>();
1147 prose.sort_by(|left, right| {
1148 right
1149 .0
1150 .partial_cmp(&left.0)
1151 .unwrap_or(std::cmp::Ordering::Equal)
1152 });
1153 if prose.len() > 2 {
1154 return None;
1155 }
1156
1157 let image_count = doc
1158 .kids
1159 .iter()
1160 .filter(|element| {
1161 matches!(
1162 element,
1163 ContentElement::Image(_) | ContentElement::Figure(_) | ContentElement::Picture(_)
1164 )
1165 })
1166 .count();
1167
1168 Some(LayoutCaptionedMediaProfile {
1169 sections,
1170 prose,
1171 footnote,
1172 image_count,
1173 })
1174}
1175
1176#[cfg(not(target_arch = "wasm32"))]
1177fn render_layout_captioned_media_explainer(
1178 profile: &LayoutCaptionedMediaProfile,
1179) -> Option<String> {
1180 if profile.sections.len() != 1
1181 || profile.prose.len() != 2
1182 || profile.image_count != 1
1183 || profile.footnote.is_none()
1184 || !profile
1185 .sections
1186 .iter()
1187 .all(|section| section.label.starts_with("Figure "))
1188 {
1189 return None;
1190 }
1191
1192 let mut output = String::new();
1193 output.push_str("# ");
1194 output.push_str(profile.prose[0].1.trim());
1195 output.push('\n');
1196 output.push_str(&escape_md_line_start(profile.prose[1].1.trim()));
1197 output.push_str("\n\n");
1198 output.push_str("*Image*\n\n");
1199 output.push_str(&render_layout_caption_section(&profile.sections[0]));
1200 output.push_str("---\n\n");
1201 output.push_str("**Footnote:**\n");
1202 output.push_str(&escape_md_line_start(
1203 profile.footnote.as_deref().unwrap_or_default().trim(),
1204 ));
1205 output.push('\n');
1206 Some(output)
1207}
1208
1209#[cfg(not(target_arch = "wasm32"))]
1210fn detect_layout_caption_sections(blocks: &[BBoxLayoutBlock]) -> Vec<LayoutCaptionSection> {
1211 let normalized_blocks = blocks
1212 .iter()
1213 .map(|block| {
1214 (
1215 block,
1216 normalize_common_ocr_text(&bbox_layout_block_text(block)),
1217 )
1218 })
1219 .collect::<Vec<_>>();
1220
1221 let mut used_titles = HashSet::new();
1222 let mut sections = Vec::new();
1223 for (block, label_text) in &normalized_blocks {
1224 if !is_short_caption_label(label_text) {
1225 continue;
1226 }
1227
1228 let label_bbox = &block.bbox;
1229 let title_candidate = normalized_blocks
1230 .iter()
1231 .filter(|(candidate, text)| {
1232 candidate.block_id != block.block_id
1233 && !used_titles.contains(&candidate.block_id)
1234 && !text.is_empty()
1235 && !is_short_caption_label(text)
1236 && !starts_with_caption_prefix(text)
1237 && !looks_like_footer_banner(text)
1238 && !is_page_number_like(text)
1239 && text.split_whitespace().count() >= 2
1240 && candidate.bbox.width() >= 60.0
1241 })
1242 .filter_map(|(candidate, text)| {
1243 let vertical_gap = (candidate.bbox.center_y() - label_bbox.center_y()).abs();
1244 let horizontal_gap = if candidate.bbox.left_x > label_bbox.right_x {
1245 candidate.bbox.left_x - label_bbox.right_x
1246 } else if label_bbox.left_x > candidate.bbox.right_x {
1247 label_bbox.left_x - candidate.bbox.right_x
1248 } else {
1249 0.0
1250 };
1251 (vertical_gap <= 28.0 && horizontal_gap <= 180.0).then_some((
1252 vertical_gap + horizontal_gap * 0.15,
1253 *candidate,
1254 text.clone(),
1255 ))
1256 })
1257 .min_by(|left, right| {
1258 left.0
1259 .partial_cmp(&right.0)
1260 .unwrap_or(std::cmp::Ordering::Equal)
1261 });
1262
1263 let Some((_, title_block, title_text)) = title_candidate else {
1264 continue;
1265 };
1266 used_titles.insert(title_block.block_id);
1267 let (title, footnote_number) = split_trailing_caption_footnote_marker(&title_text);
1268 sections.push(LayoutCaptionSection {
1269 label: label_text.to_string(),
1270 title,
1271 footnote_number,
1272 top_y: label_bbox.top_y.max(title_block.bbox.top_y),
1273 });
1274 }
1275
1276 sections.sort_by(|left, right| {
1277 right
1278 .top_y
1279 .partial_cmp(&left.top_y)
1280 .unwrap_or(std::cmp::Ordering::Equal)
1281 });
1282 sections
1283}
1284
1285#[cfg(not(target_arch = "wasm32"))]
1286fn split_trailing_caption_footnote_marker(text: &str) -> (String, Option<String>) {
1287 let trimmed = text.trim();
1288 let re = Regex::new(r"^(?P<title>.*?[.!?])\s*(?P<num>\d{1,2})\s*[A-Za-z]{0,12}$").ok();
1289 if let Some(captures) = re.as_ref().and_then(|re| re.captures(trimmed)) {
1290 return (
1291 captures["title"].trim().to_string(),
1292 Some(captures["num"].to_string()),
1293 );
1294 }
1295
1296 (trimmed.to_string(), None)
1297}
1298
1299#[cfg(not(target_arch = "wasm32"))]
1300fn detect_layout_bottom_footnote(lines: &[BBoxLayoutLine]) -> Option<String> {
1301 let normalized_lines = lines
1302 .iter()
1303 .map(|line| {
1304 (
1305 line.bbox.top_y,
1306 normalize_common_ocr_text(&bbox_layout_line_text(line)),
1307 )
1308 })
1309 .filter(|(_, text)| !text.is_empty() && !is_page_number_like(text))
1310 .collect::<Vec<_>>();
1311 let start_idx = normalized_lines.iter().rposition(|(_, text)| {
1312 text.chars().next().is_some_and(|ch| ch.is_ascii_digit())
1313 && text.split_whitespace().count() >= 6
1314 })?;
1315
1316 let mut collected = vec![normalized_lines[start_idx].1.clone()];
1317 let mut last_top_y = normalized_lines[start_idx].0;
1318 for (top_y, text) in normalized_lines.iter().skip(start_idx + 1) {
1319 if is_page_number_like(text) {
1320 break;
1321 }
1322 if (last_top_y - *top_y).abs() > 28.0 {
1323 break;
1324 }
1325 collected.push(text.clone());
1326 last_top_y = *top_y;
1327 }
1328
1329 if collected.is_empty() {
1330 return None;
1331 }
1332 let merged = collected.join(" ");
1333 Some(normalize_layout_footnote_text(&merged))
1334}
1335
1336#[cfg(not(target_arch = "wasm32"))]
1337fn normalize_layout_footnote_text(text: &str) -> String {
1338 let mut normalized = text.replace(",https://", ", https://");
1339 let url_gap_re = Regex::new(r"(https?://\S+)\s+(\S+)").ok();
1340 while let Some(re) = &url_gap_re {
1341 let next = re.replace(&normalized, "$1$2").to_string();
1342 if next == normalized {
1343 break;
1344 }
1345 normalized = next;
1346 }
1347 normalized
1348}
1349
1350#[cfg(not(target_arch = "wasm32"))]
1351fn render_layout_caption_section(section: &LayoutCaptionSection) -> String {
1352 let mut output = String::new();
1353 if section.label.starts_with("Diagram ") {
1354 output.push_str("## ");
1355 output.push_str(section.label.trim());
1356 output.push('\n');
1357 if !section.title.trim().is_empty() {
1358 let title = normalize_layout_caption_title_text(section.title.trim());
1359 output.push_str("**");
1360 output.push_str(&title);
1361 output.push_str("**\n\n");
1362 } else {
1363 output.push('\n');
1364 }
1365 return output;
1366 }
1367
1368 if section.label.starts_with("Figure ") && section.footnote_number.is_none() {
1369 output.push('*');
1370 output.push_str(section.label.trim());
1371 output.push_str("*\n\n");
1372 }
1373
1374 output.push_str("**");
1375 output.push_str(section.label.trim());
1376 output.push_str("**\n");
1377
1378 if !section.title.trim().is_empty() {
1379 let title_lines = split_layout_caption_title_lines(section.title.trim());
1380 let last_idx = title_lines.len().saturating_sub(1);
1381 for (idx, line) in title_lines.iter().enumerate() {
1382 if section.footnote_number.is_some() {
1383 output.push_str("**");
1384 output.push_str(line.trim());
1385 if idx == last_idx {
1386 output.push_str("**^");
1387 output.push_str(section.footnote_number.as_deref().unwrap_or_default());
1388 } else {
1389 output.push_str("**");
1390 }
1391 } else {
1392 output.push('*');
1393 output.push_str(line.trim());
1394 output.push('*');
1395 }
1396 output.push('\n');
1397 }
1398 }
1399 output.push('\n');
1400 output
1401}
1402
1403#[cfg(not(target_arch = "wasm32"))]
1404fn split_layout_caption_title_lines(title: &str) -> Vec<String> {
1405 let title = normalize_layout_caption_title_text(title);
1406 if let Some(idx) = title.find(" Content:") {
1407 let head = title[..idx].trim();
1408 let tail = title[idx + 1..].trim();
1409 if !head.is_empty() && head.split_whitespace().count() <= 3 && !tail.is_empty() {
1410 return vec![head.to_string(), tail.to_string()];
1411 }
1412 }
1413 vec![title.to_string()]
1414}
1415
1416#[cfg(not(target_arch = "wasm32"))]
1417fn normalize_layout_caption_title_text(title: &str) -> String {
1418 Regex::new(r"(\d{4})-\s+(\d{4})")
1419 .ok()
1420 .map(|re| re.replace_all(title, "$1-$2").to_string())
1421 .unwrap_or_else(|| title.to_string())
1422}
1423
1424#[cfg(not(target_arch = "wasm32"))]
1425#[allow(dead_code)]
1426fn render_layout_single_caption_chart_document(doc: &PdfDocument) -> Option<String> {
1427 let mut layout_cache = LayoutSourceCache::default();
1428 render_layout_single_caption_chart_document_cached(doc, &mut layout_cache)
1429}
1430
1431#[cfg(not(target_arch = "wasm32"))]
1432fn render_layout_single_caption_chart_document_cached(
1433 doc: &PdfDocument,
1434 _layout_cache: &mut LayoutSourceCache,
1435) -> Option<String> {
1436 if doc.number_of_pages != 1 {
1437 return None;
1438 }
1439
1440 let caption_indices = doc
1441 .kids
1442 .iter()
1443 .enumerate()
1444 .filter_map(|(idx, element)| {
1445 let text = extract_element_text(element);
1446 let trimmed = text.trim();
1447 (trimmed.starts_with("Figure ")
1448 && trimmed.contains(':')
1449 && trimmed.split_whitespace().count() >= 6)
1450 .then_some(idx)
1451 })
1452 .collect::<Vec<_>>();
1453 if caption_indices.len() != 1 {
1454 return None;
1455 }
1456 if doc.kids.len() < 12 {
1457 return None;
1458 }
1459
1460 let caption_idx = caption_indices[0];
1461 let mut output = String::new();
1462 let mut i = 0usize;
1463 let mut chart_mode = false;
1464 while i < doc.kids.len() {
1465 let element = &doc.kids[i];
1466 let text = extract_element_text(element);
1467 let trimmed = text.trim();
1468 if trimmed.is_empty() || looks_like_margin_page_number(doc, element, trimmed) {
1469 i += 1;
1470 continue;
1471 }
1472
1473 if i == caption_idx {
1474 output.push_str(&escape_md_line_start(trimmed));
1475 output.push_str("\n\n");
1476 chart_mode = true;
1477 i += 1;
1478 continue;
1479 }
1480
1481 if chart_mode {
1482 if !looks_like_chart_followup_paragraph(element, trimmed)
1483 && !matches!(
1484 element,
1485 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
1486 )
1487 {
1488 i += 1;
1489 continue;
1490 }
1491 chart_mode = false;
1492 }
1493
1494 match element {
1495 ContentElement::Heading(h) => {
1496 let level = h.heading_level.unwrap_or(1).clamp(1, 6) as usize;
1497 output.push_str(&"#".repeat(level));
1498 output.push(' ');
1499 output.push_str(trimmed);
1500 output.push_str("\n\n");
1501 }
1502 ContentElement::NumberHeading(nh) => {
1503 let level = nh.base.heading_level.unwrap_or(1).clamp(1, 6) as usize;
1504 output.push_str(&"#".repeat(level));
1505 output.push(' ');
1506 output.push_str(trimmed);
1507 output.push_str("\n\n");
1508 }
1509 ContentElement::Paragraph(_) | ContentElement::TextBlock(_) => {
1510 let mut merged = trimmed.to_string();
1511 while let Some(next_element) = doc.kids.get(i + 1) {
1512 let next_text = extract_element_text(next_element);
1513 let next_trimmed = next_text.trim();
1514 if next_trimmed.is_empty()
1515 || looks_like_margin_page_number(doc, next_element, next_trimmed)
1516 {
1517 i += 1;
1518 continue;
1519 }
1520 if i + 1 == caption_idx
1521 || looks_like_chart_noise_element(next_element, next_trimmed)
1522 {
1523 break;
1524 }
1525 let can_merge = if matches!(element, ContentElement::Paragraph(_)) {
1526 should_merge_adjacent_semantic_paragraphs(&merged, next_trimmed)
1527 } else {
1528 should_merge_paragraph_text(&merged, next_trimmed)
1529 };
1530 if !can_merge {
1531 break;
1532 }
1533 merge_paragraph_text(&mut merged, next_trimmed);
1534 i += 1;
1535 }
1536
1537 output.push_str(&escape_md_line_start(merged.trim()));
1538 output.push_str("\n\n");
1539 }
1540 _ => {}
1541 }
1542
1543 i += 1;
1544 }
1545
1546 Some(output.trim_end().to_string() + "\n")
1547}
1548
1549fn looks_like_chart_noise_element(_element: &ContentElement, text: &str) -> bool {
1550 if text.is_empty() {
1551 return false;
1552 }
1553
1554 if is_standalone_page_number(text) || looks_like_numeric_axis_blob(text) {
1555 return true;
1556 }
1557
1558 let word_count = text.split_whitespace().count();
1559 let lower = text.to_ascii_lowercase();
1560
1561 if lower.starts_with("figure ") && text.contains(':') {
1562 return false;
1563 }
1564
1565 if lower.starts_with("source:") {
1566 return false;
1567 }
1568
1569 if word_count <= 3
1570 && (looks_like_yearish_label(text)
1571 || looks_like_layout_month_label(text)
1572 || text == "Lockdown Period")
1573 {
1574 return true;
1575 }
1576
1577 if text
1578 .chars()
1579 .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
1580 {
1581 return true;
1582 }
1583
1584 let short_non_sentence = !text.contains('.') && !text.contains(':') && !text.contains(';');
1585 let has_chart_keyword = lower.contains("working as usual")
1586 || lower.contains("temporarily closed")
1587 || lower.contains("business premises")
1588 || lower.contains("operations continue");
1589
1590 word_count <= 10 || (short_non_sentence && word_count <= 14) || has_chart_keyword
1591}
1592
1593fn looks_like_chart_followup_paragraph(_element: &ContentElement, text: &str) -> bool {
1594 let word_count = text.split_whitespace().count();
1595 word_count >= 18
1596 && !text.trim_start().starts_with("Figure ")
1597 && !text.trim_start().starts_with("Table ")
1598}
1599
1600#[cfg(not(target_arch = "wasm32"))]
1601#[allow(dead_code)]
1602fn render_layout_recommendation_infographic_document(doc: &PdfDocument) -> Option<String> {
1603 let mut layout_cache = LayoutSourceCache::default();
1604 render_layout_recommendation_infographic_document_cached(doc, &mut layout_cache)
1605}
1606
1607#[cfg(not(target_arch = "wasm32"))]
1608fn render_layout_recommendation_infographic_document_cached(
1609 doc: &PdfDocument,
1610 layout_cache: &mut LayoutSourceCache,
1611) -> Option<String> {
1612 if doc.number_of_pages != 1 {
1613 return None;
1614 }
1615
1616 let layout = layout_cache.bbox_layout(doc)?;
1617 let infographic = detect_layout_recommendation_infographic(layout.page_width, &layout.lines)?;
1618
1619 let mut output = String::new();
1620 if let Some(eyebrow) = infographic.eyebrow.as_deref() {
1621 output.push_str("# ");
1622 output.push_str(eyebrow.trim());
1623 output.push_str("\n\n");
1624 }
1625 output.push_str(&escape_md_line_start(infographic.title.trim()));
1626 output.push_str("\n\n");
1627
1628 for panel in &infographic.panels {
1629 output.push_str("## ");
1630 output.push_str(panel.heading.trim());
1631 output.push_str("\n\n");
1632 output.push_str(&escape_md_line_start(panel.subtitle.trim()));
1633 output.push_str("\n\n");
1634
1635 let mut rows = Vec::with_capacity(panel.rows.len() + 1);
1636 rows.push(panel.header.clone());
1637 rows.extend(panel.rows.clone());
1638 output.push_str(&render_pipe_rows(&rows));
1639
1640 if !panel.notes.is_empty() {
1641 output.push_str("*Note:*\n");
1642 for note in &panel.notes {
1643 output.push_str("- ");
1644 output.push_str(note.trim());
1645 output.push('\n');
1646 }
1647 output.push('\n');
1648 }
1649 }
1650
1651 Some(output.trim_end().to_string() + "\n")
1652}
1653
1654#[cfg(not(target_arch = "wasm32"))]
1655#[allow(dead_code)]
1656fn render_layout_stacked_bar_report_document(doc: &PdfDocument) -> Option<String> {
1657 let mut layout_cache = LayoutSourceCache::default();
1658 render_layout_stacked_bar_report_document_cached(doc, &mut layout_cache)
1659}
1660
1661#[cfg(not(target_arch = "wasm32"))]
1662fn render_layout_stacked_bar_report_document_cached(
1663 doc: &PdfDocument,
1664 layout_cache: &mut LayoutSourceCache,
1665) -> Option<String> {
1666 if doc.number_of_pages != 1 {
1667 return None;
1668 }
1669
1670 let layout = layout_cache.bbox_layout(doc)?;
1671 let figure_captions = collect_layout_figure_captions(&layout.blocks);
1672 if figure_captions.len() != 2 {
1673 return None;
1674 }
1675 let narrative = detect_layout_stacked_bar_narrative(&layout.blocks)?;
1676 let figure_one = detect_layout_three_month_stacked_figure(
1677 &layout.blocks,
1678 &layout.lines,
1679 layout.page_width,
1680 figure_captions[0].clone(),
1681 figure_captions[1].bbox.top_y,
1682 )?;
1683 let figure_two = detect_layout_sector_bar_figure(
1684 &layout.blocks,
1685 &layout.lines,
1686 layout.page_width,
1687 figure_captions[1].clone(),
1688 narrative.top_y,
1689 )?;
1690
1691 let mut output = String::new();
1692 output.push_str("# ");
1693 output.push_str(figure_one.caption.trim());
1694 output.push_str("\n\n");
1695 let mut first_table = vec![{
1696 let mut row = vec![String::new()];
1697 row.extend(figure_one.months.clone());
1698 row
1699 }];
1700 first_table.extend(figure_one.rows.clone());
1701 output.push_str(&render_pipe_rows(&first_table));
1702
1703 output.push_str("# ");
1704 output.push_str(figure_two.caption.trim());
1705 output.push_str("\n\n");
1706 let mut second_table = vec![{
1707 let mut row = vec!["Sector".to_string()];
1708 row.extend(figure_two.months.clone());
1709 row
1710 }];
1711 second_table.extend(figure_two.rows.clone());
1712 output.push_str(&render_pipe_rows(&second_table));
1713
1714 output.push_str("# ");
1715 output.push_str(narrative.heading.trim());
1716 output.push_str("\n\n");
1717 for paragraph in &narrative.paragraphs {
1718 output.push_str(&escape_md_line_start(paragraph.trim()));
1719 output.push_str("\n\n");
1720 }
1721 if let Some(footnote) = narrative.footnote.as_deref() {
1722 output.push('*');
1723 output.push_str(footnote.trim());
1724 output.push_str("*\n");
1725 }
1726
1727 Some(output)
1728}
1729
1730#[cfg(not(target_arch = "wasm32"))]
1731#[allow(dead_code)]
1732fn render_layout_multi_figure_chart_document(doc: &PdfDocument) -> Option<String> {
1733 let mut layout_cache = LayoutSourceCache::default();
1734 render_layout_multi_figure_chart_document_cached(doc, &mut layout_cache)
1735}
1736
1737#[cfg(not(target_arch = "wasm32"))]
1738fn render_layout_multi_figure_chart_document_cached(
1739 doc: &PdfDocument,
1740 layout_cache: &mut LayoutSourceCache,
1741) -> Option<String> {
1742 if doc.number_of_pages != 1 {
1743 return None;
1744 }
1745
1746 let layout = layout_cache.bbox_layout(doc)?;
1747 let figures = detect_layout_multi_figure_chart_sections(&layout.lines)?;
1748 let rendered_table_count = figures
1749 .iter()
1750 .filter(|figure| figure.labels.len() >= 4 && figure.labels.len() == figure.values.len())
1751 .count();
1752 if figures.len() < 2 || rendered_table_count == 0 {
1753 return None;
1754 }
1755
1756 let mut output = String::from("# Figures from the Document\n\n");
1757 for figure in figures {
1758 output.push_str("## ");
1759 output.push_str(figure.caption.trim());
1760 output.push_str("\n\n");
1761
1762 if figure.labels.len() >= 4 && figure.labels.len() == figure.values.len() {
1763 let label_header = if figure
1764 .labels
1765 .iter()
1766 .all(|label| looks_like_yearish_label(label))
1767 {
1768 "Year"
1769 } else {
1770 "Label"
1771 };
1772 let value_header = chart_value_header(&figure.caption);
1773 output.push_str(&format!("| {} | {} |\n", label_header, value_header));
1774 output.push_str("| --- | --- |\n");
1775 for (label, value) in figure.labels.iter().zip(figure.values.iter()) {
1776 output.push_str(&format!("| {} | {} |\n", label, value));
1777 }
1778 output.push('\n');
1779 }
1780
1781 if let Some(source) = figure.source.as_deref() {
1782 output.push('*');
1783 output.push_str(&escape_md_line_start(source.trim()));
1784 output.push_str("*\n\n");
1785 }
1786 }
1787
1788 Some(output.trim_end().to_string() + "\n")
1789}
1790
1791#[cfg(not(target_arch = "wasm32"))]
1792fn detect_layout_multi_figure_chart_sections(
1793 lines: &[BBoxLayoutLine],
1794) -> Option<Vec<LayoutSeriesFigure>> {
1795 let caption_indices = lines
1796 .iter()
1797 .enumerate()
1798 .filter_map(|(idx, line)| {
1799 let text = bbox_layout_line_text(line);
1800 (text.starts_with("Figure ") && text.split_whitespace().count() >= 4).then_some(idx)
1801 })
1802 .collect::<Vec<_>>();
1803 if caption_indices.len() < 2 {
1804 return None;
1805 }
1806
1807 let mut figures = Vec::new();
1808 for (pos, caption_idx) in caption_indices.iter().enumerate() {
1809 let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());
1810 let caption = bbox_layout_line_text(&lines[*caption_idx]);
1811
1812 let source_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
1813 bbox_layout_line_text(&lines[*idx])
1814 .to_ascii_lowercase()
1815 .starts_with("source:")
1816 });
1817
1818 let source = source_idx.map(|idx| {
1819 let mut source_lines = vec![&lines[idx]];
1820 let mut cursor = idx + 1;
1821 while cursor < next_caption_idx {
1822 let text = bbox_layout_line_text(&lines[cursor]);
1823 if text.starts_with("Figure ") || looks_like_footer_banner(&text) || text.is_empty()
1824 {
1825 break;
1826 }
1827 source_lines.push(&lines[cursor]);
1828 if text.ends_with('.') {
1829 break;
1830 }
1831 cursor += 1;
1832 }
1833 join_layout_lines_as_paragraph(&source_lines)
1834 });
1835
1836 let series_region = &lines[*caption_idx + 1..source_idx.unwrap_or(next_caption_idx)];
1837 let anchors = extract_year_label_anchors_from_section(series_region);
1838 let (labels, values) = if anchors.len() >= 4 {
1839 let values = map_series_values_to_label_anchors(&anchors, series_region);
1840 (
1841 anchors
1842 .into_iter()
1843 .map(|anchor| anchor.text)
1844 .collect::<Vec<_>>(),
1845 values,
1846 )
1847 } else {
1848 (Vec::new(), Vec::new())
1849 };
1850
1851 if source.is_some() || !values.is_empty() {
1852 figures.push(LayoutSeriesFigure {
1853 caption: normalize_layout_dashboard_text(&caption),
1854 labels,
1855 values,
1856 source,
1857 });
1858 }
1859 }
1860
1861 (!figures.is_empty()).then_some(figures)
1862}
1863
1864#[cfg(not(target_arch = "wasm32"))]
1865fn extract_year_label_anchors_from_section(lines: &[BBoxLayoutLine]) -> Vec<LayoutTextFragment> {
1866 let mut year_words = lines
1867 .iter()
1868 .flat_map(|line| line.words.iter())
1869 .filter_map(|word| {
1870 let token = word
1871 .text
1872 .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1873 looks_like_year_token(token).then_some((word.bbox.center_y(), word.clone()))
1874 })
1875 .collect::<Vec<_>>();
1876 if year_words.len() < 4 {
1877 return Vec::new();
1878 }
1879
1880 year_words.sort_by(|left, right| {
1881 right
1882 .0
1883 .partial_cmp(&left.0)
1884 .unwrap_or(std::cmp::Ordering::Equal)
1885 });
1886
1887 let mut best_band = Vec::<BBoxLayoutWord>::new();
1888 for (center_y, _) in &year_words {
1889 let band = year_words
1890 .iter()
1891 .filter(|(candidate_y, _)| (*candidate_y - *center_y).abs() <= 12.0)
1892 .map(|(_, word)| word.clone())
1893 .collect::<Vec<_>>();
1894 if band.len() > best_band.len() {
1895 best_band = band;
1896 }
1897 }
1898 if best_band.len() < 4 {
1899 return Vec::new();
1900 }
1901
1902 let band_center = best_band
1903 .iter()
1904 .map(|word| word.bbox.center_y())
1905 .sum::<f64>()
1906 / best_band.len() as f64;
1907 let mut band_words = lines
1908 .iter()
1909 .flat_map(|line| line.words.iter())
1910 .filter(|word| (word.bbox.center_y() - band_center).abs() <= 12.0)
1911 .cloned()
1912 .collect::<Vec<_>>();
1913 band_words.sort_by(|left, right| {
1914 left.bbox
1915 .left_x
1916 .partial_cmp(&right.bbox.left_x)
1917 .unwrap_or(std::cmp::Ordering::Equal)
1918 });
1919
1920 let mut anchors = Vec::new();
1921 let mut idx = 0usize;
1922 while idx < band_words.len() {
1923 let token = band_words[idx]
1924 .text
1925 .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1926 if !looks_like_year_token(token) {
1927 idx += 1;
1928 continue;
1929 }
1930
1931 let mut bbox = band_words[idx].bbox.clone();
1932 let mut label = token.to_string();
1933 if let Some(next) = band_words.get(idx + 1) {
1934 let suffix = next
1935 .text
1936 .trim_matches(|ch: char| matches!(ch, ',' | ';' | '.'));
1937 let gap = next.bbox.left_x - band_words[idx].bbox.right_x;
1938 if suffix.starts_with('(') && suffix.ends_with(')') && gap <= 18.0 {
1939 label.push(' ');
1940 label.push_str(suffix);
1941 bbox = bbox.union(&next.bbox);
1942 idx += 1;
1943 }
1944 }
1945
1946 anchors.push(LayoutTextFragment { bbox, text: label });
1947 idx += 1;
1948 }
1949
1950 anchors
1951}
1952
1953#[cfg(not(target_arch = "wasm32"))]
1954fn map_series_values_to_label_anchors(
1955 anchors: &[LayoutTextFragment],
1956 lines: &[BBoxLayoutLine],
1957) -> Vec<String> {
1958 if anchors.len() < 2 {
1959 return Vec::new();
1960 }
1961
1962 let mut spacing = anchors
1963 .windows(2)
1964 .map(|pair| pair[1].bbox.center_x() - pair[0].bbox.center_x())
1965 .filter(|gap| *gap > 0.0)
1966 .collect::<Vec<_>>();
1967 spacing.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
1968 let median_spacing = spacing
1969 .get(spacing.len().saturating_sub(1) / 2)
1970 .copied()
1971 .unwrap_or(48.0);
1972 let max_dx = (median_spacing * 0.42).clamp(18.0, 32.0);
1973
1974 let mut tokens = Vec::<LayoutBarToken>::new();
1975 for line in lines {
1976 for word in &line.words {
1977 let raw = word.text.trim();
1978 if raw.contains('/')
1979 || looks_like_year_token(raw.trim_matches(|ch: char| matches!(ch, ',' | ';' | '.')))
1980 {
1981 continue;
1982 }
1983 let Some(value) = parse_integer_token(raw) else {
1984 continue;
1985 };
1986 tokens.push(LayoutBarToken {
1987 bbox: word.bbox.clone(),
1988 value,
1989 text: sanitize_numberish_token(raw).unwrap_or_else(|| value.to_string()),
1990 });
1991 }
1992 }
1993
1994 let mut used = vec![false; tokens.len()];
1995 let mut values = Vec::with_capacity(anchors.len());
1996 for anchor in anchors {
1997 let anchor_center_x = anchor.bbox.center_x();
1998 let anchor_center_y = anchor.bbox.center_y();
1999 let best = tokens
2000 .iter()
2001 .enumerate()
2002 .filter(|(idx, token)| {
2003 !used[*idx]
2004 && token.bbox.center_y() > anchor_center_y + 8.0
2005 && (token.bbox.center_x() - anchor_center_x).abs() <= max_dx
2006 })
2007 .min_by(|left, right| {
2008 let left_score = (left.1.bbox.center_x() - anchor_center_x).abs()
2009 + (left.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
2010 let right_score = (right.1.bbox.center_x() - anchor_center_x).abs()
2011 + (right.1.bbox.center_y() - anchor_center_y).abs() * 0.05;
2012 left_score
2013 .partial_cmp(&right_score)
2014 .unwrap_or(std::cmp::Ordering::Equal)
2015 });
2016 let Some((best_idx, token)) = best else {
2017 return Vec::new();
2018 };
2019 used[best_idx] = true;
2020 values.push(token.text.clone());
2021 }
2022
2023 values
2024}
2025
2026#[cfg(not(target_arch = "wasm32"))]
2027fn detect_layout_recommendation_infographic(
2028 page_width: f64,
2029 lines: &[BBoxLayoutLine],
2030) -> Option<LayoutRecommendationInfographic> {
2031 if page_width < 900.0 {
2032 return None;
2033 }
2034
2035 let blocks = collect_bbox_layout_blocks(lines);
2036 let page_top = lines
2037 .iter()
2038 .map(|line| line.bbox.top_y)
2039 .fold(0.0_f64, f64::max);
2040
2041 let title_block = blocks
2042 .iter()
2043 .filter(|block| {
2044 block.bbox.width() >= page_width * 0.55
2045 && block.bbox.top_y >= page_top - 105.0
2046 && bbox_layout_block_text(block).split_whitespace().count() >= 8
2047 })
2048 .max_by(|left, right| {
2049 left.bbox
2050 .width()
2051 .partial_cmp(&right.bbox.width())
2052 .unwrap_or(std::cmp::Ordering::Equal)
2053 })?;
2054 let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
2055 if title.split_whitespace().count() < 8 {
2056 return None;
2057 }
2058
2059 let eyebrow = blocks
2060 .iter()
2061 .filter(|block| {
2062 block.block_id != title_block.block_id
2063 && block.bbox.top_y > title_block.bbox.top_y
2064 && block.bbox.width() >= page_width * 0.1
2065 })
2066 .max_by(|left, right| {
2067 left.bbox
2068 .top_y
2069 .partial_cmp(&right.bbox.top_y)
2070 .unwrap_or(std::cmp::Ordering::Equal)
2071 })
2072 .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));
2073
2074 let title_bottom = title_block.bbox.bottom_y;
2075 let region_width = page_width / 3.0;
2076 let left_panel = detect_layout_recommendation_hit_ratio_panel(
2077 &blocks,
2078 lines,
2079 0.0,
2080 region_width,
2081 title_bottom,
2082 )?;
2083 let middle_panel = detect_layout_recommendation_ranking_panel(
2084 &blocks,
2085 lines,
2086 region_width,
2087 region_width * 2.0,
2088 title_bottom,
2089 )?;
2090 let right_panel = detect_layout_recommendation_accuracy_panel(
2091 &blocks,
2092 lines,
2093 region_width * 2.0,
2094 page_width,
2095 title_bottom,
2096 )?;
2097
2098 Some(LayoutRecommendationInfographic {
2099 eyebrow,
2100 title,
2101 panels: vec![left_panel, middle_panel, right_panel],
2102 })
2103}
2104
2105#[cfg(not(target_arch = "wasm32"))]
2106#[allow(dead_code)]
2107fn render_layout_ocr_benchmark_dashboard_document(doc: &PdfDocument) -> Option<String> {
2108 let mut layout_cache = LayoutSourceCache::default();
2109 render_layout_ocr_benchmark_dashboard_document_cached(doc, &mut layout_cache)
2110}
2111
2112#[cfg(not(target_arch = "wasm32"))]
2113fn render_layout_ocr_benchmark_dashboard_document_cached(
2114 doc: &PdfDocument,
2115 layout_cache: &mut LayoutSourceCache,
2116) -> Option<String> {
2117 if doc.number_of_pages != 1 {
2118 return None;
2119 }
2120
2121 let layout = layout_cache.bbox_layout(doc)?;
2122 let dashboard = detect_layout_ocr_benchmark_dashboard(layout.page_width, &layout.lines)?;
2123
2124 let mut output = String::new();
2125 if let Some(eyebrow) = dashboard.eyebrow.as_deref() {
2126 output.push_str("## ");
2127 output.push_str(eyebrow.trim());
2128 output.push_str("\n\n");
2129 }
2130 output.push_str("# ");
2131 output.push_str(dashboard.title.trim());
2132 output.push_str("\n\n");
2133
2134 output.push_str("## ");
2135 output.push_str(dashboard.left_heading.trim());
2136 output.push_str("\n\n");
2137 let mut left_table = Vec::with_capacity(dashboard.left_rows.len() + 1);
2138 left_table.push({
2139 let mut row = vec!["Company".to_string()];
2140 row.extend(dashboard.left_columns.clone());
2141 row
2142 });
2143 left_table.extend(dashboard.left_rows.clone());
2144 output.push_str(&render_pipe_rows(&left_table));
2145
2146 output.push_str("## ");
2147 output.push_str(dashboard.right_heading.trim());
2148 output.push_str("\n\n");
2149 let mut right_table = Vec::with_capacity(dashboard.right_rows.len() + 1);
2150 right_table.push(vec![
2151 "Metric".to_string(),
2152 "Company A".to_string(),
2153 "Company B".to_string(),
2154 "upstage".to_string(),
2155 ]);
2156 right_table.extend(dashboard.right_rows.clone());
2157 output.push_str(&render_pipe_rows(&right_table));
2158
2159 if !dashboard.definition_notes.is_empty() {
2160 output.push_str("---\n\n");
2161 for note in &dashboard.definition_notes {
2162 output.push_str(note.trim());
2163 output.push_str("\n\n");
2164 }
2165 }
2166 if !dashboard.source_notes.is_empty() {
2167 output.push_str("---\n\n");
2168 for note in &dashboard.source_notes {
2169 output.push_str(note.trim());
2170 output.push_str("\n\n");
2171 }
2172 }
2173
2174 Some(output.trim_end().to_string() + "\n")
2175}
2176
2177#[cfg(not(target_arch = "wasm32"))]
2178fn detect_layout_ocr_benchmark_dashboard(
2179 page_width: f64,
2180 lines: &[BBoxLayoutLine],
2181) -> Option<LayoutOcrDashboard> {
2182 if page_width < 680.0 {
2183 return None;
2184 }
2185
2186 let page_mid = page_width / 2.0;
2187 let blocks = collect_bbox_layout_blocks(lines);
2188 let page_top = lines
2189 .iter()
2190 .map(|line| line.bbox.top_y)
2191 .fold(0.0_f64, f64::max);
2192
2193 let title_block = blocks
2194 .iter()
2195 .filter(|block| {
2196 block.bbox.width() >= page_width * 0.45 && block.bbox.top_y >= page_top - 40.0
2197 })
2198 .max_by(|left, right| {
2199 left.bbox
2200 .width()
2201 .partial_cmp(&right.bbox.width())
2202 .unwrap_or(std::cmp::Ordering::Equal)
2203 })?;
2204 let title = normalize_layout_dashboard_text(&bbox_layout_block_text(title_block));
2205 if title.split_whitespace().count() < 5 {
2206 return None;
2207 }
2208
2209 let eyebrow = blocks
2210 .iter()
2211 .filter(|block| {
2212 block.block_id != title_block.block_id
2213 && block.bbox.top_y > title_block.bbox.top_y
2214 && block.bbox.width() >= page_width * 0.12
2215 })
2216 .max_by(|left, right| {
2217 left.bbox
2218 .top_y
2219 .partial_cmp(&right.bbox.top_y)
2220 .unwrap_or(std::cmp::Ordering::Equal)
2221 })
2222 .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)));
2223
2224 let left_title_blocks = blocks
2225 .iter()
2226 .filter(|block| {
2227 block.bbox.right_x <= page_mid
2228 && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
2229 && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
2230 && !bbox_layout_block_text(block)
2231 .chars()
2232 .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
2233 })
2234 .cloned()
2235 .collect::<Vec<_>>();
2236 let right_title_blocks = blocks
2237 .iter()
2238 .filter(|block| {
2239 block.bbox.left_x >= page_mid
2240 && block.bbox.top_y < title_block.bbox.bottom_y - 25.0
2241 && block.bbox.top_y > title_block.bbox.bottom_y - 95.0
2242 && !bbox_layout_block_text(block)
2243 .chars()
2244 .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
2245 })
2246 .cloned()
2247 .collect::<Vec<_>>();
2248
2249 let left_heading = join_dashboard_title_blocks(&left_title_blocks)?;
2250 let right_heading = join_dashboard_title_blocks(&right_title_blocks)?;
2251 if !left_heading.to_ascii_lowercase().contains("ocr")
2252 || !right_heading.to_ascii_lowercase().contains("document")
2253 {
2254 return None;
2255 }
2256
2257 let left_group_blocks = blocks
2258 .iter()
2259 .filter(|block| {
2260 block.bbox.center_x() < page_mid
2261 && block.bbox.top_y < 90.0
2262 && bbox_layout_block_text(block).contains('(')
2263 })
2264 .cloned()
2265 .collect::<Vec<_>>();
2266 if left_group_blocks.len() != 2 {
2267 return None;
2268 }
2269 let mut left_groups = left_group_blocks
2270 .iter()
2271 .map(|block| {
2272 (
2273 block.bbox.center_x(),
2274 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
2275 )
2276 })
2277 .collect::<Vec<_>>();
2278 left_groups.sort_by(|left, right| {
2279 left.0
2280 .partial_cmp(&right.0)
2281 .unwrap_or(std::cmp::Ordering::Equal)
2282 });
2283
2284 let left_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
2285 bbox.center_x() < page_mid - 20.0 && bbox.top_y > 110.0 && bbox.top_y < 250.0
2286 });
2287 if left_value_tokens.len() < 6 {
2288 return None;
2289 }
2290
2291 let mut left_group_values = vec![Vec::<(f64, String)>::new(), Vec::new()];
2292 for (bbox, value) in left_value_tokens {
2293 let group_idx = if (bbox.center_x() - left_groups[0].0).abs()
2294 <= (bbox.center_x() - left_groups[1].0).abs()
2295 {
2296 0
2297 } else {
2298 1
2299 };
2300 left_group_values[group_idx].push((bbox.center_x(), value));
2301 }
2302 if left_group_values.iter().any(|values| values.len() < 3) {
2303 return None;
2304 }
2305 for values in &mut left_group_values {
2306 values.sort_by(|left, right| {
2307 left.0
2308 .partial_cmp(&right.0)
2309 .unwrap_or(std::cmp::Ordering::Equal)
2310 });
2311 values.truncate(3);
2312 }
2313
2314 let mut company_labels = extract_dashboard_company_labels(&blocks, page_mid);
2315 if company_labels.len() < 2 {
2316 return None;
2317 }
2318 company_labels.truncate(2);
2319 company_labels.push(infer_dashboard_brand_name(&left_heading));
2320
2321 let mut left_rows = Vec::new();
2322 for row_idx in 0..3 {
2323 left_rows.push(vec![
2324 company_labels[row_idx].clone(),
2325 left_group_values[0][row_idx].1.clone(),
2326 left_group_values[1][row_idx].1.clone(),
2327 ]);
2328 }
2329
2330 let metric_blocks = blocks
2331 .iter()
2332 .filter(|block| {
2333 block.bbox.center_x() > page_mid
2334 && block.bbox.top_y > 95.0
2335 && block.bbox.top_y < 240.0
2336 && matches!(
2337 normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
2338 text if text.starts_with("ocr") || text.starts_with("parsingf1")
2339 )
2340 })
2341 .cloned()
2342 .collect::<Vec<_>>();
2343 if metric_blocks.len() < 4 {
2344 return None;
2345 }
2346
2347 let mut metrics = metric_blocks
2348 .iter()
2349 .map(|block| {
2350 (
2351 block.bbox.center_y(),
2352 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
2353 )
2354 })
2355 .collect::<Vec<_>>();
2356 metrics.sort_by(|left, right| {
2357 right
2358 .0
2359 .partial_cmp(&left.0)
2360 .unwrap_or(std::cmp::Ordering::Equal)
2361 });
2362 metrics.truncate(4);
2363
2364 let right_value_tokens = collect_layout_decimal_tokens(lines, |bbox| {
2365 bbox.center_x() > page_mid + 20.0 && bbox.top_y > 90.0 && bbox.top_y < 250.0
2366 });
2367 if right_value_tokens.len() < 10 {
2368 return None;
2369 }
2370
2371 let mut metric_values = vec![Vec::<(f64, String)>::new(); metrics.len()];
2372 for (bbox, value) in right_value_tokens {
2373 let Some((metric_idx, _)) = metrics
2374 .iter()
2375 .enumerate()
2376 .map(|(idx, (center_y, _))| (idx, (bbox.center_y() - *center_y).abs()))
2377 .min_by(|left, right| {
2378 left.1
2379 .partial_cmp(&right.1)
2380 .unwrap_or(std::cmp::Ordering::Equal)
2381 })
2382 else {
2383 continue;
2384 };
2385 metric_values[metric_idx].push((bbox.center_x(), value));
2386 }
2387
2388 let mut right_rows = Vec::new();
2389 for (idx, (_, metric_name)) in metrics.iter().enumerate() {
2390 let mut values = metric_values[idx].clone();
2391 values.sort_by(|left, right| {
2392 left.0
2393 .partial_cmp(&right.0)
2394 .unwrap_or(std::cmp::Ordering::Equal)
2395 });
2396 values.dedup_by(|left, right| left.1 == right.1);
2397 if values.len() < 2 {
2398 return None;
2399 }
2400 if values.len() == 2 {
2401 values.push(values[1].clone());
2402 }
2403 values.truncate(3);
2404 right_rows.push(vec![
2405 metric_name.clone(),
2406 normalize_layout_decimal_value(&values[0].1),
2407 normalize_layout_decimal_value(&values[1].1),
2408 normalize_layout_decimal_value(&values[2].1),
2409 ]);
2410 }
2411
2412 let definition_notes = collect_dashboard_notes(&blocks, page_mid, false);
2413 let source_notes = collect_dashboard_notes(&blocks, page_mid, true);
2414
2415 Some(LayoutOcrDashboard {
2416 eyebrow,
2417 title,
2418 left_heading,
2419 left_columns: left_groups.into_iter().map(|(_, text)| text).collect(),
2420 left_rows,
2421 right_heading,
2422 right_rows,
2423 definition_notes,
2424 source_notes,
2425 })
2426}
2427
2428#[cfg(not(target_arch = "wasm32"))]
2429fn detect_layout_recommendation_hit_ratio_panel(
2430 blocks: &[BBoxLayoutBlock],
2431 lines: &[BBoxLayoutLine],
2432 left_x: f64,
2433 right_x: f64,
2434 title_bottom: f64,
2435) -> Option<LayoutRecommendationPanel> {
2436 let (heading_block, subtitle_block) =
2437 extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2438 let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2439 let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2440 let width = right_x - left_x;
2441 let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2442
2443 let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2444 bbox.center_x() > left_x + width * 0.52
2445 && bbox.center_x() < right_x - 8.0
2446 && bbox.top_y < chart_cutoff
2447 });
2448 values.sort_by(|left, right| {
2449 right
2450 .0
2451 .center_y()
2452 .partial_cmp(&left.0.center_y())
2453 .unwrap_or(std::cmp::Ordering::Equal)
2454 });
2455 values.dedup_by(|left, right| {
2456 (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
2457 });
2458 if values.len() < 4 {
2459 return None;
2460 }
2461
2462 let labels = collect_layout_panel_alpha_blocks(
2463 blocks,
2464 left_x,
2465 right_x,
2466 title_bottom,
2467 chart_cutoff,
2468 Some(left_x + width * 0.55),
2469 );
2470 let rows = pair_layout_decimal_rows(&labels, &values, 4)?;
2471 let notes = pair_layout_emphasis_notes(
2472 &rows,
2473 &collect_layout_emphasis_tokens(lines, |bbox| {
2474 bbox.center_x() > left_x + width * 0.48
2475 && bbox.center_x() < right_x
2476 && bbox.top_y < chart_cutoff
2477 }),
2478 "increase",
2479 );
2480 let metric_label =
2481 extract_layout_comparison_metric(&subtitle).unwrap_or_else(|| "Value".to_string());
2482
2483 Some(LayoutRecommendationPanel {
2484 heading,
2485 subtitle,
2486 header: vec!["Model".to_string(), metric_label],
2487 rows,
2488 notes,
2489 })
2490}
2491
2492#[cfg(not(target_arch = "wasm32"))]
2493fn detect_layout_recommendation_ranking_panel(
2494 blocks: &[BBoxLayoutBlock],
2495 lines: &[BBoxLayoutLine],
2496 left_x: f64,
2497 right_x: f64,
2498 title_bottom: f64,
2499) -> Option<LayoutRecommendationPanel> {
2500 let (heading_block, subtitle_block) =
2501 extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2502 let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2503 let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2504 let width = right_x - left_x;
2505 let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2506
2507 let row_labels = collect_layout_panel_alpha_blocks(
2508 blocks,
2509 left_x,
2510 right_x,
2511 title_bottom,
2512 chart_cutoff,
2513 Some(left_x + width * 0.48),
2514 )
2515 .into_iter()
2516 .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(&block)))
2517 .collect::<Vec<_>>();
2518 if row_labels.len() < 8 {
2519 return None;
2520 }
2521
2522 let headers = extract_layout_ranking_headers(blocks, left_x, right_x, chart_cutoff)
2523 .unwrap_or_else(|| vec!["Recall@10".to_string(), "Accuracy".to_string()]);
2524 let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2525 bbox.center_x() > left_x + width * 0.42
2526 && bbox.center_x() < right_x - 10.0
2527 && bbox.top_y < chart_cutoff
2528 });
2529 values.sort_by(|left, right| {
2530 left.0
2531 .left_x
2532 .partial_cmp(&right.0.left_x)
2533 .unwrap_or(std::cmp::Ordering::Equal)
2534 });
2535
2536 let mut rows = row_labels
2537 .into_iter()
2538 .map(|label| vec![label, String::new(), String::new()])
2539 .collect::<Vec<_>>();
2540 if let Some(first) = rows.first_mut() {
2541 if let Some((_, value)) = values.first() {
2542 first[1] = normalize_layout_decimal_value(value);
2543 }
2544 if let Some((_, value)) = values.get(1) {
2545 first[2] = normalize_layout_decimal_value(value);
2546 }
2547 }
2548
2549 let mut notes = collect_layout_ranking_notes(blocks, left_x, right_x, chart_cutoff);
2550 notes.extend(
2551 collect_layout_emphasis_tokens(lines, |bbox| {
2552 bbox.center_x() > left_x + width * 0.55
2553 && bbox.center_x() < right_x
2554 && bbox.top_y < chart_cutoff
2555 })
2556 .into_iter()
2557 .map(|(_, token)| format!("{} increase", token.trim_end_matches('↑'))),
2558 );
2559
2560 Some(LayoutRecommendationPanel {
2561 heading,
2562 subtitle,
2563 header: vec!["Method".to_string(), headers[0].clone(), headers[1].clone()],
2564 rows,
2565 notes,
2566 })
2567}
2568
2569#[cfg(not(target_arch = "wasm32"))]
2570fn detect_layout_recommendation_accuracy_panel(
2571 blocks: &[BBoxLayoutBlock],
2572 lines: &[BBoxLayoutLine],
2573 left_x: f64,
2574 right_x: f64,
2575 title_bottom: f64,
2576) -> Option<LayoutRecommendationPanel> {
2577 let (heading_block, subtitle_block) =
2578 extract_layout_panel_heading_and_subtitle(blocks, left_x, right_x, title_bottom)?;
2579 let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(&heading_block));
2580 let subtitle = normalize_layout_dashboard_text(&bbox_layout_block_text(&subtitle_block));
2581 let chart_cutoff = subtitle_block.bbox.bottom_y - 10.0;
2582
2583 let mut values = collect_layout_decimal_tokens(lines, |bbox| {
2584 bbox.center_x() > left_x + 20.0 && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
2585 });
2586 values.sort_by(|left, right| {
2587 right
2588 .0
2589 .center_y()
2590 .partial_cmp(&left.0.center_y())
2591 .unwrap_or(std::cmp::Ordering::Equal)
2592 });
2593 values.dedup_by(|left, right| {
2594 (left.0.center_y() - right.0.center_y()).abs() <= 8.0 && left.1 == right.1
2595 });
2596 if values.len() < 2 {
2597 return None;
2598 }
2599 let min_value_top_y = values
2600 .iter()
2601 .map(|(bbox, _)| bbox.top_y)
2602 .fold(f64::INFINITY, f64::min);
2603
2604 let labels = collect_layout_panel_alpha_blocks(
2605 blocks,
2606 left_x,
2607 right_x,
2608 title_bottom,
2609 chart_cutoff,
2610 None,
2611 )
2612 .into_iter()
2613 .filter(|block| block.bbox.top_y < min_value_top_y - 70.0)
2614 .collect::<Vec<_>>();
2615 let rows = pair_layout_decimal_rows(&labels, &values, 2)?;
2616
2617 let mut notes = Vec::new();
2618 if let Some(description) = collect_layout_note_phrase(blocks, left_x, right_x, chart_cutoff) {
2619 if let Some((_, emphasis)) = collect_layout_emphasis_tokens(lines, |bbox| {
2620 bbox.center_x() > left_x && bbox.center_x() < right_x && bbox.top_y < chart_cutoff
2621 })
2622 .into_iter()
2623 .next()
2624 {
2625 notes.push(format!(
2626 "{}, {} increase",
2627 description,
2628 emphasis.trim_end_matches('↑')
2629 ));
2630 }
2631 }
2632
2633 Some(LayoutRecommendationPanel {
2634 heading,
2635 subtitle,
2636 header: vec!["Model".to_string(), "Accuracy".to_string()],
2637 rows,
2638 notes,
2639 })
2640}
2641
2642#[cfg(not(target_arch = "wasm32"))]
2643fn extract_layout_panel_heading_and_subtitle(
2644 blocks: &[BBoxLayoutBlock],
2645 left_x: f64,
2646 right_x: f64,
2647 title_bottom: f64,
2648) -> Option<(BBoxLayoutBlock, BBoxLayoutBlock)> {
2649 let mut band_blocks = blocks
2650 .iter()
2651 .filter(|block| {
2652 block.bbox.center_x() >= left_x
2653 && block.bbox.center_x() <= right_x
2654 && block.bbox.top_y < title_bottom - 8.0
2655 && block.bbox.top_y > title_bottom - 90.0
2656 && bbox_layout_block_text(block)
2657 .chars()
2658 .any(char::is_alphabetic)
2659 })
2660 .cloned()
2661 .collect::<Vec<_>>();
2662 band_blocks.sort_by(|left, right| {
2663 right
2664 .bbox
2665 .top_y
2666 .partial_cmp(&left.bbox.top_y)
2667 .unwrap_or(std::cmp::Ordering::Equal)
2668 });
2669
2670 let heading = band_blocks.first()?.clone();
2671 let subtitle = band_blocks
2672 .iter()
2673 .find(|block| {
2674 block.block_id != heading.block_id
2675 && block.bbox.top_y < heading.bbox.bottom_y + 8.0
2676 && block.bbox.top_y > heading.bbox.bottom_y - 40.0
2677 })?
2678 .clone();
2679 Some((heading, subtitle))
2680}
2681
2682#[cfg(not(target_arch = "wasm32"))]
2683fn collect_layout_panel_alpha_blocks(
2684 blocks: &[BBoxLayoutBlock],
2685 left_x: f64,
2686 right_x: f64,
2687 title_bottom: f64,
2688 chart_cutoff: f64,
2689 max_left_x: Option<f64>,
2690) -> Vec<BBoxLayoutBlock> {
2691 let mut alpha_blocks = blocks
2692 .iter()
2693 .filter(|block| {
2694 block.bbox.center_x() >= left_x
2695 && block.bbox.center_x() <= right_x
2696 && block.bbox.top_y < chart_cutoff
2697 && block.bbox.top_y > title_bottom - 390.0
2698 && max_left_x.is_none_or(|limit| block.bbox.left_x <= limit)
2699 })
2700 .filter_map(|block| {
2701 let text = normalize_layout_panel_text(&bbox_layout_block_text(block));
2702 let token_count = text.split_whitespace().count();
2703 let has_alpha = text.chars().any(char::is_alphabetic);
2704 let has_numeric_marker = text
2705 .chars()
2706 .any(|ch| ch.is_ascii_digit() || ch == '%' || ch == ':');
2707 (has_alpha
2708 && token_count >= 1
2709 && !has_numeric_marker
2710 && !text.starts_with(':')
2711 && !text.eq_ignore_ascii_case("comparison"))
2712 .then_some(block.clone())
2713 })
2714 .collect::<Vec<_>>();
2715 alpha_blocks.sort_by(|left, right| {
2716 right
2717 .bbox
2718 .center_y()
2719 .partial_cmp(&left.bbox.center_y())
2720 .unwrap_or(std::cmp::Ordering::Equal)
2721 });
2722 alpha_blocks
2723}
2724
2725#[cfg(not(target_arch = "wasm32"))]
2726fn pair_layout_decimal_rows(
2727 label_blocks: &[BBoxLayoutBlock],
2728 value_tokens: &[(BoundingBox, String)],
2729 expected_len: usize,
2730) -> Option<Vec<Vec<String>>> {
2731 let mut used = HashSet::new();
2732 let mut rows = Vec::new();
2733
2734 for (bbox, value) in value_tokens.iter().take(expected_len) {
2735 let Some((label_idx, _)) = label_blocks
2736 .iter()
2737 .enumerate()
2738 .filter(|(idx, block)| {
2739 !used.contains(idx) && block.bbox.center_x() <= bbox.center_x() + 24.0
2740 })
2741 .map(|(idx, block)| (idx, (block.bbox.center_y() - bbox.center_y()).abs()))
2742 .min_by(|left, right| {
2743 left.1
2744 .partial_cmp(&right.1)
2745 .unwrap_or(std::cmp::Ordering::Equal)
2746 })
2747 else {
2748 continue;
2749 };
2750 if label_blocks[label_idx].bbox.center_y() - bbox.center_y() > 30.0 {
2751 continue;
2752 }
2753
2754 used.insert(label_idx);
2755 rows.push(vec![
2756 normalize_layout_panel_text(&bbox_layout_block_text(&label_blocks[label_idx])),
2757 normalize_layout_decimal_value(value),
2758 ]);
2759 }
2760
2761 (rows.len() >= expected_len).then_some(rows)
2762}
2763
2764#[cfg(not(target_arch = "wasm32"))]
2765fn collect_layout_emphasis_tokens<F>(
2766 lines: &[BBoxLayoutLine],
2767 bbox_filter: F,
2768) -> Vec<(BoundingBox, String)>
2769where
2770 F: Fn(&BoundingBox) -> bool,
2771{
2772 let emphasis_re = Regex::new(r"^\d+(?:\.\d+)?(?:X|%)↑?$").ok();
2773 let Some(emphasis_re) = emphasis_re else {
2774 return Vec::new();
2775 };
2776
2777 let mut tokens = Vec::new();
2778 for line in lines {
2779 for word in &line.words {
2780 let candidate = word.text.trim();
2781 if bbox_filter(&word.bbox) && emphasis_re.is_match(candidate) {
2782 tokens.push((word.bbox.clone(), candidate.to_string()));
2783 }
2784 }
2785 }
2786 tokens.sort_by(|left, right| {
2787 right
2788 .0
2789 .center_y()
2790 .partial_cmp(&left.0.center_y())
2791 .unwrap_or(std::cmp::Ordering::Equal)
2792 });
2793 tokens
2794}
2795
2796#[cfg(not(target_arch = "wasm32"))]
2797fn pair_layout_emphasis_notes(
2798 rows: &[Vec<String>],
2799 emphasis_tokens: &[(BoundingBox, String)],
2800 suffix: &str,
2801) -> Vec<String> {
2802 let mut notes = Vec::new();
2803 for ((_, token), row) in emphasis_tokens.iter().zip(rows.iter().skip(2)) {
2804 if let Some(label) = row.first() {
2805 notes.push(format!(
2806 "{}: {} {}",
2807 label.trim(),
2808 token.trim_end_matches('↑'),
2809 suffix
2810 ));
2811 }
2812 }
2813 notes
2814}
2815
/// Returns the two whitespace-separated words that directly precede the first "comparison"
/// token (ASCII case-insensitive) in `text`, joined by a single space.
///
/// Yields `None` when no such token exists or when fewer than two words come before it.
#[cfg(not(target_arch = "wasm32"))]
fn extract_layout_comparison_metric(text: &str) -> Option<String> {
    let words: Vec<&str> = text.split_whitespace().collect();
    let keyword_at = words
        .iter()
        .position(|word| word.eq_ignore_ascii_case("comparison"))?;
    // Fails (and returns None) when fewer than two words precede the keyword.
    let metric_start = keyword_at.checked_sub(2)?;
    let metric = words[metric_start..keyword_at].join(" ");
    if metric.trim().is_empty() {
        None
    } else {
        Some(metric)
    }
}
2828
/// Capitalises the first character of each whitespace-separated word and joins the words with
/// single spaces.
///
/// Words are left untouched when they contain no ASCII letter that is not uppercase (for
/// example acronyms or pure numbers). Only ASCII letters are uppercased; the remainder of each
/// word is copied as is. Blank input yields an empty string.
#[cfg(not(target_arch = "wasm32"))]
fn title_case_metric_label(text: &str) -> String {
    text.split_whitespace()
        .map(|word| {
            let needs_capital = word
                .chars()
                .any(|ch| ch.is_ascii_alphabetic() && !ch.is_uppercase());
            if !needs_capital {
                return word.to_string();
            }
            let mut rest = word.chars();
            match rest.next() {
                Some(initial) => {
                    let mut capitalised = String::with_capacity(word.len());
                    capitalised.push(initial.to_ascii_uppercase());
                    capitalised.push_str(rest.as_str());
                    capitalised
                }
                None => String::new(),
            }
        })
        .collect::<Vec<_>>()
        .join(" ")
}
2857
2858#[cfg(not(target_arch = "wasm32"))]
2859fn normalize_layout_panel_text(text: &str) -> String {
2860 normalize_layout_dashboard_text(text)
2861 .replace(" _", "_")
2862 .replace("_ ", "_")
2863}
2864
2865#[cfg(not(target_arch = "wasm32"))]
2866fn extract_layout_ranking_headers(
2867 blocks: &[BBoxLayoutBlock],
2868 left_x: f64,
2869 right_x: f64,
2870 chart_cutoff: f64,
2871) -> Option<Vec<String>> {
2872 let legend = blocks
2873 .iter()
2874 .filter(|block| {
2875 block.bbox.center_x() >= left_x
2876 && block.bbox.center_x() <= right_x
2877 && block.bbox.top_y < chart_cutoff
2878 && bbox_layout_block_text(block).contains(':')
2879 })
2880 .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(block)))
2881 .collect::<Vec<_>>();
2882 for line in legend {
2883 let segments = line
2884 .split(':')
2885 .map(str::trim)
2886 .filter(|segment| !segment.is_empty())
2887 .collect::<Vec<_>>();
2888 let Some(first_segment) = segments.first() else {
2889 continue;
2890 };
2891 let metrics = first_segment
2892 .split(',')
2893 .map(title_case_metric_label)
2894 .filter(|part| !part.trim().is_empty())
2895 .collect::<Vec<_>>();
2896 if metrics.len() >= 2 {
2897 return Some(vec![metrics[0].clone(), metrics[1].clone()]);
2898 }
2899 }
2900 None
2901}
2902
2903#[cfg(not(target_arch = "wasm32"))]
2904fn collect_layout_ranking_notes(
2905 blocks: &[BBoxLayoutBlock],
2906 left_x: f64,
2907 right_x: f64,
2908 chart_cutoff: f64,
2909) -> Vec<String> {
2910 blocks
2911 .iter()
2912 .filter(|block| {
2913 block.bbox.center_x() >= left_x
2914 && block.bbox.center_x() <= right_x
2915 && block.bbox.top_y < chart_cutoff
2916 && bbox_layout_block_text(block).contains(':')
2917 })
2918 .flat_map(|block| {
2919 normalize_layout_panel_text(&bbox_layout_block_text(block))
2920 .split(':')
2921 .map(str::trim)
2922 .filter(|segment| !segment.is_empty())
2923 .map(ToString::to_string)
2924 .collect::<Vec<_>>()
2925 })
2926 .filter(|note| !note.eq_ignore_ascii_case("recall@10, accuracy"))
2927 .collect()
2928}
2929
2930#[cfg(not(target_arch = "wasm32"))]
2931fn collect_layout_note_phrase(
2932 blocks: &[BBoxLayoutBlock],
2933 left_x: f64,
2934 right_x: f64,
2935 chart_cutoff: f64,
2936) -> Option<String> {
2937 blocks
2938 .iter()
2939 .filter(|block| {
2940 block.bbox.center_x() >= left_x
2941 && block.bbox.center_x() <= right_x
2942 && block.bbox.top_y < chart_cutoff
2943 && bbox_layout_block_text(block).split_whitespace().count() >= 3
2944 })
2945 .map(|block| normalize_layout_panel_text(&bbox_layout_block_text(block)))
2946 .find(|text| text.to_ascii_lowercase().contains("compared"))
2947}
2948
2949#[cfg(not(target_arch = "wasm32"))]
2950fn collect_bbox_layout_blocks(lines: &[BBoxLayoutLine]) -> Vec<BBoxLayoutBlock> {
2951 let mut grouped: HashMap<usize, Vec<BBoxLayoutLine>> = HashMap::new();
2952 for line in lines {
2953 grouped.entry(line.block_id).or_default().push(line.clone());
2954 }
2955
2956 let mut blocks = grouped
2957 .into_iter()
2958 .map(|(block_id, mut lines)| {
2959 lines.sort_by(|left, right| {
2960 cmp_banded_reading_order(&left.bbox, &right.bbox, 3.0)
2961 .then_with(|| left.block_id.cmp(&right.block_id))
2962 });
2963 let bbox = lines
2964 .iter()
2965 .skip(1)
2966 .fold(lines[0].bbox.clone(), |acc, line| acc.union(&line.bbox));
2967 BBoxLayoutBlock {
2968 block_id,
2969 bbox,
2970 lines,
2971 }
2972 })
2973 .collect::<Vec<_>>();
2974 blocks.sort_by(|left, right| {
2975 cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
2976 .then_with(|| left.block_id.cmp(&right.block_id))
2977 });
2978 blocks
2979}
2980
2981#[cfg(not(target_arch = "wasm32"))]
2982fn bbox_layout_block_text(block: &BBoxLayoutBlock) -> String {
2983 join_layout_lines_as_paragraph(&block.lines.iter().collect::<Vec<_>>())
2984}
2985
2986#[cfg(not(target_arch = "wasm32"))]
2987fn join_dashboard_title_blocks(blocks: &[BBoxLayoutBlock]) -> Option<String> {
2988 let mut blocks = blocks.to_vec();
2989 blocks.sort_by(|left, right| {
2990 right
2991 .bbox
2992 .top_y
2993 .partial_cmp(&left.bbox.top_y)
2994 .unwrap_or(std::cmp::Ordering::Equal)
2995 });
2996 let text = blocks
2997 .iter()
2998 .map(bbox_layout_block_text)
2999 .filter(|text| !text.trim().is_empty())
3000 .collect::<Vec<_>>()
3001 .join(" ");
3002 let normalized = normalize_layout_dashboard_text(&text);
3003 (!normalized.trim().is_empty()).then_some(normalized)
3004}
3005
3006#[cfg(not(target_arch = "wasm32"))]
3007fn collect_layout_decimal_tokens<F>(
3008 lines: &[BBoxLayoutLine],
3009 bbox_filter: F,
3010) -> Vec<(BoundingBox, String)>
3011where
3012 F: Fn(&BoundingBox) -> bool,
3013{
3014 let decimal_re = Regex::new(r"^\d+\.\d+$|^\d+\.$").ok();
3015 let Some(decimal_re) = decimal_re else {
3016 return Vec::new();
3017 };
3018
3019 let mut tokens = Vec::new();
3020 for line in lines {
3021 for word in &line.words {
3022 let candidate = word.text.trim().trim_matches(|ch| ch == ',' || ch == ';');
3023 if !bbox_filter(&word.bbox) || !decimal_re.is_match(candidate) {
3024 continue;
3025 }
3026 tokens.push((word.bbox.clone(), candidate.to_string()));
3027 }
3028 }
3029 tokens
3030}
3031
3032#[cfg(not(target_arch = "wasm32"))]
3033fn extract_dashboard_company_labels(blocks: &[BBoxLayoutBlock], page_mid: f64) -> Vec<String> {
3034 let company_blocks = blocks
3035 .iter()
3036 .filter(|block| {
3037 block.bbox.center_x() < page_mid
3038 && (65.0..110.0).contains(&block.bbox.top_y)
3039 && bbox_layout_block_text(block) == "Company"
3040 })
3041 .collect::<Vec<_>>();
3042 let marker_blocks = blocks
3043 .iter()
3044 .filter(|block| {
3045 block.bbox.center_x() < page_mid
3046 && (60.0..105.0).contains(&block.bbox.top_y)
3047 && matches!(
3048 normalize_heading_text(&bbox_layout_block_text(block)).as_str(),
3049 "a2" | "b2"
3050 )
3051 })
3052 .map(|block| {
3053 (
3054 block.bbox.center_x(),
3055 block.bbox.center_y(),
3056 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3057 )
3058 })
3059 .collect::<Vec<_>>();
3060
3061 let mut labels = Vec::new();
3062 for company in company_blocks {
3063 if let Some((_, marker_y, marker)) = marker_blocks.iter().min_by(|left, right| {
3064 let left_distance = ((left.0 - company.bbox.center_x()).powi(2)
3065 + (left.1 - company.bbox.center_y()).powi(2))
3066 .sqrt();
3067 let right_distance = ((right.0 - company.bbox.center_x()).powi(2)
3068 + (right.1 - company.bbox.center_y()).powi(2))
3069 .sqrt();
3070 left_distance
3071 .partial_cmp(&right_distance)
3072 .unwrap_or(std::cmp::Ordering::Equal)
3073 }) {
3074 if (company.bbox.center_y() - *marker_y).abs() <= 16.0 || marker_blocks.len() == 1 {
3075 labels.push(format!("{} {}", bbox_layout_block_text(company), marker));
3076 }
3077 }
3078 }
3079
3080 if labels.len() < 2 {
3081 labels.extend(
3082 marker_blocks
3083 .iter()
3084 .map(|(_, _, marker)| format!("Company {marker}")),
3085 );
3086 }
3087
3088 labels.sort();
3089 labels.dedup();
3090 labels
3091}
3092
/// Derives a brand name from the first word of `text`.
///
/// Non-alphanumeric characters are stripped from both ends of the first whitespace-separated
/// word and the remainder is ASCII-lowercased. Falls back to "model" when there is no word or
/// nothing is left after stripping.
#[cfg(not(target_arch = "wasm32"))]
fn infer_dashboard_brand_name(text: &str) -> String {
    let first_word = text.split_whitespace().next().unwrap_or("");
    let core = first_word.trim_matches(|ch: char| !ch.is_alphanumeric());
    if core.is_empty() {
        "model".to_string()
    } else {
        core.to_ascii_lowercase()
    }
}
3102
3103#[cfg(not(target_arch = "wasm32"))]
3104fn collect_dashboard_notes(
3105 blocks: &[BBoxLayoutBlock],
3106 page_mid: f64,
3107 left_half: bool,
3108) -> Vec<String> {
3109 let notes = blocks
3110 .iter()
3111 .filter(|block| {
3112 let in_half = if left_half {
3113 block.bbox.center_x() < page_mid
3114 } else {
3115 block.bbox.center_x() > page_mid
3116 };
3117 in_half && block.bbox.top_y < 50.0
3118 })
3119 .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
3120 .filter(|text| !text.trim().is_empty())
3121 .collect::<Vec<_>>();
3122
3123 let mut merged = Vec::new();
3124 for note in notes {
3125 if note
3126 .chars()
3127 .next()
3128 .is_some_and(|ch| matches!(ch, '¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹'))
3129 {
3130 merged.push(note);
3131 } else if let Some(previous) = merged.last_mut() {
3132 append_cell_text(previous, ¬e);
3133 } else {
3134 merged.push(note);
3135 }
3136 }
3137 merged
3138}
3139
3140#[cfg(not(target_arch = "wasm32"))]
3141fn normalize_layout_dashboard_text(text: &str) -> String {
3142 let normalized = normalize_common_ocr_text(text.trim());
3143 let degree_marker_re = Regex::new(r"(\d)[°º]").ok();
3144 let split_suffix_re = Regex::new(r"\b([A-Za-z])(\d)\s+(\d)\b").ok();
3145 let single_letter_marker_re = Regex::new(r"\b([A-Za-z])\s+(\d{1,2})\b").ok();
3146 let trailing_block_marker_re = Regex::new(r"([A-Za-z][A-Za-z0-9\-]*)\s+(\d{1,2})$").ok();
3147 let trailing_marker_re = Regex::new(r"([[:alpha:]\)])(\d{1,2})\b").ok();
3148 let leading_marker_re = Regex::new(r"^(\d{1,2})([.)]?)\s+").ok();
3149
3150 let cleaned_degree = degree_marker_re
3151 .as_ref()
3152 .map(|re| {
3153 re.replace_all(&normalized, |captures: ®ex::Captures<'_>| {
3154 format!("{} ", &captures[1])
3155 })
3156 .to_string()
3157 })
3158 .unwrap_or(normalized);
3159
3160 let collapsed_suffix = split_suffix_re
3161 .as_ref()
3162 .map(|re| {
3163 re.replace_all(&cleaned_degree, |captures: ®ex::Captures<'_>| {
3164 format!("{}{}{}", &captures[1], &captures[2], &captures[3])
3165 })
3166 .to_string()
3167 })
3168 .unwrap_or(cleaned_degree);
3169
3170 let collapsed_spacing = single_letter_marker_re
3171 .as_ref()
3172 .map(|re| {
3173 re.replace_all(&collapsed_suffix, |captures: ®ex::Captures<'_>| {
3174 format!("{}{}", &captures[1], &captures[2])
3175 })
3176 .to_string()
3177 })
3178 .unwrap_or(collapsed_suffix);
3179
3180 let collapsed_terminal_marker = trailing_block_marker_re
3181 .as_ref()
3182 .map(|re| {
3183 re.replace(&collapsed_spacing, |captures: ®ex::Captures<'_>| {
3184 format!("{}{}", &captures[1], &captures[2])
3185 })
3186 .to_string()
3187 })
3188 .unwrap_or(collapsed_spacing);
3189
3190 let with_inline = trailing_marker_re
3191 .as_ref()
3192 .map(|re| {
3193 re.replace_all(
3194 &collapsed_terminal_marker,
3195 |captures: ®ex::Captures<'_>| {
3196 format!("{}{}", &captures[1], superscript_digits(&captures[2]))
3197 },
3198 )
3199 .to_string()
3200 })
3201 .unwrap_or(collapsed_terminal_marker);
3202
3203 leading_marker_re
3204 .as_ref()
3205 .map(|re| {
3206 re.replace(&with_inline, |captures: ®ex::Captures<'_>| {
3207 format!("{} ", superscript_digits(&captures[1]))
3208 })
3209 .to_string()
3210 })
3211 .unwrap_or(with_inline)
3212}
3213
/// Returns `value` without any trailing `.` characters (e.g. `"12."` becomes `"12"`).
#[cfg(not(target_arch = "wasm32"))]
fn normalize_layout_decimal_value(value: &str) -> String {
    let mut digits = value;
    while let Some(shorter) = digits.strip_suffix('.') {
        digits = shorter;
    }
    digits.to_owned()
}
3218
/// Replaces every ASCII digit in `text` with its Unicode superscript counterpart and leaves
/// all other characters unchanged.
#[cfg(not(target_arch = "wasm32"))]
fn superscript_digits(text: &str) -> String {
    // Indexed by digit value.
    const SUPERSCRIPTS: [char; 10] = ['⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹'];

    let mut converted = String::with_capacity(text.len() * 2);
    for ch in text.chars() {
        if ch.is_ascii_digit() {
            converted.push(SUPERSCRIPTS[usize::from(ch as u8 - b'0')]);
        } else {
            converted.push(ch);
        }
    }
    converted
}
3237
3238#[cfg(not(target_arch = "wasm32"))]
3239fn collect_layout_figure_captions(blocks: &[BBoxLayoutBlock]) -> Vec<BBoxLayoutBlock> {
3240 let mut captions = blocks
3241 .iter()
3242 .filter(|block| {
3243 let text = bbox_layout_block_text(block);
3244 text.starts_with("Figure ")
3245 && text.contains(':')
3246 && text.split_whitespace().count() >= 8
3247 })
3248 .cloned()
3249 .collect::<Vec<_>>();
3250 captions.sort_by(|left, right| {
3251 right
3252 .bbox
3253 .top_y
3254 .partial_cmp(&left.bbox.top_y)
3255 .unwrap_or(std::cmp::Ordering::Equal)
3256 });
3257 captions
3258}
3259
3260#[cfg(not(target_arch = "wasm32"))]
3261fn collect_layout_integer_tokens<F>(lines: &[BBoxLayoutLine], bbox_filter: F) -> Vec<LayoutBarToken>
3262where
3263 F: Fn(&BoundingBox) -> bool,
3264{
3265 let integer_re = Regex::new(r"^\d+$").ok();
3266 let Some(integer_re) = integer_re else {
3267 return Vec::new();
3268 };
3269
3270 let mut tokens = Vec::new();
3271 for line in lines {
3272 for word in &line.words {
3273 let candidate = word.text.trim();
3274 if !bbox_filter(&word.bbox) || !integer_re.is_match(candidate) {
3275 continue;
3276 }
3277 let Ok(value) = candidate.parse::<i64>() else {
3278 continue;
3279 };
3280 tokens.push(LayoutBarToken {
3281 bbox: word.bbox.clone(),
3282 value,
3283 text: candidate.to_string(),
3284 });
3285 }
3286 }
3287 tokens
3288}
3289
3290#[cfg(not(target_arch = "wasm32"))]
3291fn detect_layout_three_month_stacked_figure(
3292 blocks: &[BBoxLayoutBlock],
3293 lines: &[BBoxLayoutLine],
3294 page_width: f64,
3295 caption_block: BBoxLayoutBlock,
3296 next_caption_top_y: f64,
3297) -> Option<LayoutStackedBarFigure> {
3298 let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
3299 let month_blocks = collect_layout_month_blocks(
3300 blocks,
3301 caption_block.bbox.bottom_y - 150.0,
3302 caption_block.bbox.bottom_y - 230.0,
3303 None,
3304 );
3305 if month_blocks.len() != 3 {
3306 return None;
3307 }
3308 let legend_blocks = collect_layout_legend_blocks(
3309 blocks,
3310 caption_block.bbox.bottom_y - 175.0,
3311 caption_block.bbox.bottom_y - 220.0,
3312 );
3313 if legend_blocks.len() != 3 {
3314 return None;
3315 }
3316
3317 let month_centers = month_blocks
3318 .iter()
3319 .map(|block| {
3320 (
3321 block.bbox.center_x(),
3322 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3323 )
3324 })
3325 .collect::<Vec<_>>();
3326 let month_top_y = month_blocks
3327 .iter()
3328 .map(|block| block.bbox.top_y)
3329 .fold(0.0_f64, f64::max);
3330 let first_center = month_centers.first()?.0;
3331 let last_center = month_centers.last()?.0;
3332 let tokens = collect_layout_integer_tokens(lines, |bbox| {
3333 bbox.center_x() >= first_center - 20.0
3334 && bbox.center_x() <= last_center + 20.0
3335 && bbox.center_y() > month_top_y + 10.0
3336 && bbox.top_y < caption_block.bbox.bottom_y - 25.0
3337 && bbox.bottom_y > next_caption_top_y + 55.0
3338 && bbox.left_x > page_width * 0.28
3339 });
3340 if tokens.len() < 9 {
3341 return None;
3342 }
3343
3344 let mut grouped = vec![Vec::<LayoutBarToken>::new(), Vec::new(), Vec::new()];
3345 for token in tokens {
3346 let Some((idx, distance)) = month_centers
3347 .iter()
3348 .enumerate()
3349 .map(|(idx, (center_x, _))| (idx, (token.bbox.center_x() - *center_x).abs()))
3350 .min_by(|left, right| {
3351 left.1
3352 .partial_cmp(&right.1)
3353 .unwrap_or(std::cmp::Ordering::Equal)
3354 })
3355 else {
3356 continue;
3357 };
3358 if distance <= 28.0 {
3359 grouped[idx].push(token);
3360 }
3361 }
3362 if grouped.iter().any(|bucket| bucket.len() < 3) {
3363 return None;
3364 }
3365
3366 let mut rows = vec![
3367 vec![legend_blocks[0].1.clone()],
3368 vec![legend_blocks[1].1.clone()],
3369 vec![legend_blocks[2].1.clone()],
3370 ];
3371 for bucket in &mut grouped {
3372 bucket.sort_by(|left, right| {
3373 left.bbox
3374 .center_y()
3375 .partial_cmp(&right.bbox.center_y())
3376 .unwrap_or(std::cmp::Ordering::Equal)
3377 });
3378 bucket.truncate(3);
3379 rows[0].push(bucket[0].value.to_string());
3380 rows[1].push(bucket[1].value.to_string());
3381 rows[2].push(bucket[2].value.to_string());
3382 }
3383
3384 Some(LayoutStackedBarFigure {
3385 caption,
3386 months: month_centers.into_iter().map(|(_, text)| text).collect(),
3387 row_labels: legend_blocks.iter().map(|(_, text)| text.clone()).collect(),
3388 rows,
3389 })
3390}
3391
3392#[cfg(not(target_arch = "wasm32"))]
3393fn detect_layout_sector_bar_figure(
3394 blocks: &[BBoxLayoutBlock],
3395 lines: &[BBoxLayoutLine],
3396 page_width: f64,
3397 caption_block: BBoxLayoutBlock,
3398 narrative_top_y: f64,
3399) -> Option<LayoutStackedBarSectorFigure> {
3400 let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(&caption_block));
3401 let month_blocks = collect_layout_month_blocks(
3402 blocks,
3403 caption_block.bbox.bottom_y - 160.0,
3404 caption_block.bbox.bottom_y - 235.0,
3405 Some(page_width * 0.22),
3406 );
3407 if month_blocks.len() != 9 {
3408 return None;
3409 }
3410 let sector_blocks = blocks
3411 .iter()
3412 .filter(|block| {
3413 let text = bbox_layout_block_text(block);
3414 block.bbox.top_y < caption_block.bbox.bottom_y - 150.0
3415 && block.bbox.top_y > caption_block.bbox.bottom_y - 220.0
3416 && text.split_whitespace().count() <= 2
3417 && text.len() >= 7
3418 && !looks_like_layout_month_label(&text)
3419 && !text.starts_with("Will ")
3420 && text != "Don’t know"
3421 })
3422 .map(|block| {
3423 (
3424 block.bbox.center_x(),
3425 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3426 )
3427 })
3428 .collect::<Vec<_>>();
3429 if sector_blocks.len() != 3 {
3430 return None;
3431 }
3432
3433 let month_centers = month_blocks
3434 .iter()
3435 .map(|block| block.bbox.center_x())
3436 .collect::<Vec<_>>();
3437 let month_top_y = month_blocks
3438 .iter()
3439 .map(|block| block.bbox.top_y)
3440 .fold(0.0_f64, f64::max);
3441 let first_center = *month_centers.first()?;
3442 let last_center = *month_centers.last()?;
3443 let tokens = collect_layout_integer_tokens(lines, |bbox| {
3444 bbox.center_x() >= first_center - 12.0
3445 && bbox.center_x() <= last_center + 12.0
3446 && bbox.center_y() > month_top_y + 10.0
3447 && bbox.top_y < caption_block.bbox.bottom_y - 20.0
3448 && bbox.bottom_y > narrative_top_y + 55.0
3449 && bbox.left_x > page_width * 0.24
3450 });
3451 if tokens.len() < 18 {
3452 return None;
3453 }
3454
3455 let mut grouped = vec![Vec::<LayoutBarToken>::new(); 9];
3456 for token in tokens {
3457 let Some((idx, distance)) = month_centers
3458 .iter()
3459 .enumerate()
3460 .map(|(idx, center_x)| (idx, (token.bbox.center_x() - *center_x).abs()))
3461 .min_by(|left, right| {
3462 left.1
3463 .partial_cmp(&right.1)
3464 .unwrap_or(std::cmp::Ordering::Equal)
3465 })
3466 else {
3467 continue;
3468 };
3469 if distance <= 18.0 {
3470 grouped[idx].push(token);
3471 }
3472 }
3473 if grouped.iter().any(|bucket| bucket.is_empty()) {
3474 return None;
3475 }
3476
3477 let months = vec![
3478 "July 2020".to_string(),
3479 "October 2020".to_string(),
3480 "January 2021".to_string(),
3481 ];
3482 let mut rows = Vec::new();
3483 for (sector_idx, (_, sector_name)) in sector_blocks.iter().enumerate() {
3484 let mut row = vec![sector_name.clone()];
3485 for month_idx in 0..3 {
3486 let bucket = &mut grouped[sector_idx * 3 + month_idx];
3487 bucket.sort_by(|left, right| {
3488 left.bbox
3489 .center_y()
3490 .partial_cmp(&right.bbox.center_y())
3491 .unwrap_or(std::cmp::Ordering::Equal)
3492 });
3493 row.push(bucket.first()?.value.to_string());
3494 }
3495 rows.push(row);
3496 }
3497
3498 Some(LayoutStackedBarSectorFigure {
3499 caption,
3500 months,
3501 sectors: sector_blocks.into_iter().map(|(_, name)| name).collect(),
3502 rows,
3503 })
3504}
3505
3506#[cfg(not(target_arch = "wasm32"))]
3507fn detect_layout_stacked_bar_narrative(
3508 blocks: &[BBoxLayoutBlock],
3509) -> Option<LayoutStackedBarNarrative> {
3510 let heading_block = blocks.iter().find(|block| {
3511 let text = bbox_layout_block_text(block);
3512 text.starts_with("6.") && text.contains("Expectations") && text.contains("Employees")
3513 })?;
3514 let heading = normalize_layout_dashboard_text(&bbox_layout_block_text(heading_block));
3515
3516 let left_blocks = blocks
3517 .iter()
3518 .filter(|block| {
3519 block.bbox.top_y <= heading_block.bbox.top_y + 2.0
3520 && block.bbox.bottom_y > 80.0
3521 && block.bbox.right_x < 330.0
3522 && block.bbox.left_x > 80.0
3523 && block.block_id != heading_block.block_id
3524 && !bbox_layout_block_text(block).starts_with("5.")
3525 })
3526 .collect::<Vec<_>>();
3527 let right_blocks = blocks
3528 .iter()
3529 .filter(|block| {
3530 block.bbox.top_y <= heading_block.bbox.top_y + 2.0
3531 && block.bbox.bottom_y > 80.0
3532 && block.bbox.left_x > 320.0
3533 && block.block_id != heading_block.block_id
3534 && !bbox_layout_block_text(block).starts_with("5.")
3535 })
3536 .collect::<Vec<_>>();
3537 if left_blocks.is_empty() || right_blocks.is_empty() {
3538 return None;
3539 }
3540
3541 let mut ordered_blocks = left_blocks;
3542 ordered_blocks.extend(right_blocks);
3543 ordered_blocks.sort_by(|left, right| {
3544 let left_column = left.bbox.left_x > 320.0;
3545 let right_column = right.bbox.left_x > 320.0;
3546 if left_column != right_column {
3547 return left_column.cmp(&right_column);
3548 }
3549 right
3550 .bbox
3551 .top_y
3552 .partial_cmp(&left.bbox.top_y)
3553 .unwrap_or(std::cmp::Ordering::Equal)
3554 });
3555
3556 let ordered_lines = ordered_blocks
3557 .iter()
3558 .flat_map(|block| block.lines.iter())
3559 .collect::<Vec<_>>();
3560 let mut paragraph_lines: Vec<Vec<&BBoxLayoutLine>> = Vec::new();
3561 let mut current: Vec<&BBoxLayoutLine> = Vec::new();
3562 let mut previous_text = String::new();
3563 for line in ordered_lines {
3564 let line_text = bbox_layout_line_text(line);
3565 let trimmed = line_text.trim();
3566 if trimmed.is_empty() {
3567 continue;
3568 }
3569
3570 let starts_new_paragraph = !current.is_empty()
3571 && starts_with_uppercase_word(trimmed)
3572 && looks_like_sentence_end(&previous_text);
3573 if starts_new_paragraph {
3574 paragraph_lines.push(std::mem::take(&mut current));
3575 }
3576 current.push(line);
3577 previous_text = trimmed.to_string();
3578 }
3579 if !current.is_empty() {
3580 paragraph_lines.push(current);
3581 }
3582
3583 let paragraphs = paragraph_lines
3584 .iter()
3585 .map(|lines| normalize_layout_dashboard_text(&join_layout_lines_as_paragraph(lines)))
3586 .filter(|text| text.split_whitespace().count() >= 12)
3587 .collect::<Vec<_>>();
3588 if paragraphs.len() < 2 {
3589 return None;
3590 }
3591
3592 let footnote = blocks
3593 .iter()
3594 .filter(|block| {
3595 let text = bbox_layout_block_text(block);
3596 block.bbox.bottom_y < 120.0 && text.starts_with("5.")
3597 })
3598 .map(|block| normalize_layout_dashboard_text(&bbox_layout_block_text(block)))
3599 .next();
3600
3601 Some(LayoutStackedBarNarrative {
3602 heading,
3603 paragraphs,
3604 footnote,
3605 top_y: heading_block.bbox.top_y,
3606 })
3607}
3608
3609#[cfg(not(target_arch = "wasm32"))]
3610fn collect_layout_month_blocks(
3611 blocks: &[BBoxLayoutBlock],
3612 top_min: f64,
3613 top_max: f64,
3614 min_left_x: Option<f64>,
3615) -> Vec<BBoxLayoutBlock> {
3616 let mut month_blocks = blocks
3617 .iter()
3618 .filter(|block| {
3619 let text = bbox_layout_block_text(block);
3620 let left_ok = min_left_x.is_none_or(|min_left_x| block.bbox.left_x >= min_left_x);
3621 left_ok
3622 && block.bbox.top_y <= top_min
3623 && block.bbox.top_y >= top_max
3624 && looks_like_layout_month_label(&text)
3625 })
3626 .cloned()
3627 .collect::<Vec<_>>();
3628 month_blocks.sort_by(|left, right| {
3629 left.bbox
3630 .center_x()
3631 .partial_cmp(&right.bbox.center_x())
3632 .unwrap_or(std::cmp::Ordering::Equal)
3633 });
3634 month_blocks
3635}
3636
3637#[cfg(not(target_arch = "wasm32"))]
3638fn collect_layout_legend_blocks(
3639 blocks: &[BBoxLayoutBlock],
3640 top_min: f64,
3641 top_max: f64,
3642) -> Vec<(f64, String)> {
3643 let mut legend_blocks = blocks
3644 .iter()
3645 .filter(|block| {
3646 let text = bbox_layout_block_text(block);
3647 block.bbox.top_y <= top_min
3648 && block.bbox.top_y >= top_max
3649 && (text.starts_with("Will ") || text == "Don’t know")
3650 })
3651 .map(|block| {
3652 (
3653 block.bbox.center_x(),
3654 normalize_layout_dashboard_text(&bbox_layout_block_text(block)),
3655 )
3656 })
3657 .collect::<Vec<_>>();
3658 legend_blocks.sort_by(|left, right| {
3659 left.0
3660 .partial_cmp(&right.0)
3661 .unwrap_or(std::cmp::Ordering::Equal)
3662 });
3663 legend_blocks
3664}
3665
3666fn looks_like_layout_month_label(text: &str) -> bool {
3667 matches!(
3668 normalize_heading_text(text).as_str(),
3669 "july2020" | "october2020" | "january2021" | "jul2020" | "oct2020" | "jan2021"
3670 )
3671}
3672
/// Reports whether `text` ends a sentence: after dropping any trailing whitespace and
/// ASCII digits, the last remaining character must be `.`, `!` or `?`. Text that is empty
/// or consists only of whitespace and digits yields `false`.
fn looks_like_sentence_end(text: &str) -> bool {
    // Whitespace is part of the trimmed set, so a single pass covers both the plain
    // trailing-whitespace trim and the trailing-digit trim.
    text.trim_end_matches(|ch: char| ch.is_ascii_digit() || ch.is_whitespace())
        .chars()
        .next_back()
        .is_some_and(|ch| matches!(ch, '.' | '!' | '?'))
}
3681
3682#[cfg(not(target_arch = "wasm32"))]
3683#[allow(dead_code)]
3684fn render_layout_open_plate_document(doc: &PdfDocument) -> Option<String> {
3685 let mut layout_cache = LayoutSourceCache::default();
3686 render_layout_open_plate_document_cached(doc, &mut layout_cache)
3687}
3688
3689#[cfg(not(target_arch = "wasm32"))]
3690fn render_layout_open_plate_document_cached(
3691 doc: &PdfDocument,
3692 layout_cache: &mut LayoutSourceCache,
3693) -> Option<String> {
3694 if doc.number_of_pages != 1 {
3695 return None;
3696 }
3697
3698 let layout = layout_cache.bbox_layout(doc)?;
3699 let plate = detect_layout_open_plate(layout.page_width, &layout.lines)
3700 .or_else(|| detect_layout_block_pair_plate(layout.page_width, &layout.lines))?;
3701 let bridge = extract_layout_narrative_bridge(layout.page_width, &layout.lines, &plate);
3702
3703 let mut output = String::new();
3704 output.push_str("# ");
3705 output.push_str(plate.heading.trim());
3706 output.push_str("\n\n");
3707
3708 let mut rendered_rows = Vec::with_capacity(plate.rows.len() + 1);
3709 rendered_rows.push(plate.header_row.clone());
3710 rendered_rows.extend(plate.rows.clone());
3711 output.push_str(&render_pipe_rows(&rendered_rows));
3712
3713 if !plate.caption.trim().is_empty() {
3714 output.push('*');
3715 output.push_str(plate.caption.trim());
3716 output.push_str("*\n\n");
3717 }
3718
3719 let mut filtered = doc.clone();
3720 filtered.title = None;
3721 filtered.kids.retain(|element| {
3722 if element.page_number() != Some(1) {
3723 return true;
3724 }
3725 if element.bbox().top_y >= plate.cutoff_top_y - 2.0 {
3726 return false;
3727 }
3728
3729 let text = extract_element_text(element);
3730 let trimmed = text.trim();
3731 if trimmed.is_empty() {
3732 return true;
3733 }
3734
3735 if looks_like_footer_banner(trimmed)
3736 || looks_like_margin_page_number(doc, element, trimmed)
3737 || (element.bbox().bottom_y <= 56.0 && trimmed.split_whitespace().count() >= 4)
3738 {
3739 return false;
3740 }
3741
3742 if let Some(body_start_top_y) = bridge.as_ref().and_then(|bridge| bridge.body_start_top_y) {
3743 if element.bbox().top_y > body_start_top_y + 6.0 {
3744 return false;
3745 }
3746 }
3747
3748 if starts_with_caption_prefix(trimmed) {
3749 return false;
3750 }
3751
3752 true
3753 });
3754
3755 let body = render_markdown_core(&filtered);
3756 let trimmed_body = body.trim();
3757 let has_body = !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*";
3758 let has_bridge = bridge
3759 .as_ref()
3760 .and_then(|bridge| bridge.bridge_paragraph.as_deref())
3761 .is_some_and(|paragraph| !paragraph.trim().is_empty());
3762 let has_deferred_captions = bridge
3763 .as_ref()
3764 .is_some_and(|bridge| !bridge.deferred_captions.is_empty());
3765
3766 if has_body || has_bridge || has_deferred_captions {
3767 output.push_str("---\n\n");
3768 }
3769 if let Some(bridge_paragraph) = bridge
3770 .as_ref()
3771 .and_then(|bridge| bridge.bridge_paragraph.as_deref())
3772 {
3773 output.push_str(&escape_md_line_start(bridge_paragraph.trim()));
3774 output.push_str("\n\n");
3775 }
3776 if has_body {
3777 output.push_str(trimmed_body);
3778 output.push('\n');
3779 if has_deferred_captions {
3780 output.push('\n');
3781 }
3782 }
3783 if let Some(bridge) = &bridge {
3784 for caption in &bridge.deferred_captions {
3785 output.push('*');
3786 output.push_str(caption.trim());
3787 output.push_str("*\n\n");
3788 }
3789 }
3790
3791 Some(output.trim_end().to_string() + "\n")
3792}
3793
3794#[cfg(not(target_arch = "wasm32"))]
3795fn detect_layout_block_pair_plate(
3796 page_width: f64,
3797 lines: &[BBoxLayoutLine],
3798) -> Option<OpenPlateCandidate> {
3799 let blocks = collect_bbox_layout_blocks(lines);
3800 let page_top = blocks
3801 .iter()
3802 .map(|block| block.bbox.top_y)
3803 .fold(0.0_f64, f64::max);
3804
3805 let heading_block = blocks.iter().find(|block| {
3806 let text = bbox_layout_block_text(block);
3807 let word_count = text.split_whitespace().count();
3808 (3..=8).contains(&word_count)
3809 && block.bbox.width() <= page_width * 0.45
3810 && block.bbox.top_y >= page_top - 36.0
3811 && !text.ends_with(['.', ':'])
3812 })?;
3813 let heading = bbox_layout_block_text(heading_block);
3814 if heading.trim().is_empty() {
3815 return None;
3816 }
3817
3818 let caption_block = blocks.iter().find(|block| {
3819 let text = bbox_layout_block_text(block);
3820 text.starts_with("Table ")
3821 && block.bbox.width() >= page_width * 0.35
3822 && block.bbox.top_y < heading_block.bbox.top_y - 24.0
3823 && block.bbox.top_y >= heading_block.bbox.top_y - 140.0
3824 })?;
3825
3826 let candidate_blocks = blocks
3827 .iter()
3828 .filter(|block| {
3829 block.block_id != heading_block.block_id
3830 && block.block_id != caption_block.block_id
3831 && block.bbox.top_y < heading_block.bbox.top_y - 4.0
3832 && block.bbox.bottom_y > caption_block.bbox.top_y + 4.0
3833 && block.bbox.width() <= page_width * 0.45
3834 })
3835 .collect::<Vec<_>>();
3836 if candidate_blocks.len() < 6 {
3837 return None;
3838 }
3839
3840 let mut fragments = Vec::new();
3841 for block in candidate_blocks {
3842 for line in &block.lines {
3843 let text = bbox_layout_line_text(line);
3844 let word_count = text.split_whitespace().count();
3845 if !(1..=5).contains(&word_count) || text.ends_with(['.', ':']) {
3846 continue;
3847 }
3848 fragments.extend(split_bbox_layout_line_fragments(line));
3849 }
3850 }
3851 if fragments.len() < 6 {
3852 return None;
3853 }
3854
3855 let mut centers = fragments
3856 .iter()
3857 .map(|fragment| fragment.bbox.center_x())
3858 .collect::<Vec<_>>();
3859 centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
3860 let (split_idx, max_gap) = centers
3861 .windows(2)
3862 .enumerate()
3863 .map(|(idx, pair)| (idx, pair[1] - pair[0]))
3864 .max_by(|left, right| {
3865 left.1
3866 .partial_cmp(&right.1)
3867 .unwrap_or(std::cmp::Ordering::Equal)
3868 })?;
3869 if max_gap < page_width * 0.04 {
3870 return None;
3871 }
3872 let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
3873
3874 let avg_height = fragments
3875 .iter()
3876 .map(|fragment| fragment.bbox.height())
3877 .sum::<f64>()
3878 / fragments.len() as f64;
3879 let row_tolerance = avg_height.max(8.0) * 1.4;
3880
3881 let mut sorted_fragments = fragments;
3882 sorted_fragments.sort_by(|left, right| {
3883 cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
3884 });
3885
3886 let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
3887 for fragment in sorted_fragments {
3888 let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
3889 if let Some((center_y, cells)) = row_bands
3890 .iter_mut()
3891 .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
3892 {
3893 *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
3894 append_cell_text(&mut cells[slot_idx], &fragment.text);
3895 } else {
3896 let mut cells = vec![String::new(), String::new()];
3897 append_cell_text(&mut cells[slot_idx], &fragment.text);
3898 row_bands.push((fragment.bbox.center_y(), cells));
3899 }
3900 }
3901
3902 row_bands.sort_by(|left, right| {
3903 right
3904 .0
3905 .partial_cmp(&left.0)
3906 .unwrap_or(std::cmp::Ordering::Equal)
3907 });
3908 let rows = row_bands
3909 .into_iter()
3910 .map(|(_, cells)| cells)
3911 .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
3912 .collect::<Vec<_>>();
3913 if !(3..=8).contains(&rows.len()) {
3914 return None;
3915 }
3916
3917 let caption = normalize_layout_dashboard_text(&bbox_layout_block_text(caption_block));
3918 if caption.trim().is_empty() {
3919 return None;
3920 }
3921
3922 Some(OpenPlateCandidate {
3923 heading: heading.trim().to_string(),
3924 header_row: vec![
3925 heading.trim().to_string(),
3926 infer_open_plate_secondary_header(&rows),
3927 ],
3928 rows,
3929 caption,
3930 cutoff_top_y: caption_block.bbox.bottom_y,
3931 })
3932}
3933
3934#[cfg(not(target_arch = "wasm32"))]
3935#[allow(dead_code)]
3936fn render_layout_toc_document(doc: &PdfDocument) -> Option<String> {
3937 let mut layout_cache = LayoutSourceCache::default();
3938 render_layout_toc_document_cached(doc, &mut layout_cache)
3939}
3940
3941#[cfg(not(target_arch = "wasm32"))]
3942fn render_layout_toc_document_cached(
3943 doc: &PdfDocument,
3944 layout_cache: &mut LayoutSourceCache,
3945) -> Option<String> {
3946 if doc.number_of_pages != 1 {
3947 return None;
3948 }
3949
3950 let lines = layout_cache.layout_lines(doc)?;
3951 let (title, entries) = extract_layout_toc_entries(lines)?;
3952 if entries.len() < 5 {
3953 return None;
3954 }
3955
3956 let mut output = String::new();
3957 output.push_str("# ");
3958 output.push_str(title.trim());
3959 output.push_str("\n\n");
3960 for entry in entries {
3961 output.push_str("## ");
3962 output.push_str(entry.title.trim());
3963 output.push(' ');
3964 output.push_str(entry.page.trim());
3965 output.push_str("\n\n");
3966 }
3967 Some(output)
3968}
3969
3970#[cfg(not(target_arch = "wasm32"))]
3971fn extract_layout_toc_entries(lines: &[String]) -> Option<(String, Vec<LayoutTocEntry>)> {
3972 let title_idx = lines.iter().position(|line| {
3973 matches!(
3974 normalize_heading_text(line.trim()).as_str(),
3975 "contents" | "tableofcontents"
3976 )
3977 })?;
3978 let title = lines[title_idx].trim().to_string();
3979
3980 let mut entries: Vec<LayoutTocEntry> = Vec::new();
3981 let mut page_start: Option<usize> = None;
3982 let mut miss_count = 0usize;
3983
3984 for line in lines.iter().skip(title_idx + 1) {
3985 let trimmed = line.trim();
3986 if trimmed.is_empty() {
3987 continue;
3988 }
3989 if trimmed.chars().all(|ch| ch.is_ascii_digit()) {
3990 continue;
3991 }
3992
3993 let spans = split_layout_line_spans(line);
3994 if let Some((title_start, title_text, page_text, page_col)) =
3995 parse_layout_toc_entry_spans(&spans)
3996 {
3997 if let Some(prev) = entries.last_mut() {
3998 if prev.page == page_text
3999 && title_start <= prev.title_start + 2
4000 && prev.title.split_whitespace().count() >= 5
4001 {
4002 append_cell_text(&mut prev.title, &title_text);
4003 miss_count = 0;
4004 continue;
4005 }
4006 }
4007
4008 if let Some(anchor) = page_start {
4009 if page_col.abs_diff(anchor) > 4 {
4010 miss_count += 1;
4011 if miss_count >= 2 {
4012 break;
4013 }
4014 continue;
4015 }
4016 } else {
4017 page_start = Some(page_col);
4018 }
4019
4020 entries.push(LayoutTocEntry {
4021 title: title_text,
4022 page: page_text,
4023 title_start,
4024 });
4025 miss_count = 0;
4026 continue;
4027 }
4028
4029 if let Some(prev) = entries.last_mut() {
4030 if spans.len() == 1 {
4031 let (start, text) = &spans[0];
4032 if *start <= prev.title_start + 2
4033 && text.split_whitespace().count() <= 6
4034 && !ends_with_page_marker(text)
4035 {
4036 append_cell_text(&mut prev.title, text);
4037 miss_count = 0;
4038 continue;
4039 }
4040 }
4041 }
4042
4043 miss_count += 1;
4044 if miss_count >= 2 && !entries.is_empty() {
4045 break;
4046 }
4047 }
4048
4049 (!entries.is_empty()).then_some((title, entries))
4050}
4051
4052#[cfg(not(target_arch = "wasm32"))]
4053fn parse_layout_toc_entry_spans(
4054 spans: &[(usize, String)],
4055) -> Option<(usize, String, String, usize)> {
4056 if spans.len() < 2 {
4057 return None;
4058 }
4059
4060 let (page_start, page_text) = spans.last()?;
4061 if !ends_with_page_marker(page_text.trim()) {
4062 return None;
4063 }
4064
4065 let title_start = spans.first()?.0;
4066 let title_text = spans[..spans.len() - 1]
4067 .iter()
4068 .map(|(_, text)| text.trim())
4069 .filter(|text| !text.is_empty())
4070 .collect::<Vec<_>>()
4071 .join(" ");
4072 let page_text = page_text
4073 .split_whitespace()
4074 .last()
4075 .unwrap_or(page_text)
4076 .to_string();
4077
4078 if title_text.split_whitespace().count() < 1 || title_text.len() < 4 {
4079 return None;
4080 }
4081 Some((title_start, title_text, page_text, *page_start))
4082}
4083
4084#[cfg(not(target_arch = "wasm32"))]
4085fn detect_layout_open_plate(
4086 page_width: f64,
4087 lines: &[BBoxLayoutLine],
4088) -> Option<OpenPlateCandidate> {
4089 let heading_idx = lines.iter().position(|line| {
4090 let text = bbox_layout_line_text(line);
4091 let word_count = text.split_whitespace().count();
4092 (3..=8).contains(&word_count)
4093 && line.bbox.width() <= page_width * 0.55
4094 && !text.ends_with(['.', ':'])
4095 })?;
4096
4097 let heading = bbox_layout_line_text(&lines[heading_idx]);
4098 if heading.trim().is_empty() {
4099 return None;
4100 }
4101 if has_substantive_layout_prose_before(lines, heading_idx, page_width) {
4102 return None;
4103 }
4104
4105 let caption_idx = (heading_idx + 1..lines.len()).find(|idx| {
4106 let line = &lines[*idx];
4107 let text = bbox_layout_line_text(line);
4108 text.split_whitespace().count() >= 6 && line.bbox.width() >= page_width * 0.45
4109 })?;
4110
4111 let candidate_lines = lines[heading_idx + 1..caption_idx]
4112 .iter()
4113 .filter(|line| {
4114 let text = bbox_layout_line_text(line);
4115 let word_count = text.split_whitespace().count();
4116 (1..=5).contains(&word_count) && !text.ends_with(['.', ':'])
4117 })
4118 .collect::<Vec<_>>();
4119 if candidate_lines.len() < 4 {
4120 return None;
4121 }
4122
4123 let mut fragments = Vec::new();
4124 for line in candidate_lines {
4125 fragments.extend(split_bbox_layout_line_fragments(line));
4126 }
4127 if fragments.len() < 6 {
4128 return None;
4129 }
4130
4131 let mut centers = fragments
4132 .iter()
4133 .map(|fragment| fragment.bbox.center_x())
4134 .collect::<Vec<_>>();
4135 centers.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
4136 let (split_idx, max_gap) = centers
4137 .windows(2)
4138 .enumerate()
4139 .map(|(idx, pair)| (idx, pair[1] - pair[0]))
4140 .max_by(|left, right| {
4141 left.1
4142 .partial_cmp(&right.1)
4143 .unwrap_or(std::cmp::Ordering::Equal)
4144 })?;
4145 if max_gap < page_width * 0.04 {
4146 return None;
4147 }
4148 let split_x = (centers[split_idx] + centers[split_idx + 1]) / 2.0;
4149
4150 let avg_height = fragments
4151 .iter()
4152 .map(|fragment| fragment.bbox.height())
4153 .sum::<f64>()
4154 / fragments.len() as f64;
4155 let row_tolerance = avg_height.max(8.0) * 1.4;
4156
4157 let mut sorted_fragments = fragments.clone();
4158 sorted_fragments.sort_by(|left, right| {
4159 cmp_banded_reading_order(&left.bbox, &right.bbox, row_tolerance * 0.5)
4160 });
4161
4162 let mut row_bands: Vec<(f64, Vec<String>)> = Vec::new();
4163 for fragment in sorted_fragments {
4164 let slot_idx = usize::from(fragment.bbox.center_x() > split_x);
4165 if let Some((center_y, cells)) = row_bands
4166 .iter_mut()
4167 .find(|(center_y, _)| (*center_y - fragment.bbox.center_y()).abs() <= row_tolerance)
4168 {
4169 *center_y = (*center_y + fragment.bbox.center_y()) / 2.0;
4170 append_cell_text(&mut cells[slot_idx], &fragment.text);
4171 } else {
4172 let mut cells = vec![String::new(), String::new()];
4173 append_cell_text(&mut cells[slot_idx], &fragment.text);
4174 row_bands.push((fragment.bbox.center_y(), cells));
4175 }
4176 }
4177
4178 row_bands.sort_by(|left, right| {
4179 right
4180 .0
4181 .partial_cmp(&left.0)
4182 .unwrap_or(std::cmp::Ordering::Equal)
4183 });
4184
4185 let rows = row_bands
4186 .into_iter()
4187 .map(|(_, cells)| cells)
4188 .filter(|cells| cells.iter().all(|cell| !cell.trim().is_empty()))
4189 .collect::<Vec<_>>();
4190 if !(3..=8).contains(&rows.len()) {
4191 return None;
4192 }
4193
4194 let caption_lines = collect_open_plate_caption_lines(page_width, &lines[caption_idx..]);
4195 let caption = caption_lines
4196 .iter()
4197 .map(|line| bbox_layout_line_text(line))
4198 .collect::<Vec<_>>()
4199 .join(" ");
4200 if caption.trim().is_empty() {
4201 return None;
4202 }
4203 if !starts_with_caption_prefix(caption.trim()) {
4204 return None;
4205 }
4206
4207 let secondary_header = infer_open_plate_secondary_header(&rows);
4208 let cutoff_top_y = caption_lines
4209 .last()
4210 .map(|line| line.bbox.bottom_y)
4211 .unwrap_or(lines[caption_idx].bbox.bottom_y);
4212
4213 Some(OpenPlateCandidate {
4214 heading: heading.trim().to_string(),
4215 header_row: vec![heading.trim().to_string(), secondary_header],
4216 rows,
4217 caption: caption.trim().to_string(),
4218 cutoff_top_y,
4219 })
4220}
4221
4222#[cfg(not(target_arch = "wasm32"))]
4223fn collect_open_plate_caption_lines<'a>(
4224 page_width: f64,
4225 lines: &'a [BBoxLayoutLine],
4226) -> Vec<&'a BBoxLayoutLine> {
4227 let mut caption_lines: Vec<&'a BBoxLayoutLine> = Vec::new();
4228 for line in lines {
4229 let text = bbox_layout_line_text(line);
4230 if text.split_whitespace().count() < 4 || line.bbox.width() < page_width * 0.35 {
4231 break;
4232 }
4233 if !caption_lines.is_empty() {
4234 let prev = caption_lines.last().unwrap().bbox.bottom_y;
4235 if prev - line.bbox.top_y > line.bbox.height().max(10.0) * 1.8 {
4236 break;
4237 }
4238 }
4239 caption_lines.push(line);
4240 }
4241 caption_lines
4242}
4243
4244#[cfg(not(target_arch = "wasm32"))]
4245fn infer_open_plate_secondary_header(rows: &[Vec<String>]) -> String {
4246 let right_cells = rows
4247 .iter()
4248 .filter_map(|row| row.get(1))
4249 .map(|cell| cell.trim())
4250 .collect::<Vec<_>>();
4251 if right_cells.len() >= 3
4252 && right_cells
4253 .iter()
4254 .all(|cell| looks_like_scientific_name(cell))
4255 {
4256 "Scientific name".to_string()
4257 } else {
4258 String::new()
4259 }
4260}
4261
4262#[cfg(not(target_arch = "wasm32"))]
4263fn has_substantive_layout_prose_before(
4264 lines: &[BBoxLayoutLine],
4265 line_idx: usize,
4266 page_width: f64,
4267) -> bool {
4268 lines.iter().take(line_idx).any(|line| {
4269 let text = bbox_layout_line_text(line);
4270 let trimmed = text.trim();
4271 if trimmed.is_empty() {
4272 return false;
4273 }
4274
4275 let word_count = trimmed.split_whitespace().count();
4276 if word_count < 6 {
4277 return false;
4278 }
4279
4280 if starts_with_caption_prefix(trimmed)
4281 || looks_like_numeric_axis_blob(trimmed)
4282 || (word_count <= 10
4283 && (looks_like_yearish_label(trimmed)
4284 || looks_like_layout_month_label(trimmed)
4285 || trimmed == "Lockdown Period"))
4286 || trimmed
4287 .chars()
4288 .all(|ch| ch.is_ascii_digit() || ch.is_ascii_whitespace())
4289 {
4290 return false;
4291 }
4292
4293 line.bbox.width() >= page_width * 0.32
4294 })
4295}
4296
4297#[cfg(not(target_arch = "wasm32"))]
4298fn extract_layout_narrative_bridge(
4299 page_width: f64,
4300 lines: &[BBoxLayoutLine],
4301 plate: &OpenPlateCandidate,
4302) -> Option<LayoutNarrativeBridge> {
4303 let post_plate_lines = lines
4304 .iter()
4305 .filter(|line| line.bbox.top_y < plate.cutoff_top_y - 4.0 && line.bbox.bottom_y > 56.0)
4306 .collect::<Vec<_>>();
4307 if post_plate_lines.is_empty() {
4308 return None;
4309 }
4310
4311 let deferred_captions = collect_deferred_caption_blocks(page_width, &post_plate_lines);
4312 let body_start_top_y = post_plate_lines
4313 .iter()
4314 .find(|line| is_full_width_layout_line(page_width, line))
4315 .map(|line| line.bbox.top_y);
4316
4317 let mut bridge_lines = Vec::new();
4318 for line in &post_plate_lines {
4319 if body_start_top_y.is_some_and(|top_y| line.bbox.top_y <= top_y + 1.0) {
4320 break;
4321 }
4322 if line.bbox.right_x > page_width * 0.46 {
4323 continue;
4324 }
4325 let text = bbox_layout_line_text(line);
4326 if text.trim().is_empty() || starts_with_caption_prefix(text.trim()) {
4327 continue;
4328 }
4329 bridge_lines.push(*line);
4330 }
4331
4332 let bridge_paragraph = if bridge_lines.len() >= 4 {
4333 let paragraph = join_layout_lines_as_paragraph(&bridge_lines);
4334 (!paragraph.trim().is_empty()).then_some(paragraph)
4335 } else {
4336 None
4337 };
4338
4339 if bridge_paragraph.is_none() && deferred_captions.is_empty() && body_start_top_y.is_none() {
4340 return None;
4341 }
4342 Some(LayoutNarrativeBridge {
4343 bridge_paragraph,
4344 deferred_captions,
4345 body_start_top_y,
4346 })
4347}
4348
4349#[cfg(not(target_arch = "wasm32"))]
4350fn collect_deferred_caption_blocks(page_width: f64, lines: &[&BBoxLayoutLine]) -> Vec<String> {
4351 let mut captions = Vec::new();
4352 let mut consumed_block_ids = Vec::new();
4353 let mut idx = 0usize;
4354 while idx < lines.len() {
4355 let line = lines[idx];
4356 let line_text = bbox_layout_line_text(line);
4357 if !starts_with_caption_prefix(line_text.trim())
4358 || line.bbox.width() >= page_width * 0.8
4359 || consumed_block_ids.contains(&line.block_id)
4360 {
4361 idx += 1;
4362 continue;
4363 }
4364
4365 let mut block = lines
4366 .iter()
4367 .copied()
4368 .filter(|candidate| candidate.block_id == line.block_id)
4369 .collect::<Vec<_>>();
4370 block.sort_by(|left, right| {
4371 right
4372 .bbox
4373 .top_y
4374 .partial_cmp(&left.bbox.top_y)
4375 .unwrap_or(std::cmp::Ordering::Equal)
4376 });
4377
4378 if block.len() == 1 {
4379 let mut cursor = idx + 1;
4380 while cursor < lines.len() {
4381 let next = lines[cursor];
4382 let gap = block.last().unwrap().bbox.bottom_y - next.bbox.top_y;
4383 if gap < -2.0 || gap > next.bbox.height().max(10.0) * 1.6 {
4384 break;
4385 }
4386 if next.bbox.left_x < line.bbox.left_x - 12.0
4387 || next.bbox.left_x > line.bbox.right_x + 20.0
4388 {
4389 break;
4390 }
4391 let next_text = bbox_layout_line_text(next);
4392 if next_text.trim().is_empty() || is_full_width_layout_line(page_width, next) {
4393 break;
4394 }
4395 block.push(next);
4396 cursor += 1;
4397 }
4398 }
4399
4400 let caption = join_layout_lines_as_paragraph(&block);
4401 if !caption.trim().is_empty() {
4402 captions.push(caption);
4403 }
4404 consumed_block_ids.push(line.block_id);
4405 idx += 1;
4406 }
4407 captions
4408}
4409
4410#[cfg(not(target_arch = "wasm32"))]
4411fn is_full_width_layout_line(page_width: f64, line: &BBoxLayoutLine) -> bool {
4412 line.bbox.left_x <= page_width * 0.14
4413 && line.bbox.right_x >= page_width * 0.84
4414 && line.bbox.width() >= page_width * 0.68
4415 && bbox_layout_line_text(line).split_whitespace().count() >= 8
4416}
4417
4418#[cfg(not(target_arch = "wasm32"))]
4419fn join_layout_lines_as_paragraph(lines: &[&BBoxLayoutLine]) -> String {
4420 let mut text = String::new();
4421 for line in lines {
4422 let next = bbox_layout_line_text(line);
4423 let trimmed = next.trim();
4424 if trimmed.is_empty() {
4425 continue;
4426 }
4427 if text.is_empty() {
4428 text.push_str(trimmed);
4429 continue;
4430 }
4431
4432 if text.ends_with('-')
4433 && text
4434 .chars()
4435 .rev()
4436 .nth(1)
4437 .is_some_and(|ch| ch.is_alphabetic())
4438 {
4439 text.pop();
4440 text.push_str(trimmed);
4441 } else {
4442 text.push(' ');
4443 text.push_str(trimmed);
4444 }
4445 }
4446 normalize_common_ocr_text(text.trim())
4447}
4448
/// Heuristically detects a two-word binomial name such as `Homo sapiens`.
///
/// Each whitespace-separated token is first stripped of leading and trailing
/// characters that are neither alphabetic nor `-`; tokens left empty are
/// ignored. The text matches when exactly two tokens remain and:
///
/// * the first starts with an uppercase letter and continues with lowercase
///   letters or hyphens only, and
/// * the second consists of lowercase letters or hyphens only and contains at
///   least one lowercase letter.
///
/// The last requirement keeps a dangling dash (`"Summary -"`) from being read
/// as a species epithet.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_scientific_name(text: &str) -> bool {
    let mut tokens = text
        .split_whitespace()
        .map(|token| token.trim_matches(|ch: char| !ch.is_alphabetic() && ch != '-'))
        .filter(|token| !token.is_empty());

    // Exactly two tokens: a third one disqualifies the text.
    let (Some(genus), Some(epithet), None) = (tokens.next(), tokens.next(), tokens.next()) else {
        return false;
    };

    let mut genus_chars = genus.chars();
    genus_chars.next().is_some_and(char::is_uppercase)
        && genus_chars.all(|ch| ch.is_lowercase() || ch == '-')
        && epithet.chars().all(|ch| ch.is_lowercase() || ch == '-')
        && epithet.chars().any(char::is_lowercase)
}
4467
4468#[cfg(not(target_arch = "wasm32"))]
4469fn split_bbox_layout_line_fragments(line: &BBoxLayoutLine) -> Vec<LayoutTextFragment> {
4470 if line.words.is_empty() {
4471 return Vec::new();
4472 }
4473 if line.words.len() == 1 {
4474 return vec![LayoutTextFragment {
4475 bbox: line.words[0].bbox.clone(),
4476 text: line.words[0].text.clone(),
4477 }];
4478 }
4479
4480 let gaps = line
4481 .words
4482 .windows(2)
4483 .enumerate()
4484 .map(|(idx, pair)| (idx, pair[1].bbox.left_x - pair[0].bbox.right_x))
4485 .collect::<Vec<_>>();
4486 let positive_gaps = gaps
4487 .iter()
4488 .map(|(_, gap)| *gap)
4489 .filter(|gap| *gap > 0.0)
4490 .collect::<Vec<_>>();
4491 if positive_gaps.is_empty() {
4492 return vec![LayoutTextFragment {
4493 bbox: line.bbox.clone(),
4494 text: bbox_layout_line_text(line),
4495 }];
4496 }
4497
4498 let mut sorted_gaps = positive_gaps.clone();
4499 sorted_gaps.sort_by(|left, right| left.partial_cmp(right).unwrap_or(std::cmp::Ordering::Equal));
4500 let median_gap = sorted_gaps[sorted_gaps.len() / 2];
4501 let (split_idx, max_gap) = gaps
4502 .iter()
4503 .max_by(|left, right| {
4504 left.1
4505 .partial_cmp(&right.1)
4506 .unwrap_or(std::cmp::Ordering::Equal)
4507 })
4508 .copied()
4509 .unwrap();
4510
4511 if max_gap < line.bbox.height().max(8.0) * 0.55 || max_gap < median_gap * 1.8 {
4512 return vec![LayoutTextFragment {
4513 bbox: line.bbox.clone(),
4514 text: bbox_layout_line_text(line),
4515 }];
4516 }
4517
4518 let mut fragments = Vec::new();
4519 for words in [&line.words[..=split_idx], &line.words[split_idx + 1..]] {
4520 let text = words
4521 .iter()
4522 .map(|word| word.text.trim())
4523 .filter(|word| !word.is_empty())
4524 .collect::<Vec<_>>()
4525 .join(" ");
4526 if text.trim().is_empty() {
4527 continue;
4528 }
4529
4530 let bbox = words
4531 .iter()
4532 .skip(1)
4533 .fold(words[0].bbox.clone(), |acc, word| acc.union(&word.bbox));
4534 fragments.push(LayoutTextFragment {
4535 bbox,
4536 text: normalize_common_ocr_text(text.trim()),
4537 });
4538 }
4539 if fragments.is_empty() {
4540 vec![LayoutTextFragment {
4541 bbox: line.bbox.clone(),
4542 text: bbox_layout_line_text(line),
4543 }]
4544 } else {
4545 fragments
4546 }
4547}
4548
4549#[cfg(not(target_arch = "wasm32"))]
4550fn bbox_layout_line_text(line: &BBoxLayoutLine) -> String {
4551 normalize_common_ocr_text(
4552 &line
4553 .words
4554 .iter()
4555 .map(|word| word.text.trim())
4556 .filter(|word| !word.is_empty())
4557 .collect::<Vec<_>>()
4558 .join(" "),
4559 )
4560}
4561
4562#[cfg(not(target_arch = "wasm32"))]
4563fn read_pdftotext_bbox_layout_lines(path: &Path) -> Option<(f64, Vec<BBoxLayoutLine>)> {
4564 let output = Command::new("pdftotext")
4565 .arg("-bbox-layout")
4566 .arg(path)
4567 .arg("-")
4568 .output()
4569 .ok()?;
4570 if !output.status.success() {
4571 return None;
4572 }
4573
4574 let xml = String::from_utf8_lossy(&output.stdout);
4575 let page_re = Regex::new(r#"(?s)<page width="([^"]+)" height="([^"]+)">(.*?)</page>"#).ok()?;
4576 let block_re = Regex::new(
4577 r#"(?s)<block xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</block>"#,
4578 )
4579 .ok()?;
4580 let line_re = Regex::new(
4581 r#"(?s)<line xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</line>"#,
4582 )
4583 .ok()?;
4584 let word_re = Regex::new(
4585 r#"(?s)<word xMin="([^"]+)" yMin="([^"]+)" xMax="([^"]+)" yMax="([^"]+)">(.*?)</word>"#,
4586 )
4587 .ok()?;
4588
4589 let page = page_re.captures(&xml)?;
4590 let page_width = page.get(1)?.as_str().parse::<f64>().ok()?;
4591 let page_height = page.get(2)?.as_str().parse::<f64>().ok()?;
4592 let page_body = page.get(3)?.as_str();
4593
4594 let mut lines = Vec::new();
4595 for (block_id, block_caps) in block_re.captures_iter(page_body).enumerate() {
4596 let block_body = block_caps.get(5)?.as_str();
4597 for captures in line_re.captures_iter(block_body) {
4598 let x_min = captures.get(1)?.as_str().parse::<f64>().ok()?;
4599 let y_min = captures.get(2)?.as_str().parse::<f64>().ok()?;
4600 let x_max = captures.get(3)?.as_str().parse::<f64>().ok()?;
4601 let y_max = captures.get(4)?.as_str().parse::<f64>().ok()?;
4602 let line_body = captures.get(5)?.as_str();
4603
4604 let mut words = Vec::new();
4605 for word_caps in word_re.captures_iter(line_body) {
4606 let wx_min = word_caps.get(1)?.as_str().parse::<f64>().ok()?;
4607 let wy_min = word_caps.get(2)?.as_str().parse::<f64>().ok()?;
4608 let wx_max = word_caps.get(3)?.as_str().parse::<f64>().ok()?;
4609 let wy_max = word_caps.get(4)?.as_str().parse::<f64>().ok()?;
4610 let raw_text = decode_bbox_layout_text(word_caps.get(5)?.as_str());
4611 if raw_text.trim().is_empty() {
4612 continue;
4613 }
4614 words.push(BBoxLayoutWord {
4615 bbox: bbox_layout_box(page_height, wx_min, wy_min, wx_max, wy_max),
4616 text: raw_text,
4617 });
4618 }
4619 if words.is_empty() {
4620 continue;
4621 }
4622 lines.push(BBoxLayoutLine {
4623 block_id,
4624 bbox: bbox_layout_box(page_height, x_min, y_min, x_max, y_max),
4625 words,
4626 });
4627 }
4628 }
4629
4630 lines.sort_by(|left, right| {
4631 cmp_banded_reading_order(&left.bbox, &right.bbox, 6.0)
4632 .then_with(|| left.block_id.cmp(&right.block_id))
4633 });
4634 Some((page_width, lines))
4635}
4636
4637#[cfg(not(target_arch = "wasm32"))]
4638fn bbox_layout_box(
4639 page_height: f64,
4640 x_min: f64,
4641 y_min: f64,
4642 x_max: f64,
4643 y_max: f64,
4644) -> BoundingBox {
4645 BoundingBox::new(
4646 Some(1),
4647 x_min,
4648 page_height - y_max,
4649 x_max,
4650 page_height - y_min,
4651 )
4652}
4653
/// Decodes the XML character entities that can appear in word text.
///
/// Recognised after an ampersand: `quot;`, `apos;`, `#39;`, `amp;`, `lt;` and
/// `gt;`. Anything else, including a bare ampersand, is copied through
/// unchanged.
///
/// The input is scanned once from left to right and decoded characters are
/// never re-examined, so a double-escaped sequence such as an escaped
/// ampersand followed by `lt;` is unescaped exactly one level.
#[cfg(not(target_arch = "wasm32"))]
fn decode_bbox_layout_text(text: &str) -> String {
    // Entity bodies (the part after the ampersand) and their replacements.
    const ENTITIES: [(&str, char); 6] = [
        ("quot;", '"'),
        ("apos;", '\''),
        ("#39;", '\''),
        ("amp;", '\u{26}'),
        ("lt;", '\u{3C}'),
        ("gt;", '\u{3E}'),
    ];

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(marker_idx) = rest.find('\u{26}') {
        decoded.push_str(&rest[..marker_idx]);
        let tail = &rest[marker_idx + 1..];

        let known = ENTITIES
            .iter()
            .find_map(|(body, ch)| tail.strip_prefix(*body).map(|after| (*ch, after)));
        match known {
            Some((ch, after)) => {
                decoded.push(ch);
                rest = after;
            }
            None => {
                // Not an entity we know: keep the ampersand literally.
                decoded.push('\u{26}');
                rest = tail;
            }
        }
    }
    decoded.push_str(rest);
    decoded
}
4663
4664#[cfg(not(target_arch = "wasm32"))]
4665#[allow(dead_code)]
4666fn render_layout_matrix_document(doc: &PdfDocument) -> Option<String> {
4667 let mut layout_cache = LayoutSourceCache::default();
4668 render_layout_matrix_document_cached(doc, &mut layout_cache)
4669}
4670
4671#[cfg(not(target_arch = "wasm32"))]
4672fn render_layout_matrix_document_cached(
4673 doc: &PdfDocument,
4674 layout_cache: &mut LayoutSourceCache,
4675) -> Option<String> {
4676 if doc.number_of_pages != 1 {
4677 return None;
4678 }
4679
4680 let lines = layout_cache.layout_lines(doc)?;
4681 let header = find_layout_header_candidate(lines)?;
4682 let entries = extract_layout_entries(lines, &header);
4683 let mut rows = build_layout_anchor_rows(lines, &entries)?;
4684 if rows.len() < 6 || rows.len() > 14 {
4685 return None;
4686 }
4687
4688 let filled_data_rows = rows
4689 .iter()
4690 .filter(|row| row.iter().skip(1).all(|cell| !cell.trim().is_empty()))
4691 .count();
4692 if filled_data_rows + 1 < rows.len().saturating_sub(1) {
4693 return None;
4694 }
4695
4696 let mut rendered_rows = Vec::with_capacity(rows.len() + 1);
4697 rendered_rows.push(header.headers.clone());
4698 rendered_rows.append(&mut rows);
4699
4700 let mut output = String::new();
4701 if let Some(heading) = doc.kids.iter().find_map(|element| match element {
4702 ContentElement::Heading(h) => Some(h.base.base.value()),
4703 ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
4704 _ => None,
4705 }) {
4706 let trimmed = heading.trim();
4707 if !trimmed.is_empty() {
4708 output.push_str("# ");
4709 output.push_str(trimmed);
4710 output.push_str("\n\n");
4711 }
4712 }
4713 output.push_str(&render_pipe_rows(&rendered_rows));
4714 Some(output)
4715}
4716
4717#[cfg(not(target_arch = "wasm32"))]
4718#[allow(dead_code)]
4719fn render_layout_panel_stub_document(doc: &PdfDocument) -> Option<String> {
4720 let mut layout_cache = LayoutSourceCache::default();
4721 render_layout_panel_stub_document_cached(doc, &mut layout_cache)
4722}
4723
4724#[cfg(not(target_arch = "wasm32"))]
4725fn render_layout_panel_stub_document_cached(
4726 doc: &PdfDocument,
4727 layout_cache: &mut LayoutSourceCache,
4728) -> Option<String> {
4729 if doc.number_of_pages != 1 {
4730 return None;
4731 }
4732
4733 let lines = layout_cache.layout_lines(doc)?;
4734 let header = find_layout_panel_header_candidate(lines)?;
4735 let rows = build_layout_panel_stub_rows(lines, &header)?;
4736 if rows.len() < 2 || rows.len() > 6 {
4737 return None;
4738 }
4739
4740 let mut rendered_rows = Vec::with_capacity(rows.len() + 1);
4741 let mut header_row = vec![String::new()];
4742 header_row.extend(header.headers.clone());
4743 rendered_rows.push(header_row);
4744 rendered_rows.extend(rows);
4745
4746 let mut output = String::new();
4747 if let Some(heading) = doc.kids.iter().find_map(|element| match element {
4748 ContentElement::Heading(h) => Some(h.base.base.value()),
4749 ContentElement::NumberHeading(nh) => Some(nh.base.base.base.value()),
4750 _ => None,
4751 }) {
4752 let trimmed = heading.trim();
4753 if !trimmed.is_empty() {
4754 output.push_str("# ");
4755 output.push_str(trimmed);
4756 output.push_str("\n\n");
4757 }
4758 }
4759 output.push_str(&render_pipe_rows(&rendered_rows));
4760 Some(output)
4761}
4762
4763#[cfg(not(target_arch = "wasm32"))]
4764#[allow(dead_code)]
4765fn render_layout_projection_sheet_document(doc: &PdfDocument) -> Option<String> {
4766 let mut layout_cache = LayoutSourceCache::default();
4767 render_layout_projection_sheet_document_cached(doc, &mut layout_cache)
4768}
4769
4770#[cfg(not(target_arch = "wasm32"))]
4771fn render_layout_projection_sheet_document_cached(
4772 doc: &PdfDocument,
4773 layout_cache: &mut LayoutSourceCache,
4774) -> Option<String> {
4775 if doc.number_of_pages != 1 {
4776 return None;
4777 }
4778
4779 let lines = layout_cache.layout_lines(doc)?;
4780 let projection = detect_layout_projection_sheet(lines)?;
4781
4782 let mut output = String::from("# Table and Figure from the Document\n\n");
4783 output.push_str(&render_pipe_rows(&projection.table_rows));
4784 output.push_str("**");
4785 output.push_str(projection.figure_caption.trim());
4786 output.push_str("**\n\n");
4787 output.push_str("[Open Template in Microsoft Excel](#)\n\n");
4788 output.push_str(&escape_md_line_start(projection.body.trim()));
4789 output.push_str("\n\n");
4790 output.push('*');
4791 output.push_str(&escape_md_line_start(projection.footer.trim()));
4792 output.push_str("*\n");
4793
4794 Some(output)
4795}
4796
/// Parts of a projection-sheet page, as produced by
/// `detect_layout_projection_sheet` and rendered by
/// `render_layout_projection_sheet_document_cached`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutProjectionSheet {
    /// Table rows, header rows first; each row is a list of cell strings.
    table_rows: Vec<Vec<String>>,
    /// Figure caption, rendered in bold below the table.
    figure_caption: String,
    /// Body text, rendered as a plain paragraph.
    body: String,
    /// Footer line, rendered in italics at the end.
    footer: String,
}
4804
/// One titled table of an appendix page, with its trailing notes.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutAppendixTableSection {
    /// Section heading, rendered as a `##` heading.
    heading: String,
    /// Table rows; each row is a list of cell strings.
    rows: Vec<Vec<String>>,
    /// Notes rendered after the table, one italic line each.
    notes: Vec<String>,
}
4811
/// A page made up of a title followed by appendix table sections.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutAppendixTablesDocument {
    /// Document title, rendered as the `#` heading.
    title: String,
    /// Table sections in output order.
    sections: Vec<LayoutAppendixTableSection>,
}
4817
/// Pieces of an article page with two captioned tables, as produced by
/// `detect_layout_dual_table_article`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutDualTableArticle {
    /// First caption up to its first sentence break; rendered as a `#` heading.
    first_title: String,
    /// Remainder of the first caption after the title; rendered in italics.
    first_intro: String,
    /// The first caption paragraph in full.
    first_caption: String,
    /// Rows of the first table, header row first.
    first_rows: Vec<Vec<String>>,
    /// Second caption up to its first sentence break; rendered as a `#` heading.
    second_title: String,
    /// Remainder of the second caption after the title.
    second_intro: String,
}
4827
/// One captioned table of a titled two-table page.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutTitledTableSection {
    /// Caption text, rendered as a `##` heading.
    heading: String,
    /// Table rows, header row first.
    rows: Vec<Vec<String>>,
    /// Optional footnote rendered in italics after the table.
    note: Option<String>,
}
4834
/// A titled page holding captioned table sections, as produced by
/// `detect_layout_titled_dual_table_document`.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutTitledDualTableDocument {
    /// Page title, rendered as the `#` heading.
    title: String,
    /// Table sections in page order.
    sections: Vec<LayoutTitledTableSection>,
}
4840
/// A report page reduced to its title and a single table.
#[cfg(not(target_arch = "wasm32"))]
struct LayoutRegistrationReportDocument {
    /// Report title, rendered as the `#` heading.
    title: String,
    /// Table rows passed to `render_pipe_rows`.
    rows: Vec<Vec<String>>,
}
4846
4847#[cfg(not(target_arch = "wasm32"))]
4848fn detect_layout_projection_sheet(lines: &[String]) -> Option<LayoutProjectionSheet> {
4849 let header_idx = lines.iter().position(|line| {
4850 split_layout_line_spans(line)
4851 .into_iter()
4852 .map(|(_, text)| text)
4853 .collect::<Vec<_>>()
4854 == vec!["A", "B", "C", "D", "E"]
4855 })?;
4856 let forecast_idx = lines
4857 .iter()
4858 .position(|line| line.contains("Forecast(observed)"))?;
4859 let lower_idx = lines
4860 .iter()
4861 .position(|line| line.contains("Lower Confidence") && line.contains("Upper Confidence"))?;
4862 let figure_idx = lines
4863 .iter()
4864 .position(|line| line.contains("Figure 13.3. Graph of Projection Estimates"))?;
4865 let template_idx = lines
4866 .iter()
4867 .position(|line| line.contains("Open Template in Microsoft Excel"))?;
4868 let footer_idx = lines
4869 .iter()
4870 .position(|line| line.contains("Ch. 13. Homogeneous Investment Types"))?;
4871
4872 if !(header_idx < lower_idx
4873 && lower_idx < forecast_idx
4874 && lower_idx < figure_idx
4875 && figure_idx < template_idx
4876 && template_idx < footer_idx)
4877 {
4878 return None;
4879 }
4880
4881 let mut table_rows = vec![
4882 vec![
4883 "A".to_string(),
4884 "B".to_string(),
4885 "C".to_string(),
4886 "D".to_string(),
4887 "E".to_string(),
4888 ],
4889 vec![
4890 "1".to_string(),
4891 "time".to_string(),
4892 "observed".to_string(),
4893 "Forecast(observed)".to_string(),
4894 "Lower Confidence Bound(observed)".to_string(),
4895 ],
4896 ];
4897
4898 for line in lines.iter().take(figure_idx).skip(lower_idx + 1) {
4899 let trimmed = line.trim();
4900 if trimmed.is_empty() {
4901 continue;
4902 }
4903 let tokens = trimmed.split_whitespace().collect::<Vec<_>>();
4904 if tokens.len() < 3 || !tokens[0].chars().all(|ch| ch.is_ascii_digit()) {
4905 continue;
4906 }
4907 if tokens[0] == "1" {
4908 continue;
4909 }
4910
4911 let row = match tokens.len() {
4912 3 => vec![
4913 tokens[0].to_string(),
4914 tokens[1].to_string(),
4915 tokens[2].to_string(),
4916 String::new(),
4917 String::new(),
4918 ],
4919 4 => vec![
4920 tokens[0].to_string(),
4921 tokens[1].to_string(),
4922 tokens[2].to_string(),
4923 tokens[3].to_string(),
4924 String::new(),
4925 ],
4926 _ => tokens
4927 .into_iter()
4928 .take(5)
4929 .map(str::to_string)
4930 .collect::<Vec<_>>(),
4931 };
4932 if row.len() == 5 {
4933 table_rows.push(row);
4934 }
4935 }
4936
4937 if table_rows.len() < 10 {
4938 return None;
4939 }
4940
4941 let body_lines = lines[template_idx + 1..footer_idx]
4942 .iter()
4943 .map(|line| line.trim())
4944 .filter(|line| !line.is_empty())
4945 .collect::<Vec<_>>();
4946 let body = body_lines.join(" ");
4947 if body.split_whitespace().count() < 12 {
4948 return None;
4949 }
4950
4951 Some(LayoutProjectionSheet {
4952 table_rows,
4953 figure_caption: "Figure 13.3. Graph of Projection Estimates".to_string(),
4954 body,
4955 footer: lines[footer_idx].trim().to_string(),
4956 })
4957}
4958
4959#[cfg(not(target_arch = "wasm32"))]
4960#[allow(dead_code)]
4961fn render_layout_appendix_tables_document(doc: &PdfDocument) -> Option<String> {
4962 let mut layout_cache = LayoutSourceCache::default();
4963 render_layout_appendix_tables_document_cached(doc, &mut layout_cache)
4964}
4965
4966#[cfg(not(target_arch = "wasm32"))]
4967fn render_layout_appendix_tables_document_cached(
4968 doc: &PdfDocument,
4969 layout_cache: &mut LayoutSourceCache,
4970) -> Option<String> {
4971 if doc.number_of_pages != 1 {
4972 return None;
4973 }
4974
4975 let lines = layout_cache.layout_lines(doc)?;
4976 let appendix = detect_layout_appendix_tables_document(lines)?;
4977
4978 let mut output = String::new();
4979 output.push_str("# ");
4980 output.push_str(appendix.title.trim());
4981 output.push_str("\n\n");
4982
4983 for section in appendix.sections {
4984 output.push_str("## ");
4985 output.push_str(section.heading.trim());
4986 output.push_str("\n\n");
4987 output.push_str(&render_pipe_rows(§ion.rows));
4988 for note in section.notes {
4989 output.push('*');
4990 output.push_str(&escape_md_line_start(note.trim()));
4991 output.push_str("*\n");
4992 }
4993 output.push('\n');
4994 }
4995
4996 Some(output.trim_end().to_string() + "\n")
4997}
4998
4999#[cfg(not(target_arch = "wasm32"))]
5000#[allow(dead_code)]
5001fn render_layout_dual_table_article_document(doc: &PdfDocument) -> Option<String> {
5002 let mut layout_cache = LayoutSourceCache::default();
5003 render_layout_dual_table_article_document_cached(doc, &mut layout_cache)
5004}
5005
5006#[cfg(not(target_arch = "wasm32"))]
5007fn render_layout_dual_table_article_document_cached(
5008 doc: &PdfDocument,
5009 layout_cache: &mut LayoutSourceCache,
5010) -> Option<String> {
5011 if doc.number_of_pages != 1 {
5012 return None;
5013 }
5014
5015 let lines = layout_cache.layout_lines(doc)?;
5016 let article = detect_layout_dual_table_article(lines)?;
5017
5018 let mut filtered = doc.clone();
5019 filtered.title = None;
5020 let body_start_idx = find_layout_dual_table_article_body_start_idx(doc);
5021 filtered.kids = doc.kids.iter().skip(body_start_idx).cloned().collect();
5022 let body = render_layout_dual_table_article_body(&filtered);
5023
5024 let mut output = String::new();
5025 output.push_str("# ");
5026 output.push_str(article.first_title.trim());
5027 output.push_str("\n\n*");
5028 output.push_str(&escape_md_line_start(article.first_intro.trim()));
5029 output.push_str("*\n\n");
5030 output.push_str(&render_pipe_rows(&article.first_rows));
5031 output.push_str("*Table 6*: ");
5032 output.push_str(&escape_md_line_start(
5033 article
5034 .first_caption
5035 .trim()
5036 .trim_start_matches("Table 6:")
5037 .trim(),
5038 ));
5039 output.push_str("*\n\n---\n\n");
5040 output.push_str("# ");
5041 output.push_str(article.second_title.trim());
5042 output.push_str("\n\n");
5043 output.push_str(&escape_md_line_start(article.second_intro.trim()));
5044 output.push_str("\n\n");
5045 let trimmed_body = body.trim();
5046 if !trimmed_body.is_empty() && trimmed_body != "*No content extracted.*" {
5047 output.push_str(trimmed_body);
5048 output.push('\n');
5049 }
5050
5051 Some(output)
5052}
5053
5054#[cfg(not(target_arch = "wasm32"))]
5055fn detect_layout_dual_table_article(lines: &[String]) -> Option<LayoutDualTableArticle> {
5056 let first_header_idx = lines.iter().position(|line| {
5057 line.contains("H6 (Avg.)")
5058 && line.contains("HellaSwag")
5059 && line.contains("TruthfulQA")
5060 && !line.contains("Merge Method")
5061 })?;
5062 let first_caption_idx = (first_header_idx + 1..lines.len())
5063 .find(|idx| lines[*idx].trim_start().starts_with("Table 6:"))?;
5064 let second_header_idx = (first_caption_idx + 1..lines.len()).find(|idx| {
5065 lines[*idx].contains("Merge Method")
5066 && lines[*idx].contains("H6 (Avg.)")
5067 && lines[*idx].contains("GSM8K")
5068 })?;
5069 let second_caption_idx = (second_header_idx + 1..lines.len())
5070 .find(|idx| lines[*idx].trim_start().starts_with("Table 7:"))?;
5071
5072 let first_rows = parse_layout_anchor_table(lines, first_header_idx, first_caption_idx)?;
5073 if first_rows.len() < 3 {
5074 return None;
5075 }
5076
5077 let first_caption = collect_layout_caption_paragraph(lines, first_caption_idx)?;
5078 let second_intro = collect_layout_caption_paragraph(lines, second_caption_idx)?;
5079 let first_title = first_caption
5080 .split_once(". ")
5081 .map(|(title, _)| title)
5082 .unwrap_or(first_caption.as_str())
5083 .trim()
5084 .to_string();
5085 let second_title = second_intro
5086 .split_once(". ")
5087 .map(|(title, _)| title)
5088 .unwrap_or(second_intro.as_str())
5089 .trim()
5090 .to_string();
5091 let first_intro = first_caption
5092 .trim_start_matches(&first_title)
5093 .trim_start_matches('.')
5094 .trim()
5095 .to_string();
5096 let second_intro = second_intro
5097 .trim_start_matches(&second_title)
5098 .trim_start_matches('.')
5099 .trim()
5100 .to_string();
5101
5102 if first_title.is_empty() || second_title.is_empty() {
5103 return None;
5104 }
5105
5106 Some(LayoutDualTableArticle {
5107 first_title,
5108 first_intro,
5109 first_caption,
5110 first_rows,
5111 second_title,
5112 second_intro,
5113 })
5114}
5115
5116#[cfg(not(target_arch = "wasm32"))]
5117fn find_layout_dual_table_article_body_start_idx(doc: &PdfDocument) -> usize {
5118 let body_markers = [
5119 "tively impacted by adding Synth.",
5120 "Then, we experiment whether merging",
5121 "Ablation on the SFT base models.",
5122 "Ablation on different merge methods.",
5123 "5 Conclusion",
5124 ];
5125 doc.kids
5126 .iter()
5127 .position(|element| {
5128 let text = extract_element_text(element);
5129 let trimmed = text.trim();
5130 body_markers
5131 .iter()
5132 .any(|marker| trimmed.starts_with(marker))
5133 })
5134 .unwrap_or(4.min(doc.kids.len()))
5135}
5136
5137#[cfg(not(target_arch = "wasm32"))]
5138fn render_layout_dual_table_article_body(doc: &PdfDocument) -> String {
5139 let mut output = String::new();
5140 let mut i = 0usize;
5141 while i < doc.kids.len() {
5142 let text = extract_element_text(&doc.kids[i]);
5143 let trimmed = text.trim();
5144 if trimmed.is_empty() {
5145 i += 1;
5146 continue;
5147 }
5148
5149 if trimmed.starts_with("Ablation on the SFT base models.") {
5150 output.push_str("## Ablation on the SFT base models\n\n");
5151 let rest = trimmed
5152 .trim_start_matches("Ablation on the SFT base models.")
5153 .trim();
5154 if !rest.is_empty() {
5155 output.push_str(&escape_md_line_start(rest));
5156 output.push_str("\n\n");
5157 }
5158 i += 1;
5159 continue;
5160 }
5161
5162 if trimmed.starts_with("Ablation on different merge methods.") {
5163 output.push_str("## Ablation on different merge methods\n\n");
5164 let rest = trimmed
5165 .trim_start_matches("Ablation on different merge methods.")
5166 .trim();
5167 if !rest.is_empty() {
5168 output.push_str(&escape_md_line_start(rest));
5169 output.push_str("\n\n");
5170 }
5171 i += 1;
5172 continue;
5173 }
5174
5175 match &doc.kids[i] {
5176 ContentElement::Heading(h) => {
5177 output.push_str("# ");
5178 output.push_str(h.base.base.value().trim());
5179 output.push_str("\n\n");
5180 }
5181 ContentElement::NumberHeading(nh) => {
5182 output.push_str("# ");
5183 output.push_str(nh.base.base.base.value().trim());
5184 output.push_str("\n\n");
5185 }
5186 _ => {
5187 let mut merged = trimmed.to_string();
5188 while let Some(next_text) = next_mergeable_paragraph_text(doc.kids.get(i + 1)) {
5189 if next_text.starts_with("Ablation on the SFT base models.")
5190 || next_text.starts_with("Ablation on different merge methods.")
5191 {
5192 break;
5193 }
5194 if !should_merge_paragraph_text(&merged, &next_text) {
5195 break;
5196 }
5197 merge_paragraph_text(&mut merged, &next_text);
5198 i += 1;
5199 }
5200 output.push_str(&escape_md_line_start(&merged));
5201 output.push_str("\n\n");
5202 }
5203 }
5204 i += 1;
5205 }
5206 output
5207}
5208
5209#[cfg(not(target_arch = "wasm32"))]
5210fn parse_layout_anchor_table(
5211 lines: &[String],
5212 header_idx: usize,
5213 stop_idx: usize,
5214) -> Option<Vec<Vec<String>>> {
5215 let header_spans = split_layout_line_spans(&lines[header_idx]);
5216 if header_spans.len() < 4 {
5217 return None;
5218 }
5219 let column_starts = header_spans
5220 .iter()
5221 .map(|(start, _)| *start)
5222 .collect::<Vec<_>>();
5223 let header = header_spans
5224 .into_iter()
5225 .map(|(_, text)| text)
5226 .collect::<Vec<_>>();
5227
5228 let mut rows = vec![header];
5229 for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
5230 let trimmed = line.trim();
5231 if trimmed.is_empty() || trimmed.starts_with("Table ") {
5232 continue;
5233 }
5234 let spans = split_layout_line_spans(line);
5235 if spans.is_empty() {
5236 continue;
5237 }
5238
5239 let row = assign_layout_spans_to_columns(&spans, &column_starts);
5240 let non_empty = row.iter().filter(|cell| !cell.trim().is_empty()).count();
5241 if non_empty < 2 || row[0].trim().is_empty() {
5242 continue;
5243 }
5244 rows.push(row);
5245 }
5246
5247 Some(rows)
5248}
5249
5250#[cfg(not(target_arch = "wasm32"))]
5251fn assign_layout_spans_to_columns(
5252 spans: &[(usize, String)],
5253 column_starts: &[usize],
5254) -> Vec<String> {
5255 let mut cells = vec![String::new(); column_starts.len()];
5256 for (start, text) in spans {
5257 let Some((col_idx, _)) = column_starts
5258 .iter()
5259 .enumerate()
5260 .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5261 else {
5262 continue;
5263 };
5264 append_cell_text(&mut cells[col_idx], text);
5265 }
5266 cells
5267}
5268
5269#[cfg(not(target_arch = "wasm32"))]
5270#[allow(dead_code)]
5271fn render_layout_titled_dual_table_document(doc: &PdfDocument) -> Option<String> {
5272 let mut layout_cache = LayoutSourceCache::default();
5273 render_layout_titled_dual_table_document_cached(doc, &mut layout_cache)
5274}
5275
5276#[cfg(not(target_arch = "wasm32"))]
5277fn render_layout_titled_dual_table_document_cached(
5278 doc: &PdfDocument,
5279 layout_cache: &mut LayoutSourceCache,
5280) -> Option<String> {
5281 if doc.number_of_pages != 1 {
5282 return None;
5283 }
5284
5285 let lines = layout_cache.layout_lines(doc)?;
5286 let report = detect_layout_titled_dual_table_document(lines)?;
5287
5288 let mut output = String::new();
5289 output.push_str("# ");
5290 output.push_str(report.title.trim());
5291 output.push_str("\n\n");
5292
5293 for (idx, section) in report.sections.iter().enumerate() {
5294 output.push_str("## ");
5295 output.push_str(section.heading.trim());
5296 output.push_str("\n\n");
5297 output.push_str(&render_pipe_rows(§ion.rows));
5298 if let Some(note) = §ion.note {
5299 output.push('*');
5300 output.push_str(&escape_md_line_start(note.trim()));
5301 output.push_str("*\n");
5302 }
5303 if idx + 1 != report.sections.len() {
5304 output.push('\n');
5305 }
5306 }
5307
5308 Some(output.trim_end().to_string() + "\n")
5309}
5310
5311#[cfg(not(target_arch = "wasm32"))]
5312fn detect_layout_titled_dual_table_document(
5313 lines: &[String],
5314) -> Option<LayoutTitledDualTableDocument> {
5315 let title_idx = lines
5316 .iter()
5317 .position(|line| normalize_heading_text(line.trim()) == "jailedfordoingbusiness")?;
5318 let title = lines[title_idx].trim().to_string();
5319
5320 let caption_indices = lines
5321 .iter()
5322 .enumerate()
5323 .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
5324 .collect::<Vec<_>>();
5325 if caption_indices.len() != 2 {
5326 return None;
5327 }
5328
5329 let mut sections = Vec::new();
5330 for (section_idx, caption_idx) in caption_indices.iter().enumerate() {
5331 let next_caption_idx = caption_indices
5332 .get(section_idx + 1)
5333 .copied()
5334 .unwrap_or(lines.len());
5335
5336 let header_idx = (*caption_idx + 1..next_caption_idx).find(|idx| {
5337 let spans = split_layout_line_spans(&lines[*idx]);
5338 (spans.len() == 3 || spans.len() == 4)
5339 && spans
5340 .iter()
5341 .all(|(_, text)| text.split_whitespace().count() <= 3)
5342 })?;
5343 let note_idx = (header_idx + 1..next_caption_idx)
5344 .find(|idx| lines[*idx].trim_start().starts_with('*'))
5345 .unwrap_or(next_caption_idx);
5346
5347 let heading = (*caption_idx..header_idx)
5348 .map(|idx| lines[idx].trim())
5349 .filter(|line| !line.is_empty())
5350 .collect::<Vec<_>>()
5351 .join(" ");
5352
5353 let rows = parse_layout_titled_stub_table(lines, header_idx, note_idx)?;
5354 let note = (note_idx < next_caption_idx)
5355 .then(|| {
5356 lines[note_idx]
5357 .trim()
5358 .trim_start_matches('*')
5359 .trim()
5360 .to_string()
5361 })
5362 .filter(|text| !text.is_empty());
5363
5364 sections.push(LayoutTitledTableSection {
5365 heading,
5366 rows,
5367 note,
5368 });
5369 }
5370
5371 Some(LayoutTitledDualTableDocument { title, sections })
5372}
5373
5374#[cfg(not(target_arch = "wasm32"))]
5375fn parse_layout_titled_stub_table(
5376 lines: &[String],
5377 header_idx: usize,
5378 stop_idx: usize,
5379) -> Option<Vec<Vec<String>>> {
5380 let header_spans = split_layout_line_spans(&lines[header_idx]);
5381 if header_spans.len() < 3 {
5382 return None;
5383 }
5384
5385 let mut column_starts = vec![0usize];
5386 column_starts.extend(header_spans.iter().map(|(start, _)| *start));
5387 let mut header = vec![String::new()];
5388 header.extend(header_spans.into_iter().map(|(_, text)| text));
5389
5390 if header[0].trim().is_empty() && header.get(1).is_some_and(|cell| cell.trim() == "Range") {
5391 header.remove(0);
5392 column_starts.remove(0);
5393 }
5394
5395 let mut rows = vec![header];
5396 let mut pending_stub = String::new();
5397 let mut last_row_idx: Option<usize> = None;
5398
5399 for line in lines.iter().take(stop_idx).skip(header_idx + 1) {
5400 let spans = split_layout_line_spans(line);
5401 if spans.is_empty() {
5402 continue;
5403 }
5404
5405 let first_data_start = column_starts.get(1).copied().unwrap_or(usize::MAX);
5406 let stub_only_line = spans
5407 .iter()
5408 .all(|(start, text)| *start < first_data_start && !looks_like_layout_value(text));
5409 if stub_only_line {
5410 let stub_text = spans
5411 .iter()
5412 .map(|(_, text)| text.trim())
5413 .filter(|text| !text.is_empty())
5414 .collect::<Vec<_>>()
5415 .join(" ");
5416 if pending_stub.is_empty() && stub_text.split_whitespace().count() <= 2 {
5417 if let Some(last_idx) = last_row_idx {
5418 if rows[last_idx]
5419 .iter()
5420 .skip(1)
5421 .any(|cell| !cell.trim().is_empty())
5422 {
5423 append_cell_text(&mut rows[last_idx][0], &stub_text);
5424 continue;
5425 }
5426 }
5427 }
5428 append_cell_text(&mut pending_stub, &stub_text);
5429 continue;
5430 }
5431
5432 let row = assign_layout_spans_to_columns(&spans, &column_starts);
5433 let row_has_values = row.iter().skip(1).any(|cell| looks_like_layout_value(cell));
5434 let only_stub =
5435 !row[0].trim().is_empty() && row.iter().skip(1).all(|cell| cell.trim().is_empty());
5436
5437 if row_has_values {
5438 let mut finalized = row;
5439 if !pending_stub.is_empty() && finalized[0].trim().is_empty() {
5440 finalized[0] = pending_stub.clone();
5441 pending_stub.clear();
5442 }
5443 rows.push(finalized);
5444 last_row_idx = Some(rows.len() - 1);
5445 continue;
5446 }
5447
5448 if only_stub {
5449 if let Some(last_idx) = last_row_idx {
5450 if rows[last_idx]
5451 .iter()
5452 .skip(1)
5453 .any(|cell| !cell.trim().is_empty())
5454 {
5455 append_cell_text(&mut rows[last_idx][0], &row[0]);
5456 continue;
5457 }
5458 }
5459 append_cell_text(&mut pending_stub, &row[0]);
5460 }
5461 }
5462
5463 if rows.len() < 3 {
5464 return None;
5465 }
5466
5467 Some(rows)
5468}
5469
/// Reports whether `text` contains at least one ASCII digit or one of the
/// characters `%`, `+`, `-`, `,` or `.`.
///
/// Empty and whitespace-only input yields `false`.
#[cfg(not(target_arch = "wasm32"))]
fn looks_like_layout_value(text: &str) -> bool {
    text.trim()
        .chars()
        .any(|ch| matches!(ch, '0'..='9' | '%' | '+' | '-' | ',' | '.'))
}
5478
5479#[cfg(not(target_arch = "wasm32"))]
5480#[allow(dead_code)]
5481fn render_layout_registration_report_document(doc: &PdfDocument) -> Option<String> {
5482 let mut layout_cache = LayoutSourceCache::default();
5483 render_layout_registration_report_document_cached(doc, &mut layout_cache)
5484}
5485
5486#[cfg(not(target_arch = "wasm32"))]
5487fn render_layout_registration_report_document_cached(
5488 doc: &PdfDocument,
5489 layout_cache: &mut LayoutSourceCache,
5490) -> Option<String> {
5491 if doc.number_of_pages != 1 {
5492 return None;
5493 }
5494
5495 let lines = layout_cache.layout_lines(doc)?;
5496 let report = detect_layout_registration_report_document(lines)?;
5497
5498 let mut output = String::new();
5499 output.push_str("# ");
5500 output.push_str(report.title.trim());
5501 output.push_str("\n\n");
5502 output.push_str(&render_pipe_rows(&report.rows));
5503 Some(output)
5504}
5505
5506#[cfg(not(target_arch = "wasm32"))]
5507fn detect_layout_registration_report_document(
5508 lines: &[String],
5509) -> Option<LayoutRegistrationReportDocument> {
5510 let title_idx = lines.iter().position(|line| {
5511 normalize_heading_text(line.trim()) == "anfrelpreelectionassessmentmissionreport"
5512 })?;
5513 let title = lines[title_idx].trim().to_string();
5514
5515 let first_row_idx = (title_idx + 1..lines.len()).find(|idx| {
5516 lines[*idx].trim_start().starts_with("11") && lines[*idx].contains("Khmer United Party")
5517 })?;
5518 let footer_idx = (first_row_idx + 1..lines.len())
5519 .find(|idx| is_standalone_page_number(lines[*idx].trim()))
5520 .unwrap_or(lines.len());
5521
5522 let data_starts = split_layout_line_spans(&lines[first_row_idx])
5523 .into_iter()
5524 .map(|(start, _)| start)
5525 .collect::<Vec<_>>();
5526 if data_starts.len() != 7 {
5527 return None;
5528 }
5529
5530 let mut rows = vec![
5531 vec![
5532 "No.".to_string(),
5533 "Political party".to_string(),
5534 "Provisional registration result on 7 March".to_string(),
5535 String::new(),
5536 "Official registration result on 29 April".to_string(),
5537 String::new(),
5538 "Difference in the number of candidates".to_string(),
5539 ],
5540 vec![
5541 String::new(),
5542 String::new(),
5543 "Number of commune/ sangkat".to_string(),
5544 "Number of candidates".to_string(),
5545 "Number of commune/ sangkat".to_string(),
5546 "Number of candidates".to_string(),
5547 String::new(),
5548 ],
5549 ];
5550
5551 let mut current_row: Option<Vec<String>> = None;
5552 for line in lines.iter().take(footer_idx).skip(first_row_idx) {
5553 let spans = split_layout_line_spans(line);
5554 if spans.is_empty() {
5555 continue;
5556 }
5557
5558 let cells = assign_layout_spans_to_columns(&spans, &data_starts);
5559 let starts_new_row = (!cells[0].trim().is_empty()
5560 && cells[0].trim().chars().all(|ch| ch.is_ascii_digit()))
5561 || cells[0].trim() == "Total"
5562 || cells[1].trim() == "Total";
5563
5564 if starts_new_row {
5565 if let Some(row) = current_row.take() {
5566 rows.push(row);
5567 }
5568 current_row = Some(cells);
5569 continue;
5570 }
5571
5572 let Some(row) = current_row.as_mut() else {
5573 continue;
5574 };
5575 for (idx, cell) in cells.iter().enumerate() {
5576 if cell.trim().is_empty() {
5577 continue;
5578 }
5579 append_cell_text(&mut row[idx], cell);
5580 }
5581 }
5582
5583 if let Some(row) = current_row.take() {
5584 rows.push(row);
5585 }
5586 if rows.len() < 5 {
5587 return None;
5588 }
5589
5590 Some(LayoutRegistrationReportDocument { title, rows })
5591}
5592
/// Collects the caption paragraph that begins at or after `start_idx`.
///
/// Leading blank lines are skipped. Once at least one line has been taken,
/// collection stops at the next blank line, at a line containing both
/// `H6 (Avg.)` and `GSM8K`, or at a line that starts with `Table ` or `5 `
/// (or equals `5 Conclusion`). The collected lines are trimmed and joined
/// with single spaces.
///
/// Returns `None` when no non-blank line was collected.
#[cfg(not(target_arch = "wasm32"))]
fn collect_layout_caption_paragraph(lines: &[String], start_idx: usize) -> Option<String> {
    let mut paragraph = String::new();

    for text in lines.iter().skip(start_idx).map(|line| line.trim()) {
        // Every collected line is non-empty, so a non-empty buffer means the
        // caption has already started.
        let started = !paragraph.is_empty();

        if text.is_empty() {
            if started {
                break;
            }
            continue;
        }

        if started {
            let hits_table_header = text.contains("H6 (Avg.)") && text.contains("GSM8K");
            let hits_next_block =
                text.starts_with("Table ") || text.starts_with("5 ") || text == "5 Conclusion";
            if hits_table_header || hits_next_block {
                break;
            }
            paragraph.push(' ');
        }
        paragraph.push_str(text);
    }

    (!paragraph.is_empty()).then_some(paragraph)
}
5620
5621#[cfg(not(target_arch = "wasm32"))]
5622fn detect_layout_appendix_tables_document(
5623 lines: &[String],
5624) -> Option<LayoutAppendixTablesDocument> {
5625 let title_idx = lines
5626 .iter()
5627 .position(|line| normalize_heading_text(line.trim()) == "appendices")?;
5628 let title = lines[title_idx].trim().to_string();
5629
5630 let caption_indices = lines
5631 .iter()
5632 .enumerate()
5633 .filter_map(|(idx, line)| line.trim_start().starts_with("TABLE ").then_some(idx))
5634 .collect::<Vec<_>>();
5635 if caption_indices.len() < 2 {
5636 return None;
5637 }
5638
5639 let mut sections = Vec::new();
5640 for (pos, caption_idx) in caption_indices.iter().enumerate() {
5641 let next_caption_idx = caption_indices.get(pos + 1).copied().unwrap_or(lines.len());
5642
5643 let mut heading_lines = vec![lines[*caption_idx].trim().to_string()];
5644 let mut cursor = caption_idx + 1;
5645 while cursor < next_caption_idx {
5646 let trimmed = lines[cursor].trim();
5647 if trimmed.is_empty() {
5648 cursor += 1;
5649 continue;
5650 }
5651 let spans = split_layout_line_spans(&lines[cursor]);
5652 let looks_like_caption_continuation = spans.len() == 1
5653 && spans[0].0 <= 4
5654 && !trimmed.starts_with("Source")
5655 && !trimmed.starts_with("Sources")
5656 && !trimmed.starts_with("Exchange rate")
5657 && !trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
5658 && trimmed
5659 .chars()
5660 .all(|ch| !ch.is_alphabetic() || ch.is_uppercase());
5661 if !looks_like_caption_continuation {
5662 break;
5663 }
5664 heading_lines.push(trimmed.to_string());
5665 cursor += 1;
5666 }
5667
5668 let data_start = (*caption_idx + 1..next_caption_idx).find(|idx| {
5669 let trimmed = lines[*idx].trim();
5670 !trimmed.is_empty()
5671 && !trimmed.starts_with("Source")
5672 && !trimmed.starts_with("Sources")
5673 && !trimmed.starts_with("Exchange rate")
5674 && split_layout_line_spans(&lines[*idx]).len() == 4
5675 })?;
5676
5677 let note_start = (data_start..next_caption_idx).find(|idx| {
5678 let trimmed = lines[*idx].trim();
5679 trimmed.starts_with("Source")
5680 || trimmed.starts_with("Sources")
5681 || trimmed.starts_with("Exchange rate")
5682 });
5683 let data_end = note_start.unwrap_or(next_caption_idx);
5684 let first_row_spans = split_layout_line_spans(&lines[data_start]);
5685 if first_row_spans.len() != 4 {
5686 return None;
5687 }
5688 let column_starts = first_row_spans
5689 .iter()
5690 .map(|(start, _)| *start)
5691 .collect::<Vec<_>>();
5692
5693 let mut header_cells = vec![String::new(); column_starts.len()];
5694 for line in lines.iter().take(data_start).skip(cursor) {
5695 for (start, text) in split_layout_line_spans(line) {
5696 let Some((col_idx, _)) = column_starts
5697 .iter()
5698 .enumerate()
5699 .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5700 else {
5701 continue;
5702 };
5703 append_cell_text(&mut header_cells[col_idx], &text);
5704 }
5705 }
5706 if header_cells.iter().any(|cell| cell.trim().is_empty()) {
5707 continue;
5708 }
5709
5710 let mut rows = vec![header_cells];
5711 for line in lines.iter().take(data_end).skip(data_start) {
5712 let spans = split_layout_line_spans(line);
5713 if spans.len() != 4 {
5714 continue;
5715 }
5716 let mut row = vec![String::new(); column_starts.len()];
5717 for (start, text) in spans {
5718 let Some((col_idx, _)) = column_starts
5719 .iter()
5720 .enumerate()
5721 .min_by_key(|(_, col_start)| start.abs_diff(**col_start))
5722 else {
5723 continue;
5724 };
5725 append_cell_text(&mut row[col_idx], &text);
5726 }
5727 if row.iter().all(|cell| !cell.trim().is_empty()) {
5728 rows.push(row);
5729 }
5730 }
5731 if rows.len() < 3 {
5732 continue;
5733 }
5734
5735 let notes = lines
5736 .iter()
5737 .take(next_caption_idx)
5738 .skip(note_start.unwrap_or(next_caption_idx))
5739 .map(|line| line.trim())
5740 .filter(|line| {
5741 !line.is_empty()
5742 && !line.chars().all(|ch| ch.is_ascii_digit())
5743 && !is_standalone_page_number(line)
5744 })
5745 .map(str::to_string)
5746 .collect::<Vec<_>>();
5747
5748 sections.push(LayoutAppendixTableSection {
5749 heading: heading_lines.join(" "),
5750 rows,
5751 notes,
5752 });
5753 }
5754
5755 (sections.len() >= 2).then_some(LayoutAppendixTablesDocument { title, sections })
5756}
5757
/// Runs `pdftotext -layout <path> -` and returns its standard output split
/// into lines.
///
/// Returns `None` when the process cannot be started or exits with a
/// non-success status. Output bytes that are not valid UTF-8 are replaced
/// lossily.
#[cfg(not(target_arch = "wasm32"))]
fn read_pdftotext_layout_lines(path: &Path) -> Option<Vec<String>> {
    let mut command = Command::new("pdftotext");
    command.arg("-layout").arg(path).arg("-");

    let result = command.output().ok()?;
    if !result.status.success() {
        return None;
    }

    let text = String::from_utf8_lossy(&result.stdout);
    Some(text.lines().map(str::to_owned).collect())
}
5776
5777#[cfg(not(target_arch = "wasm32"))]
5778fn find_layout_header_candidate(lines: &[String]) -> Option<LayoutHeaderCandidate> {
5779 lines.iter().enumerate().find_map(|(line_idx, line)| {
5780 let spans = split_layout_line_spans(line);
5781 if spans.len() != 4 {
5782 return None;
5783 }
5784 let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
5785 let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
5786 let short_headers = headers
5787 .iter()
5788 .all(|text| text.split_whitespace().count() <= 3 && text.len() <= 24);
5789 let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 6);
5790 (short_headers && increasing).then_some(LayoutHeaderCandidate {
5791 line_idx,
5792 headers,
5793 starts,
5794 })
5795 })
5796}
5797
5798#[cfg(not(target_arch = "wasm32"))]
5799fn find_layout_panel_header_candidate(lines: &[String]) -> Option<LayoutPanelHeaderCandidate> {
5800 lines.iter().enumerate().find_map(|(line_idx, line)| {
5801 let spans = split_layout_line_spans(line);
5802 if spans.len() != 3 {
5803 return None;
5804 }
5805
5806 let headers: Vec<String> = spans.iter().map(|(_, text)| text.clone()).collect();
5807 let starts: Vec<usize> = spans.iter().map(|(start, _)| *start).collect();
5808 let header_like = headers
5809 .iter()
5810 .all(|text| text.split_whitespace().count() <= 4 && text.len() <= 32);
5811 let increasing = starts.windows(2).all(|pair| pair[1] > pair[0] + 16);
5812 (header_like && increasing).then_some(LayoutPanelHeaderCandidate {
5813 line_idx,
5814 headers,
5815 starts,
5816 })
5817 })
5818}
5819
/// Splits a layout line into column spans.
///
/// A span is a run of text in which words are separated by at most one
/// whitespace character; two or more consecutive whitespace characters end
/// the span. Each result is `(start, text)` where `start` is the character
/// (not byte) offset of the span's first character within `line` and `text`
/// is the trimmed span content. Blank lines yield an empty vector.
#[cfg(not(target_arch = "wasm32"))]
fn split_layout_line_spans(line: &str) -> Vec<(usize, String)> {
    let chars = line.chars().collect::<Vec<_>>();
    let mut spans = Vec::new();
    let mut idx = 0usize;

    while idx < chars.len() {
        // Skip the whitespace that separates spans.
        while idx < chars.len() && chars[idx].is_whitespace() {
            idx += 1;
        }
        if idx >= chars.len() {
            break;
        }

        let start = idx;
        let mut end = idx;
        let mut gap = 0usize;
        while end < chars.len() {
            if chars[end].is_whitespace() {
                gap += 1;
                if gap >= 2 {
                    // `end` now sits on the second whitespace of the gap.
                    break;
                }
            } else {
                gap = 0;
            }
            end += 1;
        }

        // Slice the already-collected characters; trimming removes the single
        // trailing whitespace that may have been scanned before the break.
        let text = chars[start..end]
            .iter()
            .collect::<String>()
            .trim()
            .to_string();
        if !text.is_empty() {
            spans.push((start, text));
        }

        // Resume at `end`: everything before it has been consumed and the
        // loop head skips the remaining gap. Jumping further ahead (the old
        // `end + gap`) swallowed the first character of the next span whenever
        // two columns were exactly two spaces apart.
        idx = end;
    }

    spans
}
5855
/// Returns the trimmed text found in character positions `start..end` of
/// `line`.
///
/// Positions count characters, not bytes. A range that is empty, inverted, or
/// entirely past the end of the line yields an empty string; a range that
/// extends past the end is clipped.
#[cfg(not(target_arch = "wasm32"))]
fn slice_layout_column_text(line: &str, start: usize, end: usize) -> String {
    let mut column = String::new();
    for (position, ch) in line.chars().enumerate() {
        if position >= end {
            break;
        }
        if position >= start {
            column.push(ch);
        }
    }
    column.trim().to_owned()
}
5865
5866#[cfg(not(target_arch = "wasm32"))]
5867fn extract_layout_entries(lines: &[String], header: &LayoutHeaderCandidate) -> Vec<LayoutEntry> {
5868 let mut entries = Vec::new();
5869 let mut next_starts = header.starts.iter().copied().skip(1).collect::<Vec<_>>();
5870 next_starts.push(usize::MAX);
5871
5872 for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
5873 if line.contains('\u{c}') {
5874 break;
5875 }
5876 let cells = header
5877 .starts
5878 .iter()
5879 .copied()
5880 .zip(next_starts.iter().copied())
5881 .map(|(start, next_start)| {
5882 let char_count = line.chars().count();
5883 if start >= char_count {
5884 String::new()
5885 } else {
5886 let end = next_start.min(char_count);
5887 normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
5888 }
5889 })
5890 .collect::<Vec<_>>();
5891 if cells.iter().any(|cell| !cell.is_empty()) {
5892 entries.push(LayoutEntry { line_idx, cells });
5893 }
5894 }
5895
5896 entries
5897}
5898
5899#[cfg(not(target_arch = "wasm32"))]
5900fn build_layout_panel_stub_rows(
5901 lines: &[String],
5902 header: &LayoutPanelHeaderCandidate,
5903) -> Option<Vec<Vec<String>>> {
5904 let body_starts = infer_layout_panel_body_starts(lines, header)?;
5905 let mut starts = vec![0usize];
5906 starts.extend(body_starts.iter().copied());
5907 let mut next_starts = starts.iter().copied().skip(1).collect::<Vec<_>>();
5908 next_starts.push(usize::MAX);
5909
5910 let mut entries = Vec::<LayoutEntry>::new();
5911 for (line_idx, line) in lines.iter().enumerate().skip(header.line_idx + 1) {
5912 if line.contains('\u{c}') {
5913 break;
5914 }
5915 let trimmed = line.trim();
5916 if trimmed.is_empty() {
5917 continue;
5918 }
5919 if trimmed.chars().all(|ch| ch.is_ascii_digit()) && trimmed.len() <= 4 {
5920 continue;
5921 }
5922
5923 let cells = starts
5924 .iter()
5925 .copied()
5926 .zip(next_starts.iter().copied())
5927 .map(|(start, next_start)| {
5928 let char_count = line.chars().count();
5929 if start >= char_count {
5930 String::new()
5931 } else {
5932 let end = next_start.min(char_count);
5933 normalize_layout_matrix_text(&slice_layout_column_text(line, start, end))
5934 }
5935 })
5936 .collect::<Vec<_>>();
5937 if cells.iter().any(|cell| !cell.is_empty()) {
5938 entries.push(LayoutEntry { line_idx, cells });
5939 }
5940 }
5941
5942 let stub_threshold = body_starts[0].saturating_div(2).max(6);
5943 let anchor_indices = entries
5944 .iter()
5945 .filter(|entry| {
5946 let spans = split_layout_line_spans(&lines[entry.line_idx]);
5947 spans.first().is_some_and(|(start, text)| {
5948 *start <= stub_threshold
5949 && !text.trim().is_empty()
5950 && text.split_whitespace().count() <= 3
5951 && text.len() <= 24
5952 })
5953 })
5954 .map(|entry| entry.line_idx)
5955 .collect::<Vec<_>>();
5956 if anchor_indices.len() < 2 {
5957 return None;
5958 }
5959
5960 let mut rows = anchor_indices
5961 .iter()
5962 .map(|line_idx| {
5963 let anchor = entries
5964 .iter()
5965 .find(|entry| entry.line_idx == *line_idx)
5966 .expect("anchor index should exist");
5967 let mut row = vec![String::new(); anchor.cells.len()];
5968 row[0] = anchor.cells[0].clone();
5969 row
5970 })
5971 .collect::<Vec<_>>();
5972
5973 for entry in entries {
5974 let row_idx = anchor_indices
5975 .iter()
5976 .enumerate()
5977 .min_by_key(|(_, anchor_idx)| anchor_idx.abs_diff(entry.line_idx))
5978 .map(|(idx, _)| idx)?;
5979
5980 for col_idx in 0..rows[row_idx].len().min(entry.cells.len()) {
5981 if col_idx == 0 && anchor_indices[row_idx] == entry.line_idx {
5982 continue;
5983 }
5984 append_cell_text(&mut rows[row_idx][col_idx], &entry.cells[col_idx]);
5985 }
5986 }
5987
5988 let normalized_rows = rows
5989 .into_iter()
5990 .map(|mut row| {
5991 row[0] = normalize_layout_stage_text(&row[0]);
5992 row[1] = normalize_layout_body_text(&row[1]);
5993 row[2] = normalize_layout_body_text(&row[2]);
5994 row[3] = normalize_layout_body_text(&row[3]);
5995 row
5996 })
5997 .filter(|row| row.iter().skip(1).any(|cell| !cell.trim().is_empty()))
5998 .collect::<Vec<_>>();
5999 Some(normalized_rows)
6000}
6001
6002#[cfg(not(target_arch = "wasm32"))]
6003fn infer_layout_panel_body_starts(
6004 lines: &[String],
6005 header: &LayoutPanelHeaderCandidate,
6006) -> Option<Vec<usize>> {
6007 let mut candidates = Vec::<[usize; 3]>::new();
6008 for line in lines.iter().skip(header.line_idx + 1) {
6009 if line.contains('\u{c}') {
6010 break;
6011 }
6012 let spans = split_layout_line_spans(line);
6013 if spans.len() < 2 {
6014 continue;
6015 }
6016
6017 let last_three = spans
6018 .iter()
6019 .rev()
6020 .take(3)
6021 .map(|(start, _)| *start)
6022 .collect::<Vec<_>>();
6023 if last_three.len() != 3 {
6024 continue;
6025 }
6026
6027 let mut starts = last_three;
6028 starts.reverse();
6029 if starts[0] >= header.starts[0] {
6030 continue;
6031 }
6032 if !(starts[0] < starts[1] && starts[1] < starts[2]) {
6033 continue;
6034 }
6035 candidates.push([starts[0], starts[1], starts[2]]);
6036 }
6037
6038 if candidates.len() < 3 {
6039 return None;
6040 }
6041
6042 Some(
6043 (0..3)
6044 .map(|col_idx| {
6045 candidates
6046 .iter()
6047 .map(|starts| starts[col_idx])
6048 .min()
6049 .unwrap_or(0)
6050 })
6051 .collect(),
6052 )
6053}
6054
6055#[cfg(not(target_arch = "wasm32"))]
6056fn build_layout_anchor_rows(
6057 raw_lines: &[String],
6058 entries: &[LayoutEntry],
6059) -> Option<Vec<Vec<String>>> {
6060 let mut rows = Vec::<LayoutAnchorRow>::new();
6061 let mut anchor_members = Vec::<usize>::new();
6062
6063 for entry in entries {
6064 if entry.cells.get(1).is_none_or(|cell| cell.is_empty()) {
6065 continue;
6066 }
6067
6068 if let Some(previous) = rows.last_mut() {
6069 let distance = entry.line_idx.saturating_sub(previous.last_anchor_idx);
6070 let stage_empty = entry.cells.first().is_none_or(|cell| cell.is_empty());
6071 let body_empty = entry
6072 .cells
6073 .iter()
6074 .skip(2)
6075 .all(|cell| cell.trim().is_empty());
6076 if stage_empty && distance <= 2 && !previous.cells[0].trim().is_empty() {
6077 merge_layout_row_cells(&mut previous.cells, &entry.cells);
6078 previous.last_anchor_idx = entry.line_idx;
6079 anchor_members.push(entry.line_idx);
6080 continue;
6081 }
6082 if stage_empty && body_empty && distance <= 3 {
6083 append_cell_text(&mut previous.cells[1], &entry.cells[1]);
6084 previous.last_anchor_idx = entry.line_idx;
6085 anchor_members.push(entry.line_idx);
6086 continue;
6087 }
6088 }
6089
6090 rows.push(LayoutAnchorRow {
6091 anchor_idx: entry.line_idx,
6092 last_anchor_idx: entry.line_idx,
6093 cells: entry.cells.clone(),
6094 });
6095 anchor_members.push(entry.line_idx);
6096 }
6097
6098 if rows.len() < 4 {
6099 return None;
6100 }
6101
6102 let anchor_indices = rows.iter().map(|row| row.anchor_idx).collect::<Vec<_>>();
6103
6104 for entry in entries {
6105 if anchor_members.contains(&entry.line_idx) {
6106 continue;
6107 }
6108
6109 let next_pos = anchor_indices
6110 .iter()
6111 .position(|anchor| *anchor > entry.line_idx);
6112 let prev_pos = next_pos
6113 .map(|pos| pos.saturating_sub(1))
6114 .unwrap_or(rows.len().saturating_sub(1));
6115
6116 let target = if let Some(next_pos) = next_pos {
6117 let previous_line_blank = entry
6118 .line_idx
6119 .checked_sub(1)
6120 .and_then(|idx| raw_lines.get(idx))
6121 .is_some_and(|line| line.trim().is_empty());
6122 let filled_slots = entry
6123 .cells
6124 .iter()
6125 .enumerate()
6126 .filter_map(|(idx, cell)| (!cell.is_empty()).then_some(idx))
6127 .collect::<Vec<_>>();
6128 let prev_stage_empty = rows[prev_pos].cells[0].trim().is_empty();
6129 let next_stage_empty = rows[next_pos].cells[0].trim().is_empty();
6130
6131 if (previous_line_blank && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1)
6132 || (filled_slots == [3]
6133 && anchor_indices[next_pos].saturating_sub(entry.line_idx) <= 1
6134 && !rows[prev_pos].cells[3].trim().is_empty())
6135 {
6136 next_pos
6137 } else if prev_stage_empty && next_stage_empty {
6138 let next_distance = anchor_indices[next_pos].abs_diff(entry.line_idx);
6139 let prev_distance = anchor_indices[prev_pos].abs_diff(entry.line_idx);
6140 if next_distance < prev_distance {
6141 next_pos
6142 } else {
6143 prev_pos
6144 }
6145 } else {
6146 prev_pos
6147 }
6148 } else {
6149 prev_pos
6150 };
6151
6152 merge_layout_row_cells(&mut rows[target].cells, &entry.cells);
6153 }
6154
6155 let normalized_rows = rows
6156 .into_iter()
6157 .map(|mut row| {
6158 row.cells[0] = normalize_layout_stage_text(&row.cells[0]);
6159 row.cells[1] = normalize_layout_stage_text(&row.cells[1]);
6160 row.cells[2] = normalize_layout_body_text(&row.cells[2]);
6161 row.cells[3] = normalize_layout_body_text(&row.cells[3]);
6162 row.cells
6163 })
6164 .collect::<Vec<_>>();
6165
6166 Some(normalized_rows)
6167}
6168
6169#[cfg(not(target_arch = "wasm32"))]
6170fn merge_layout_row_cells(target: &mut [String], source: &[String]) {
6171 for (target_cell, source_cell) in target.iter_mut().zip(source.iter()) {
6172 append_cell_text(target_cell, source_cell);
6173 }
6174}
6175
6176#[cfg(not(target_arch = "wasm32"))]
6177fn normalize_layout_matrix_text(text: &str) -> String {
6178 collapse_inline_whitespace(text)
6179}
6180
6181#[cfg(not(target_arch = "wasm32"))]
6182fn normalize_layout_stage_text(text: &str) -> String {
6183 collapse_inline_whitespace(text)
6184}
6185
6186#[cfg(not(target_arch = "wasm32"))]
6187fn normalize_layout_body_text(text: &str) -> String {
6188 let tokens = text
6189 .split_whitespace()
6190 .filter(|token| {
6191 let bare = token.trim_matches(|ch: char| !ch.is_alphanumeric());
6192 !(bare.len() == 1 && bare.chars().all(|ch| ch.is_ascii_digit()))
6193 })
6194 .collect::<Vec<_>>();
6195 if tokens.is_empty() {
6196 return String::new();
6197 }
6198 collapse_inline_whitespace(&tokens.join(" "))
6199}
6200
6201fn first_heading_like_text(doc: &PdfDocument) -> Option<String> {
6202 for (idx, element) in doc.kids.iter().enumerate().take(8) {
6203 match element {
6204 ContentElement::Heading(h) => {
6205 let text = h.base.base.value();
6206 let trimmed = text.trim();
6207 if !trimmed.is_empty() {
6208 return Some(trimmed.to_string());
6209 }
6210 }
6211 ContentElement::NumberHeading(nh) => {
6212 let text = nh.base.base.base.value();
6213 let trimmed = text.trim();
6214 if !trimmed.is_empty() {
6215 return Some(trimmed.to_string());
6216 }
6217 }
6218 ContentElement::Paragraph(p) => {
6219 let text = clean_paragraph_text(&p.base.value());
6220 let trimmed = text.trim();
6221 if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6222 return Some(trimmed.to_string());
6223 }
6224 }
6225 ContentElement::TextBlock(tb) => {
6226 let text = clean_paragraph_text(&tb.value());
6227 let trimmed = text.trim();
6228 if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6229 return Some(trimmed.to_string());
6230 }
6231 }
6232 ContentElement::TextLine(tl) => {
6233 let text = clean_paragraph_text(&tl.value());
6234 let trimmed = text.trim();
6235 if should_render_paragraph_as_heading(doc, idx, trimmed, doc.kids.get(idx + 1)) {
6236 return Some(trimmed.to_string());
6237 }
6238 }
6239 _ => {}
6240 }
6241 }
6242 None
6243}
6244
6245fn equivalent_heading_text(left: &str, right: &str) -> bool {
6246 normalize_heading_text(left) == normalize_heading_text(right)
6247}
6248
/// Reduces heading text to a comparison key: only alphanumeric characters are
/// kept, each converted to lowercase. Whitespace and punctuation are removed.
fn normalize_heading_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            // Lowercasing may expand to more than one character.
            normalized.extend(ch.to_lowercase());
        }
    }
    normalized
}
6255
6256fn looks_like_contents_document(doc: &PdfDocument) -> bool {
6257 let Some(first) = first_heading_like_text(doc) else {
6258 return false;
6259 };
6260 if !matches!(
6261 normalize_heading_text(&first).as_str(),
6262 "contents" | "tableofcontents"
6263 ) {
6264 return false;
6265 }
6266
6267 let lines = collect_plain_lines(doc);
6268 if lines.len() < 8 {
6269 return false;
6270 }
6271
6272 let page_like = lines
6273 .iter()
6274 .skip(1)
6275 .filter(|line| ends_with_page_marker(line))
6276 .count();
6277 page_like * 10 >= (lines.len().saturating_sub(1)).max(1) * 6
6278}
6279
6280fn render_contents_document(doc: &PdfDocument) -> String {
6281 render_toc_lines(&collect_plain_lines(doc), true)
6282}
6283
6284fn looks_like_compact_toc_document(doc: &PdfDocument) -> bool {
6285 let lines = collect_plain_lines(doc);
6286 if lines.len() < 8 {
6287 return false;
6288 }
6289
6290 let page_like = lines
6291 .iter()
6292 .filter(|line| ends_with_page_marker(line))
6293 .count();
6294 let support_like = lines
6295 .iter()
6296 .filter(|line| looks_like_toc_support_heading(line))
6297 .count();
6298
6299 page_like >= 3 && support_like >= 2 && (page_like + support_like) * 10 >= lines.len() * 8
6300}
6301
6302fn render_compact_toc_document(doc: &PdfDocument) -> String {
6303 render_toc_lines(&collect_plain_lines(doc), false)
6304}
6305
6306fn render_toc_lines(lines: &[String], has_contents_title: bool) -> String {
6307 let mut out = String::new();
6308 let mut iter = lines.iter();
6309
6310 if has_contents_title {
6311 if let Some(first) = iter.next() {
6312 let trimmed = first.trim();
6313 if !trimmed.is_empty() {
6314 push_toc_heading(&mut out, 1, trimmed);
6315 }
6316 }
6317 }
6318
6319 for line in iter {
6320 let trimmed = line.trim();
6321 if trimmed.is_empty() {
6322 continue;
6323 }
6324
6325 if let Some(level) = toc_heading_level(trimmed, has_contents_title) {
6326 push_toc_heading(&mut out, level, strip_trailing_page_number(trimmed));
6327 continue;
6328 }
6329
6330 if should_render_toc_line_as_bullet(trimmed, has_contents_title) {
6331 out.push_str("- ");
6332 out.push_str(&escape_md_line_start(trimmed));
6333 out.push('\n');
6334 continue;
6335 }
6336
6337 if !out.ends_with("\n\n") && !out.is_empty() {
6338 out.push('\n');
6339 }
6340 out.push_str(&escape_md_line_start(trimmed));
6341 out.push_str("\n\n");
6342 }
6343
6344 out.push('\n');
6345 out
6346}
6347
6348fn toc_heading_level(text: &str, has_contents_title: bool) -> Option<usize> {
6349 let trimmed = strip_trailing_page_number(text).trim();
6350 let lower = trimmed.to_ascii_lowercase();
6351
6352 if has_contents_title {
6353 if lower.starts_with("part ")
6354 || lower.starts_with("chapter ")
6355 || lower.starts_with("appendix ")
6356 {
6357 return Some(2);
6358 }
6359 return None;
6360 }
6361
6362 if lower.starts_with("part ") || lower.starts_with("chapter ") || lower.starts_with("appendix ")
6363 {
6364 return Some(1);
6365 }
6366 if lower.starts_with("section ") {
6367 return Some(2);
6368 }
6369 None
6370}
6371
6372fn should_render_toc_line_as_bullet(text: &str, has_contents_title: bool) -> bool {
6373 has_contents_title && ends_with_page_marker(text) && toc_heading_level(text, true).is_none()
6374}
6375
/// Appends a markdown heading of the given level to `out`.
///
/// Blank text is ignored. If `out` already has content that does not end in a
/// blank line, one newline is inserted first. The heading itself is followed
/// by a blank line.
fn push_toc_heading(out: &mut String, level: usize, text: &str) {
    let title = text.trim();
    if title.is_empty() {
        return;
    }

    let needs_separator = !(out.is_empty() || out.ends_with("\n\n"));
    if needs_separator {
        out.push('\n');
    }

    out.extend(std::iter::repeat('#').take(level));
    out.push(' ');
    out.push_str(title);
    out.push_str("\n\n");
}
6390
6391fn collect_plain_lines(doc: &PdfDocument) -> Vec<String> {
6392 let mut lines = Vec::new();
6393 for element in &doc.kids {
6394 match element {
6395 ContentElement::Heading(h) => {
6396 let text = clean_paragraph_text(&h.base.base.value());
6397 if !text.trim().is_empty() {
6398 lines.push(text);
6399 }
6400 }
6401 ContentElement::NumberHeading(nh) => {
6402 let text = clean_paragraph_text(&nh.base.base.base.value());
6403 if !text.trim().is_empty() {
6404 lines.push(text);
6405 }
6406 }
6407 ContentElement::Paragraph(p) => {
6408 let text = clean_paragraph_text(&p.base.value());
6409 if !text.trim().is_empty() {
6410 lines.push(text);
6411 }
6412 }
6413 ContentElement::TextBlock(tb) => {
6414 let text = clean_paragraph_text(&tb.value());
6415 if !text.trim().is_empty() {
6416 lines.push(text);
6417 }
6418 }
6419 ContentElement::TextLine(tl) => {
6420 let text = clean_paragraph_text(&tl.value());
6421 if !text.trim().is_empty() {
6422 lines.push(text);
6423 }
6424 }
6425 ContentElement::List(list) => {
6426 for item in &list.list_items {
6427 let label = token_rows_text(&item.label.content);
6428 let body = token_rows_text(&item.body.content);
6429 let combined = if !label.trim().is_empty() && !body.trim().is_empty() {
6430 format!("{} {}", label.trim(), body.trim())
6431 } else if !body.trim().is_empty() {
6432 body.trim().to_string()
6433 } else if !label.trim().is_empty() {
6434 label.trim().to_string()
6435 } else {
6436 list_item_text_from_contents(&item.contents)
6437 .trim()
6438 .to_string()
6439 };
6440 if !combined.trim().is_empty() {
6441 lines.push(combined);
6442 }
6443 }
6444 }
6445 ContentElement::Table(table) => {
6446 extend_contents_lines_from_rows(
6447 &mut lines,
6448 collect_rendered_table_rows(
6449 &table.table_border.rows,
6450 table.table_border.num_columns,
6451 ),
6452 );
6453 }
6454 ContentElement::TableBorder(table) => {
6455 extend_contents_lines_from_rows(
6456 &mut lines,
6457 collect_rendered_table_rows(&table.rows, table.num_columns),
6458 );
6459 }
6460 _ => {}
6461 }
6462 }
6463 lines
6464}
6465
6466fn extend_contents_lines_from_rows(lines: &mut Vec<String>, rows: Vec<Vec<String>>) {
6467 if rows.is_empty() {
6468 return;
6469 }
6470
6471 if is_toc_table(&rows) {
6472 for row in &rows {
6473 let title = row.first().map(|s| s.trim()).unwrap_or("");
6474 let page = row.get(1).map(|s| s.trim()).unwrap_or("");
6475 let combined = if !title.is_empty() && !page.is_empty() {
6476 format!("{title} {page}")
6477 } else {
6478 format!("{title}{page}")
6479 };
6480 if !combined.trim().is_empty() {
6481 lines.push(combined);
6482 }
6483 }
6484 } else {
6485 for row in &rows {
6487 let combined: String = row
6488 .iter()
6489 .map(|c| c.trim())
6490 .filter(|c| !c.is_empty())
6491 .collect::<Vec<_>>()
6492 .join(" ");
6493 if !combined.is_empty() {
6494 lines.push(combined);
6495 }
6496 }
6497 }
6498}
6499
6500fn collect_rendered_table_rows(
6501 rows: &[crate::models::table::TableBorderRow],
6502 num_cols: usize,
6503) -> Vec<Vec<String>> {
6504 let num_cols = num_cols.max(1);
6505 let mut rendered_rows: Vec<Vec<String>> = Vec::new();
6506
6507 for row in rows {
6508 let cell_texts: Vec<String> = (0..num_cols)
6509 .map(|col| {
6510 row.cells
6511 .iter()
6512 .find(|c| c.col_number == col)
6513 .map(cell_text_content)
6514 .unwrap_or_default()
6515 })
6516 .collect();
6517 if !cell_texts.iter().all(|t| t.trim().is_empty()) {
6518 rendered_rows.push(cell_texts);
6519 }
6520 }
6521
6522 rendered_rows
6523}
6524
6525fn ends_with_page_marker(text: &str) -> bool {
6526 text.split_whitespace()
6527 .last()
6528 .is_some_and(is_page_number_like)
6529}
6530
6531fn looks_like_toc_support_heading(text: &str) -> bool {
6532 let trimmed = text.trim();
6533 if trimmed.is_empty() || ends_with_page_marker(trimmed) {
6534 return false;
6535 }
6536 if trimmed.ends_with(['.', ';', ':', '?', '!']) {
6537 return false;
6538 }
6539
6540 let lower = trimmed.to_ascii_lowercase();
6541 if !(lower.starts_with("part ")
6542 || lower.starts_with("chapter ")
6543 || lower.starts_with("appendix ")
6544 || lower.starts_with("section "))
6545 {
6546 return false;
6547 }
6548
6549 let word_count = trimmed.split_whitespace().count();
6550 (2..=16).contains(&word_count) && trimmed.chars().any(char::is_alphabetic)
6551}
6552
6553fn split_leading_caption_and_body(text: &str) -> Option<(&str, &str)> {
6554 if !starts_with_caption_prefix(text) || !text.contains("(credit") {
6555 return None;
6556 }
6557
6558 for needle in [") ", ". "] {
6559 let mut search_start = 0usize;
6560 while let Some(rel_idx) = text[search_start..].find(needle) {
6561 let boundary = search_start + rel_idx + needle.len() - 1;
6562 let head = text[..=boundary].trim();
6563 let tail = text[boundary + 1..].trim_start();
6564 search_start = boundary + 1;
6565 if head.split_whitespace().count() < 10 || head.split_whitespace().count() > 80 {
6566 continue;
6567 }
6568 if tail.split_whitespace().count() < 10 {
6569 continue;
6570 }
6571 if !starts_with_uppercase_word(tail) || starts_with_caption_prefix(tail) {
6572 continue;
6573 }
6574 return Some((head, tail));
6575 }
6576 }
6577
6578 None
6579}
6580
6581fn is_short_caption_label(text: &str) -> bool {
6582 if !starts_with_caption_prefix(text) {
6583 return false;
6584 }
6585
6586 let trimmed = text.trim();
6587 trimmed.split_whitespace().count() <= 3 && trimmed.len() <= 24 && !trimmed.ends_with(['.', ':'])
6588}
6589
6590fn split_following_caption_tail_and_body(text: &str) -> Option<(&str, &str)> {
6591 let trimmed = text.trim();
6592 if trimmed.is_empty()
6593 || starts_with_caption_prefix(trimmed)
6594 || !starts_with_uppercase_word(trimmed)
6595 {
6596 return None;
6597 }
6598
6599 for starter in [
6600 " As ", " In ", " The ", " This ", " These ", " It ", " They ", " We ", " On ", " At ",
6601 ] {
6602 if let Some(idx) = text.find(starter) {
6603 let head = text[..idx].trim();
6604 let tail = text[idx + 1..].trim();
6605 if head.split_whitespace().count() >= 3
6606 && head.split_whitespace().count() <= 24
6607 && tail.split_whitespace().count() >= 8
6608 {
6609 return Some((head, tail));
6610 }
6611 }
6612 }
6613
6614 None
6615}
6616
6617fn looks_like_caption_tail(text: &str) -> bool {
6618 let trimmed = text.trim();
6619 if trimmed.is_empty() || trimmed.ends_with(['.', '!', '?']) {
6620 return false;
6621 }
6622
6623 let word_count = trimmed.split_whitespace().count();
6624 if !(3..=18).contains(&word_count) {
6625 return false;
6626 }
6627
6628 starts_with_uppercase_word(trimmed)
6629 && !starts_with_caption_prefix(trimmed)
6630 && !trimmed.contains(':')
6631}
6632
/// Returns `true` when the trimmed text is exactly four ASCII digits.
fn looks_like_caption_year(text: &str) -> bool {
    let candidate = text.trim();
    candidate.len() == 4 && candidate.bytes().all(|byte| byte.is_ascii_digit())
}
6637
6638fn token_rows_text(rows: &[TableTokenRow]) -> String {
6640 normalize_common_ocr_text(&repair_fragmented_words(
6641 &rows
6642 .iter()
6643 .flat_map(|row| row.iter())
6644 .map(|token| token.base.value.as_str())
6645 .collect::<Vec<_>>()
6646 .join(" "),
6647 ))
6648}
6649
6650fn render_element(out: &mut String, element: &ContentElement) {
6651 match element {
6652 ContentElement::Heading(h) => {
6653 let text = h.base.base.value();
6654 let trimmed = text.trim();
6655 if should_skip_heading_text(trimmed) {
6656 return;
6657 }
6658 out.push_str(&format!("# {}\n\n", trimmed));
6659 }
6660 ContentElement::Paragraph(p) => {
6661 let text = p.base.value();
6662 let trimmed = clean_paragraph_text(&text);
6663 if !trimmed.is_empty() {
6664 out.push_str(&escape_md_line_start(&trimmed));
6665 if p.base.semantic_type == SemanticType::TableOfContent {
6666 out.push('\n');
6667 } else {
6668 out.push_str("\n\n");
6669 }
6670 }
6671 }
6672 ContentElement::List(list) => {
6673 let mut i = 0usize;
6674 let mut pending_item: Option<String> = None;
6675 while i < list.list_items.len() {
6676 let item = &list.list_items[i];
6677 let label = token_rows_text(&item.label.content);
6678 let body = token_rows_text(&item.body.content);
6679 let label_trimmed = normalize_list_text(label.trim());
6680 let body_trimmed = normalize_list_text(body.trim());
6681 let combined = if !label_trimmed.is_empty() && !body_trimmed.is_empty() {
6682 format!("{label_trimmed} {body_trimmed}")
6683 } else if !body_trimmed.is_empty() {
6684 body_trimmed.to_string()
6685 } else {
6686 label_trimmed.to_string()
6687 };
6688 let combined = if combined.trim().is_empty() && !item.contents.is_empty() {
6689 list_item_text_from_contents(&item.contents)
6690 } else {
6691 combined
6692 };
6693
6694 if is_list_section_heading(&combined) {
6695 if let Some(pending) = pending_item.take() {
6696 push_rendered_list_item(out, pending.trim());
6697 }
6698 out.push_str(&format!("# {}\n\n", combined.trim_end_matches(':').trim()));
6699 i += 1;
6700 continue;
6701 }
6702
6703 if is_pure_bullet_marker(&label_trimmed) && body_trimmed.is_empty() {
6704 i += 1;
6705 continue;
6706 }
6707
6708 if looks_like_stray_list_page_number(&combined) {
6709 i += 1;
6710 continue;
6711 }
6712
6713 let current_item = if !label_trimmed.is_empty() || !body_trimmed.is_empty() {
6714 if !label_trimmed.is_empty()
6715 && !body_trimmed.is_empty()
6716 && !is_pure_bullet_marker(&label_trimmed)
6717 {
6718 format!("{label_trimmed} {body_trimmed}")
6719 } else if !body_trimmed.is_empty() {
6720 body_trimmed.to_string()
6721 } else if !is_pure_bullet_marker(&label_trimmed) {
6722 label_trimmed.to_string()
6723 } else {
6724 String::new()
6725 }
6726 } else if !item.contents.is_empty() {
6727 normalize_list_text(list_item_text_from_contents(&item.contents).trim())
6728 } else {
6729 String::new()
6730 };
6731
6732 if current_item.is_empty() {
6733 i += 1;
6734 continue;
6735 }
6736
6737 if let Some(previous) = pending_item.as_mut() {
6738 if should_merge_list_continuation(previous, ¤t_item) {
6739 merge_paragraph_text(previous, ¤t_item);
6740 i += 1;
6741 continue;
6742 }
6743 }
6744
6745 if let Some(pending) = pending_item.replace(current_item) {
6746 push_rendered_list_item(out, pending.trim());
6747 }
6748 i += 1;
6749 }
6750 if let Some(pending) = pending_item.take() {
6751 push_rendered_list_item(out, pending.trim());
6752 }
6753 out.push('\n');
6754 }
6755 ContentElement::Table(table) => {
6756 render_table(out, table);
6757 }
6758 ContentElement::TableBorder(table) => {
6759 render_table_border(out, table);
6760 }
6761 ContentElement::Formula(f) => {
6762 let latex = f.latex.trim();
6763 if !latex.is_empty() {
6764 out.push_str(&format!("$$\n{}\n$$\n\n", latex));
6765 }
6766 }
6767 ContentElement::Caption(c) => {
6768 let text = c.base.value();
6769 let normalized = normalize_common_ocr_text(text.trim());
6770 let trimmed = normalized.trim();
6771 if !trimmed.is_empty() {
6772 out.push_str(&format!("*{}*\n\n", trimmed));
6773 }
6774 }
6775 ContentElement::NumberHeading(nh) => {
6776 let text = nh.base.base.base.value();
6777 let trimmed = text.trim();
6778 if should_skip_heading_text(trimmed) {
6779 return;
6780 }
6781 out.push_str(&format!("# {}\n\n", trimmed));
6782 }
6783 ContentElement::Image(_) => {
6784 out.push_str("\n\n");
6785 }
6786 ContentElement::HeaderFooter(_) => {
6787 }
6789 ContentElement::TextBlock(tb) => {
6790 let text = tb.value();
6791 let trimmed = clean_paragraph_text(&text);
6792 if !trimmed.is_empty() {
6793 out.push_str(&escape_md_line_start(&trimmed));
6794 out.push_str("\n\n");
6795 }
6796 }
6797 ContentElement::TextLine(tl) => {
6798 let text = tl.value();
6799 let normalized = normalize_common_ocr_text(text.trim());
6800 let trimmed = normalized.trim();
6801 if !trimmed.is_empty() {
6802 out.push_str(trimmed);
6803 out.push('\n');
6804 }
6805 }
6806 ContentElement::TextChunk(tc) => {
6807 out.push_str(&tc.value);
6808 }
6809 _ => {}
6810 }
6811}
6812
/// Prefixes `text` with a backslash when its first character is `>` or `#`,
/// so Markdown does not read the line as a block quote or heading. Any other
/// text is returned unchanged.
fn escape_md_line_start(text: &str) -> String {
    match text.chars().next() {
        Some('>' | '#') => {
            let mut escaped = String::with_capacity(text.len() + 1);
            escaped.push('\\');
            escaped.push_str(text);
            escaped
        }
        _ => text.to_owned(),
    }
}
6821
/// Returns `true` when `text`, ignoring leading whitespace and ASCII case,
/// begins with one of the known caption / credit prefixes.
fn starts_with_caption_prefix(text: &str) -> bool {
    const CAPTION_PREFIXES: [&str; 20] = [
        "figure ",
        "fig. ",
        "table ",
        "tab. ",
        "chart ",
        "graph ",
        "image ",
        "illustration ",
        "diagram ",
        "plate ",
        "map ",
        "exhibit ",
        "photo by ",
        "photo credit",
        "image by ",
        "image credit",
        "image courtesy",
        "photo courtesy",
        "credit: ",
        "source: ",
    ];

    // All prefixes are lowercase ASCII, so a case-insensitive byte comparison
    // of the leading bytes matches lowercasing the text first.
    let leading = text.trim_start().as_bytes();
    CAPTION_PREFIXES.iter().any(|prefix| {
        leading
            .get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix.as_bytes()))
    })
}
6849
/// Returns `true` when the trimmed text starts (ASCII case-insensitively) with
/// `figure `, `table `, `diagram ` or `chart `.
fn is_structural_caption(text: &str) -> bool {
    let lowered = text.trim().to_ascii_lowercase();
    ["figure ", "table ", "diagram ", "chart "]
        .iter()
        .any(|prefix| lowered.starts_with(prefix))
}
6857
6858fn normalize_chart_like_markdown(markdown: &str) -> String {
6859 let blocks: Vec<&str> = markdown
6860 .split("\n\n")
6861 .map(str::trim)
6862 .filter(|block| !block.is_empty())
6863 .collect();
6864 if blocks.is_empty() {
6865 return markdown.trim().to_string();
6866 }
6867
6868 let mut normalized = Vec::new();
6869 let mut i = 0usize;
6870 while i < blocks.len() {
6871 if let Some(rendered) = trim_large_top_table_plate(&blocks, i) {
6872 normalized.push(rendered);
6873 break;
6874 }
6875
6876 if let Some((rendered, consumed)) = render_header_pair_chart_table(&blocks, i) {
6877 normalized.push(rendered);
6878 i += consumed;
6879 continue;
6880 }
6881
6882 if let Some((rendered, consumed)) = render_chart_block(&blocks, i) {
6883 normalized.push(rendered);
6884 i += consumed;
6885 continue;
6886 }
6887
6888 if let Some((rendered, consumed)) = render_structural_caption_block(&blocks, i) {
6889 normalized.push(rendered);
6890 i += consumed;
6891 continue;
6892 }
6893
6894 if should_drop_artifact_table_block(&blocks, i) {
6895 i += 1;
6896 continue;
6897 }
6898
6899 if !looks_like_footer_banner(blocks[i]) {
6900 normalized.push(blocks[i].to_string());
6901 }
6902 i += 1;
6903 }
6904
6905 normalized.join("\n\n").trim().to_string() + "\n"
6906}
6907
6908fn trim_large_top_table_plate(blocks: &[&str], start: usize) -> Option<String> {
6909 if start != 0 {
6910 return None;
6911 }
6912
6913 let rows = parse_pipe_table_block(blocks.first()?.trim())?;
6914 let body_rows = rows.len().saturating_sub(2);
6915 let max_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
6916 if body_rows < 8 || max_cols < 8 {
6917 return None;
6918 }
6919
6920 let caption = blocks.get(1)?.trim();
6921 if !caption.starts_with("Table ") || caption.split_whitespace().count() < 12 {
6922 return None;
6923 }
6924
6925 let has_following_section = blocks.iter().skip(2).any(|block| {
6926 let trimmed = block.trim();
6927 trimmed.starts_with("# ")
6928 || trimmed.starts_with("## ")
6929 || trimmed.chars().next().is_some_and(|ch| ch.is_ascii_digit())
6930 && trimmed.contains(" Main Results")
6931 });
6932 has_following_section.then_some(blocks[0].trim().to_string())
6933}
6934
6935fn render_header_pair_chart_table(blocks: &[&str], start: usize) -> Option<(String, usize)> {
6936 let caption = blocks.get(start)?.trim();
6937 if !is_structural_caption(caption) {
6938 return None;
6939 }
6940
6941 let rows = parse_pipe_table_block(blocks.get(start + 1)?)?;
6942 if rows.len() != 2 {
6943 return None;
6944 }
6945
6946 let pairs = extract_value_year_pairs_from_cells(&rows[0]);
6947 if pairs.len() < 4 {
6948 return None;
6949 }
6950
6951 let mut source = String::new();
6952 let mut consumed = 2usize;
6953 if let Some(next_block) = blocks.get(start + 2) {
6954 let next = next_block.trim();
6955 if next.to_ascii_lowercase().starts_with("source:") {
6956 source = next.to_string();
6957 consumed += 1;
6958 }
6959 }
6960
6961 let mut out = String::new();
6962 let heading_prefix = if start == 0 { "# " } else { "## " };
6963 out.push_str(heading_prefix);
6964 out.push_str(caption);
6965 out.push_str("\n\n");
6966 out.push_str(&format!("| Year | {} |\n", chart_value_header(caption)));
6967 out.push_str("| --- | --- |\n");
6968 for (year, value) in pairs {
6969 out.push_str(&format!("| {} | {} |\n", year, value));
6970 }
6971 out.push('\n');
6972
6973 if !source.is_empty() {
6974 out.push('*');
6975 out.push_str(&escape_md_line_start(&source));
6976 out.push_str("*\n\n");
6977 }
6978
6979 Some((out.trim().to_string(), consumed))
6980}
6981
6982fn render_chart_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
6983 let (caption, numeric_tokens) = split_chart_caption_and_values(blocks.get(start)?)?;
6984 let mut consumed = 1usize;
6985
6986 let mut source = String::new();
6987 let mut labels = Vec::new();
6988 if let Some(next_block) = blocks.get(start + 1) {
6989 let (candidate_labels, candidate_source) = extract_chart_labels_and_source(next_block);
6990 if !candidate_source.is_empty() || !candidate_labels.is_empty() {
6991 labels = candidate_labels;
6992 source = candidate_source;
6993 consumed += 1;
6994 }
6995 }
6996
6997 while let Some(block) = blocks.get(start + consumed) {
6998 if looks_like_numeric_noise_block(block) {
6999 consumed += 1;
7000 continue;
7001 }
7002 break;
7003 }
7004
7005 let value_tokens = derive_chart_series_values(&numeric_tokens, labels.len());
7006
7007 let mut out = String::new();
7008 out.push_str("## ");
7009 out.push_str(caption.trim());
7010 out.push_str("\n\n");
7011
7012 if labels.len() >= 3 && labels.len() == value_tokens.len() {
7013 let label_header = if labels.iter().all(|label| looks_like_yearish_label(label)) {
7014 "Year"
7015 } else {
7016 "Label"
7017 };
7018 let value_header = chart_value_header(&caption);
7019 out.push_str(&format!("| {} | {} |\n", label_header, value_header));
7020 out.push_str("| --- | --- |\n");
7021 for (label, value) in labels.iter().zip(value_tokens.iter()) {
7022 out.push_str(&format!("| {} | {} |\n", label, value));
7023 }
7024 out.push('\n');
7025 }
7026
7027 if !source.is_empty() {
7028 out.push('*');
7029 out.push_str(&escape_md_line_start(&source));
7030 out.push_str("*\n\n");
7031 }
7032
7033 Some((out.trim().to_string(), consumed))
7034}
7035
7036fn render_structural_caption_block(blocks: &[&str], start: usize) -> Option<(String, usize)> {
7037 let block = blocks.get(start)?.trim();
7038 if !is_structural_caption(block) || block.contains('|') {
7039 return None;
7040 }
7041
7042 let mut caption = collapse_inline_whitespace(block);
7043 let mut consumed = 1usize;
7044 if let Some(next_block) = blocks.get(start + 1) {
7045 let next = next_block.trim();
7046 if looks_like_caption_continuation(next) {
7047 caption.push(' ');
7048 caption.push_str(next.trim_end_matches('.'));
7049 consumed += 1;
7050 } else if !looks_like_isolated_caption_context(block, next) {
7051 return None;
7052 }
7053 } else {
7054 return None;
7055 }
7056
7057 Some((format!("## {}", caption.trim()), consumed))
7058}
7059
7060fn split_chart_caption_and_values(block: &str) -> Option<(String, Vec<String>)> {
7061 let trimmed = block.trim();
7062 if !is_structural_caption(trimmed) {
7063 return None;
7064 }
7065
7066 let tokens: Vec<&str> = trimmed.split_whitespace().collect();
7067 let first_numeric_idx = tokens.iter().position(|token| is_numberish_token(token))?;
7068 if first_numeric_idx < 3 {
7069 return None;
7070 }
7071
7072 let caption = tokens[..first_numeric_idx].join(" ");
7073 let numeric_tokens: Vec<String> = tokens[first_numeric_idx..]
7074 .iter()
7075 .filter_map(|token| sanitize_numberish_token(token))
7076 .collect();
7077
7078 if numeric_tokens.len() < 4 {
7079 return None;
7080 }
7081
7082 Some((caption, numeric_tokens))
7083}
7084
7085fn parse_pipe_table_block(block: &str) -> Option<Vec<Vec<String>>> {
7086 let lines: Vec<&str> = block
7087 .lines()
7088 .map(str::trim)
7089 .filter(|line| !line.is_empty())
7090 .collect();
7091 if lines.len() < 2 {
7092 return None;
7093 }
7094
7095 let header = split_pipe_row(lines[0])?;
7096 if !is_pipe_separator_row(lines[1], header.len()) {
7097 return None;
7098 }
7099
7100 let mut rows = vec![header];
7101 rows.push(split_pipe_row(lines[1]).unwrap_or_default());
7102 for line in lines.iter().skip(2) {
7103 let row = split_pipe_row(line)?;
7104 rows.push(row);
7105 }
7106 Some(rows)
7107}
7108
/// Splits one Markdown pipe-table line into its trimmed cells.
///
/// The trimmed line must start with one `|` and end with a *different* `|`;
/// otherwise `None` is returned. In particular a line consisting of a single
/// `|` is not a row (slicing it as `[1..len - 1]` would panic), while `||`
/// yields one empty cell.
fn split_pipe_row(line: &str) -> Option<Vec<String>> {
    let inner = line.trim().strip_prefix('|')?.strip_suffix('|')?;

    Some(
        inner
            .split('|')
            .map(|cell| cell.trim().to_string())
            .collect(),
    )
}
7122
7123fn is_pipe_separator_row(line: &str, expected_cols: usize) -> bool {
7124 let Some(cells) = split_pipe_row(line) else {
7125 return false;
7126 };
7127 if cells.len() != expected_cols || expected_cols == 0 {
7128 return false;
7129 }
7130
7131 cells.iter().all(|cell| {
7132 let stripped = cell.trim_matches(':').trim();
7133 !stripped.is_empty() && stripped.chars().all(|ch| ch == '-')
7134 })
7135}
7136
7137fn extract_value_year_pairs_from_cells(cells: &[String]) -> Vec<(String, String)> {
7138 let mut pairs = Vec::new();
7139 for cell in cells {
7140 let tokens: Vec<&str> = cell.split_whitespace().collect();
7141 if tokens.len() != 2 {
7142 continue;
7143 }
7144
7145 if looks_like_year_token(tokens[0]) && is_numberish_token(tokens[1]) {
7146 if let Some(value) = sanitize_numberish_token(tokens[1]) {
7147 pairs.push((tokens[0].to_string(), value));
7148 }
7149 continue;
7150 }
7151
7152 if is_numberish_token(tokens[0]) && looks_like_year_token(tokens[1]) {
7153 if let Some(value) = sanitize_numberish_token(tokens[0]) {
7154 pairs.push((tokens[1].to_string(), value));
7155 }
7156 }
7157 }
7158
7159 pairs.sort_by(|left, right| left.0.cmp(&right.0));
7160 pairs
7161}
7162
7163fn should_drop_artifact_table_block(blocks: &[&str], start: usize) -> bool {
7164 let Some(rows) = parse_pipe_table_block(blocks[start]) else {
7165 return false;
7166 };
7167
7168 let prev = start
7169 .checked_sub(1)
7170 .and_then(|idx| blocks.get(idx))
7171 .map(|block| block.trim())
7172 .unwrap_or("");
7173 let next = blocks
7174 .get(start + 1)
7175 .map(|block| block.trim())
7176 .unwrap_or("");
7177
7178 if rows.len() == 2 && rows.first().is_some_and(|row| row.len() == 1) {
7179 let header = rows[0][0].trim();
7180 if looks_like_url_fragment(header) {
7181 return true;
7182 }
7183 if looks_like_numeric_axis_blob(header) && !previous_block_announces_table(prev) {
7184 return true;
7185 }
7186 }
7187
7188 let stats = pipe_table_stats(&rows);
7189 stats.fill_ratio < 0.5
7190 && stats.long_cell_count == 0
7191 && !is_structural_caption(prev)
7192 && (looks_like_citation_block(next) || is_structural_caption(next))
7193}
7194
/// Returns `true` when the block (trimmed, ASCII case-insensitive) ends with
/// an announcing phrase such as "as follows:" or contains
/// "the following details".
fn previous_block_announces_table(block: &str) -> bool {
    const CLOSING_PHRASES: [&str; 3] = ["as follows:", "following details:", "following detail:"];

    let lowered = block.trim().to_ascii_lowercase();
    CLOSING_PHRASES
        .iter()
        .any(|phrase| lowered.ends_with(phrase))
        || lowered.contains("the following details")
}
7202
/// Returns `true` when the trimmed text mentions `http` or `/status/`, or
/// contains a `/` and no space at all.
fn looks_like_url_fragment(text: &str) -> bool {
    let candidate = text.trim();
    // A string containing either marker is necessarily non-empty.
    let mentions_link = candidate.contains("http") || candidate.contains("/status/");
    let slash_without_spaces = candidate.contains('/') && !candidate.contains(' ');
    mentions_link || slash_without_spaces
}
7208
7209fn looks_like_numeric_axis_blob(text: &str) -> bool {
7210 let numeric_values: Vec<i64> = text
7211 .split_whitespace()
7212 .filter_map(parse_integer_token)
7213 .collect();
7214 numeric_values.len() >= 8
7215 && !detect_axis_progression(&numeric_values).is_empty()
7216 && text.chars().any(char::is_alphabetic)
7217}
7218
/// Returns `true` for a short parenthesised block: the trimmed text starts
/// with `(`, ends with `)` and has at most eight words.
fn looks_like_citation_block(block: &str) -> bool {
    let candidate = block.trim();
    let parenthesised = candidate.starts_with('(') && candidate.ends_with(')');
    parenthesised && candidate.split_whitespace().count() <= 8
}
7223
/// Summary statistics for the body rows of a parsed pipe table.
struct PipeTableStats {
    /// Non-empty body cells divided by `body rows × widest row`; `0.0` when
    /// the table has no body rows.
    fill_ratio: f64,
    /// Number of non-empty body cells with three or more words.
    long_cell_count: usize,
}

/// Computes [`PipeTableStats`] for `rows`, treating the first two rows
/// (header and separator) as non-body rows.
fn pipe_table_stats(rows: &[Vec<String>]) -> PipeTableStats {
    // Column count is taken over all rows and clamped to at least one.
    let widest = rows.iter().map(Vec::len).max().unwrap_or(0).max(1);
    let body_rows = rows.len().saturating_sub(2);

    let (filled, long_cell_count) = rows
        .iter()
        .skip(2)
        .flatten()
        .filter(|cell| !cell.trim().is_empty())
        .fold((0usize, 0usize), |(filled, long), cell| {
            let is_long = cell.split_whitespace().count() >= 3;
            (filled + 1, long + usize::from(is_long))
        });

    let fill_ratio = match body_rows {
        0 => 0.0,
        _ => filled as f64 / (body_rows * widest) as f64,
    };

    PipeTableStats {
        fill_ratio,
        long_cell_count,
    }
}
7257
7258fn extract_chart_labels_and_source(block: &str) -> (Vec<String>, String) {
7259 let trimmed = block.trim();
7260 let lower = trimmed.to_ascii_lowercase();
7261 let source_idx = lower.find("source:");
7262
7263 let label_region = source_idx.map_or(trimmed, |idx| trimmed[..idx].trim());
7264 let source = source_idx
7265 .map(|idx| trimmed[idx..].trim().to_string())
7266 .unwrap_or_default();
7267
7268 let labels = parse_chart_labels(label_region);
7269 (labels, source)
7270}
7271
7272fn parse_chart_labels(text: &str) -> Vec<String> {
7273 let tokens: Vec<&str> = text.split_whitespace().collect();
7274 let mut labels = Vec::new();
7275 let mut i = 0usize;
7276 while i < tokens.len() {
7277 let token = tokens[i].trim_matches(|c: char| c == ',' || c == ';');
7278 if looks_like_year_token(token) {
7279 let mut label = token.to_string();
7280 if let Some(next) = tokens.get(i + 1) {
7281 let next_trimmed = next.trim_matches(|c: char| c == ',' || c == ';');
7282 if next_trimmed.starts_with('(') && next_trimmed.ends_with(')') {
7283 label.push(' ');
7284 label.push_str(next_trimmed);
7285 i += 1;
7286 }
7287 }
7288 labels.push(label);
7289 } else if looks_like_category_label(token) {
7290 labels.push(token.to_string());
7291 }
7292 i += 1;
7293 }
7294 labels
7295}
7296
7297fn derive_chart_series_values(tokens: &[String], expected_count: usize) -> Vec<String> {
7298 if expected_count == 0 {
7299 return Vec::new();
7300 }
7301
7302 if tokens.len() == expected_count {
7303 return tokens.to_vec();
7304 }
7305
7306 let numeric_values: Vec<i64> = tokens
7307 .iter()
7308 .filter_map(|token| parse_integer_token(token))
7309 .collect();
7310 if numeric_values.len() != tokens.len() {
7311 return Vec::new();
7312 }
7313
7314 let axis_series = detect_axis_progression(&numeric_values);
7315 if axis_series.is_empty() {
7316 return Vec::new();
7317 }
7318
7319 let mut remaining = Vec::new();
7320 let mut removable = axis_series;
7321 for token in tokens {
7322 let Some(value) = parse_integer_token(token) else {
7323 continue;
7324 };
7325 if let Some(pos) = removable.iter().position(|candidate| *candidate == value) {
7326 removable.remove(pos);
7327 } else {
7328 remaining.push(token.clone());
7329 }
7330 }
7331
7332 if remaining.len() == expected_count {
7333 remaining
7334 } else {
7335 Vec::new()
7336 }
7337}
7338
/// Finds the longest arithmetic progression that looks like chart axis ticks.
///
/// The values are sorted and de-duplicated. For every pair of neighbouring
/// values the gap is used as a step and the progression is extended from the
/// smaller value for as long as the next term is present. The longest
/// progression is returned if it has at least six terms (the first one found
/// wins ties); otherwise the result is empty.
///
/// All arithmetic is checked: a step or next term that does not fit in `i64`
/// ends that candidate instead of overflowing.
fn detect_axis_progression(values: &[i64]) -> Vec<i64> {
    const MIN_AXIS_TICKS: usize = 6;

    if values.len() < MIN_AXIS_TICKS {
        return Vec::new();
    }

    let mut sorted = values.to_vec();
    sorted.sort_unstable();
    sorted.dedup();
    if sorted.len() < MIN_AXIS_TICKS {
        return Vec::new();
    }

    let mut best: Vec<i64> = Vec::new();
    for window in sorted.windows(2) {
        // A gap wider than `i64::MAX` cannot yield a six-term progression.
        let Some(step) = window[1].checked_sub(window[0]) else {
            continue;
        };
        if step <= 0 {
            continue;
        }

        let mut series = vec![window[0]];
        let mut current = window[0];
        while let Some(next) = current.checked_add(step) {
            if sorted.binary_search(&next).is_err() {
                break;
            }
            series.push(next);
            current = next;
        }

        if series.len() > best.len() {
            best = series;
        }
    }

    if best.len() >= MIN_AXIS_TICKS {
        best
    } else {
        Vec::new()
    }
}
7381
7382fn chart_value_header(caption: &str) -> String {
7383 let trimmed = caption.trim();
7384 let title = strip_structural_caption_prefix(trimmed);
7385
7386 let mut base = title.to_string();
7387 if let Some(idx) = base.rfind(" in ") {
7388 let tail = base[idx + 4..].trim();
7389 if tail.split_whitespace().count() <= 2
7390 && tail.chars().next().is_some_and(char::is_uppercase)
7391 {
7392 base.truncate(idx);
7393 }
7394 }
7395
7396 if let Some(start) = title.rfind('(') {
7397 if title.ends_with(')') {
7398 let unit = title[start + 1..title.len() - 1].trim();
7399 if let Some(idx) = base.rfind('(') {
7400 base.truncate(idx);
7401 }
7402 let normalized_unit = unit.strip_prefix("in ").unwrap_or(unit).trim();
7403 return format!("{} ({})", base.trim(), normalized_unit);
7404 }
7405 }
7406
7407 let trimmed = base.trim();
7408 if trimmed.is_empty() {
7409 "Value".to_string()
7410 } else {
7411 trimmed.to_string()
7412 }
7413}
7414
/// Removes a leading "<Figure|Table|Diagram|Chart> <number>" from a caption.
///
/// The trimmed text is split on single spaces into kind, numbering and title.
/// When the kind matches (ASCII case-insensitively) and the numbering consists
/// only of ASCII digits, `.` and `:`, the trimmed title is returned; in every
/// other case the trimmed input is returned unchanged.
fn strip_structural_caption_prefix(text: &str) -> &str {
    let caption = text.trim();
    let mut pieces = caption.splitn(3, ' ');
    let (Some(kind), Some(numbering), Some(title)) = (pieces.next(), pieces.next(), pieces.next())
    else {
        return caption;
    };

    let is_structural_kind = ["figure", "table", "diagram", "chart"]
        .iter()
        .any(|known| kind.eq_ignore_ascii_case(known));
    let is_numbering = numbering
        .chars()
        .all(|ch| ch.is_ascii_digit() || ch == '.' || ch == ':');

    if is_structural_kind && is_numbering {
        title.trim()
    } else {
        caption
    }
}
7441
/// Heuristic for a running footer such as "Annual Report of the Board 12".
///
/// The trimmed block must be a single line of at least 8 bytes and 2–6 words,
/// end with an all-digit token, and every preceding word must either start
/// with an uppercase letter or be one of the connectors
/// of / and / the / for / in / on.
fn looks_like_footer_banner(block: &str) -> bool {
    const CONNECTORS: [&str; 6] = ["of", "and", "the", "for", "in", "on"];

    let line = block.trim();
    if line.contains('\n') || line.len() < 8 {
        return false;
    }

    let words: Vec<&str> = line.split_whitespace().collect();
    if !(2..=6).contains(&words.len()) {
        return false;
    }

    let Some((page, title_words)) = words.split_last() else {
        return false;
    };

    page.chars().all(|ch| ch.is_ascii_digit())
        && title_words.iter().all(|word| {
            let is_connector = CONNECTORS
                .iter()
                .any(|connector| word.eq_ignore_ascii_case(connector));
            is_connector || word.chars().next().is_some_and(char::is_uppercase)
        })
}
7467
/// Returns `true` when the trimmed block starts with an uppercase character,
/// has at most eight words and contains no colon.
fn looks_like_caption_continuation(block: &str) -> bool {
    let candidate = block.trim();
    match candidate.chars().next() {
        Some(first) if first.is_uppercase() => {
            candidate.split_whitespace().count() <= 8 && !candidate.contains(':')
        }
        _ => false,
    }
}
7475
7476fn collapse_inline_whitespace(text: &str) -> String {
7477 text.split_whitespace().collect::<Vec<_>>().join(" ")
7478}
7479
7480fn drop_isolated_noise_lines(markdown: &str) -> String {
7481 let lines: Vec<&str> = markdown.lines().collect();
7482 let mut kept = Vec::with_capacity(lines.len());
7483
7484 for (idx, line) in lines.iter().enumerate() {
7485 if should_drop_isolated_noise_line(&lines, idx) {
7486 continue;
7487 }
7488 kept.push(*line);
7489 }
7490
7491 let mut result = kept.join("\n");
7492 if markdown.ends_with('\n') {
7493 result.push('\n');
7494 }
7495 result
7496}
7497
7498fn should_drop_isolated_noise_line(lines: &[&str], idx: usize) -> bool {
7499 let trimmed = lines[idx].trim();
7500 if trimmed.len() != 1 {
7501 return false;
7502 }
7503
7504 let ch = trimmed.chars().next().unwrap_or_default();
7505 if !(ch.is_ascii_lowercase() || ch.is_ascii_digit()) {
7506 return false;
7507 }
7508
7509 let prev = previous_nonempty_line(lines, idx);
7510 let next = next_nonempty_line(lines, idx);
7511 let (Some(prev), Some(next)) = (prev, next) else {
7512 return false;
7513 };
7514
7515 is_substantive_markdown_line(prev) && is_substantive_markdown_line(next)
7516}
7517
/// Returns the closest line before `idx` that is not blank after trimming.
///
/// Panics if `idx` is greater than `lines.len()`.
fn previous_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    let (earlier, _) = lines.split_at(idx);
    earlier
        .iter()
        .copied()
        .rfind(|line| !line.trim().is_empty())
}
7525
/// Returns the closest line after `idx` that is not blank after trimming.
///
/// Panics if `idx + 1` is greater than `lines.len()`.
fn next_nonempty_line<'a>(lines: &'a [&'a str], idx: usize) -> Option<&'a str> {
    let (_, later) = lines.split_at(idx + 1);
    later
        .iter()
        .copied()
        .find(|line| !line.trim().is_empty())
}
7532
/// Returns `true` when the trimmed line is a table row (`|`), a list item
/// (`- `), a heading (`#`), or contains at least two words.
fn is_substantive_markdown_line(line: &str) -> bool {
    let content = line.trim();
    let is_structural =
        content.starts_with('|') || content.starts_with("- ") || content.starts_with('#');
    is_structural || content.split_whitespace().nth(1).is_some()
}
7545
7546fn normalize_common_ocr_text(text: &str) -> String {
7547 if text.is_empty() {
7548 return String::new();
7549 }
7550
7551 let mut normalized = text
7552 .replace("ߤL", "μL")
7553 .replace(" oC", "°C")
7554 .replace("37 C", "37°C")
7555 .replace("-20 oC", "-20°C")
7556 .replace("1- 20-μL", "1-20-μL")
7557 .replace("1- 20 μL", "1-20 μL")
7558 .replace("1- 2 0 μL", "1-20 μL")
7559 .replace("1- 2 0 μL", "1-20 μL");
7560
7561 normalized = normalize_degree_spacing(&normalized);
7562 collapse_inline_whitespace(&normalized)
7563}
7564
/// Rewrites "<digit> C" / "<digit> F" as "<digit>°C" / "<digit>°F".
///
/// A space is replaced by `°` when it is preceded by an ASCII digit, followed
/// by `C` or `F`, and that letter is not itself followed by an ASCII letter
/// (so "5 Cats" is left alone). The letter may be the last character of the
/// text: "stored at 4 C" becomes "stored at 4°C".
fn normalize_degree_spacing(text: &str) -> String {
    let chars: Vec<char> = text.chars().collect();
    let mut out = String::with_capacity(text.len());
    let mut i = 0usize;
    while i < chars.len() {
        let ch = chars[i];
        let is_degree_gap = ch == ' '
            && i > 0
            && chars[i - 1].is_ascii_digit()
            && matches!(chars.get(i + 1), Some('C' | 'F'))
            // End of text counts as a word boundary, just like punctuation.
            && !chars
                .get(i + 2)
                .is_some_and(|following| following.is_ascii_alphabetic());
        if is_degree_gap {
            out.push('°');
            out.push(chars[i + 1]);
            i += 2;
            continue;
        }
        out.push(ch);
        i += 1;
    }
    out
}
7588
7589fn normalize_list_text(text: &str) -> String {
7590 let normalized = normalize_common_ocr_text(text);
7591 let trimmed = normalized
7592 .trim_start_matches(|ch: char| is_bullet_like(ch))
7593 .trim();
7594 trimmed.to_string()
7595}
7596
7597fn push_rendered_list_item(out: &mut String, item: &str) {
7598 if starts_with_enumerated_marker(item) {
7599 out.push_str(item);
7600 out.push('\n');
7601 } else {
7602 out.push_str(&format!("- {}\n", item));
7603 }
7604}
7605
7606fn should_merge_list_continuation(previous: &str, current: &str) -> bool {
7607 let trimmed = current.trim();
7608 if trimmed.is_empty()
7609 || looks_like_stray_list_page_number(trimmed)
7610 || is_list_section_heading(trimmed)
7611 || looks_like_numbered_section(trimmed)
7612 || starts_with_enumerated_marker(trimmed)
7613 {
7614 return false;
7615 }
7616
7617 if previous.ends_with('-')
7618 && previous
7619 .chars()
7620 .rev()
7621 .nth(1)
7622 .is_some_and(|c| c.is_alphabetic())
7623 && trimmed.chars().next().is_some_and(char::is_lowercase)
7624 {
7625 return true;
7626 }
7627
7628 trimmed
7629 .chars()
7630 .next()
7631 .is_some_and(|ch| ch.is_ascii_lowercase() || matches!(ch, ',' | ';' | ')' | ']' | '%'))
7632}
7633
7634fn is_pure_bullet_marker(text: &str) -> bool {
7635 let trimmed = text.trim();
7636 !trimmed.is_empty() && trimmed.chars().all(is_bullet_like)
7637}
7638
/// Returns `true` when the trimmed text is one to four ASCII digits.
fn looks_like_stray_list_page_number(text: &str) -> bool {
    let digits = text.trim();
    matches!(digits.len(), 1..=4) && digits.bytes().all(|byte| byte.is_ascii_digit())
}
7643
/// Returns `true` for the bullet glyphs recognised by the list renderer,
/// including the plain hyphen.
fn is_bullet_like(ch: char) -> bool {
    const BULLET_GLYPHS: [char; 14] = [
        '•', '◦', '▪', '▸', '▹', '►', '▻', '●', '○', '■', '□', '◆', '◇', '-',
    ];
    BULLET_GLYPHS.contains(&ch)
}
7662
/// Decides whether `next_block` is the kind of text that follows a
/// stand-alone caption.
///
/// A non-empty block starting with `source:` / `note:` (optionally preceded
/// by `*`, ASCII case-insensitive) always qualifies. Otherwise the caption
/// must have at most 14 words and the next block at most 45 words and contain
/// a `:` or `=`.
fn looks_like_isolated_caption_context(caption: &str, next_block: &str) -> bool {
    const ATTRIBUTION_MARKERS: [&str; 4] = ["source:", "note:", "*source:", "*note:"];

    let following = next_block.trim();
    if following.is_empty() {
        return false;
    }

    let lowered = following.to_ascii_lowercase();
    let is_attribution = ATTRIBUTION_MARKERS
        .iter()
        .any(|marker| lowered.starts_with(marker));
    if is_attribution {
        return true;
    }

    let has_key_value_marker = following.contains(':') || following.contains('=');
    caption.split_whitespace().count() <= 14
        && following.split_whitespace().count() <= 45
        && has_key_value_marker
}
7682
7683fn looks_like_numeric_noise_block(block: &str) -> bool {
7684 let trimmed = block.trim();
7685 !trimmed.is_empty()
7686 && trimmed.split_whitespace().all(|token| {
7687 sanitize_numberish_token(token)
7688 .as_deref()
7689 .is_some_and(|sanitized| sanitized.chars().all(|ch| ch.is_ascii_digit()))
7690 })
7691}
7692
/// Returns `true` when the label starts with an ASCII digit.
fn looks_like_yearish_label(label: &str) -> bool {
    label
        .as_bytes()
        .first()
        .is_some_and(|byte| byte.is_ascii_digit())
}
7696
/// Returns `true` when the token is exactly four ASCII digits (no trimming).
fn looks_like_year_token(token: &str) -> bool {
    matches!(token.as_bytes(), [a, b, c, d] if [a, b, c, d].iter().all(|byte| byte.is_ascii_digit()))
}
7700
/// Returns `true` when the token consists only of ASCII letters, digits,
/// `-`, `/` and `%`, and contains at least one ASCII letter.
fn looks_like_category_label(token: &str) -> bool {
    let mut saw_letter = false;
    for ch in token.chars() {
        if ch.is_ascii_alphabetic() {
            saw_letter = true;
        } else if !(ch.is_ascii_digit() || matches!(ch, '-' | '/' | '%')) {
            return false;
        }
    }
    saw_letter
}
7707
7708fn is_numberish_token(token: &str) -> bool {
7709 sanitize_numberish_token(token).is_some()
7710}
7711
/// Normalizes a number-like token such as `1,234.` or `45%`.
///
/// Surrounding `,`, `;`, `:` and `.` are stripped. The remainder is accepted
/// when, ignoring trailing `%` signs and thousands commas, it consists of at
/// least one ASCII digit and nothing else. The returned string keeps the
/// commas and percent signs. Tokens without any digit, including a bare `%`,
/// yield `None`.
fn sanitize_numberish_token(token: &str) -> Option<String> {
    let core = token.trim_matches(|c: char| matches!(c, ',' | ';' | ':' | '.'));
    if core.is_empty() {
        return None;
    }

    let digits = core.trim_end_matches('%').replace(',', "");
    // `all` is vacuously true for an empty string, so require a digit explicitly.
    let is_number = !digits.is_empty() && digits.chars().all(|ch| ch.is_ascii_digit());

    // `core` is already free of trailing separators, so it is returned as-is.
    is_number.then(|| core.to_string())
}
7725
7726fn parse_integer_token(token: &str) -> Option<i64> {
7727 sanitize_numberish_token(token)?
7728 .replace(',', "")
7729 .parse::<i64>()
7730 .ok()
7731}
7732
/// Returns `true` when the first character after leading whitespace and any
/// run of opening quotes/brackets (`"`, `'`, `(`, `[`) is an uppercase letter.
fn starts_with_uppercase_word(text: &str) -> bool {
    text.trim_start()
        .chars()
        // Skip the openers; the first other character decides the outcome.
        .find(|ch| !matches!(ch, '"' | '\'' | '(' | '['))
        .is_some_and(|ch| ch.is_alphabetic() && ch.is_uppercase())
}
7744
7745fn clean_paragraph_text(text: &str) -> String {
7748 let trimmed = text.trim();
7749 if trimmed.is_empty() {
7750 return String::new();
7751 }
7752 let mut result = String::with_capacity(trimmed.len());
7754 let mut prev_space = false;
7755 for ch in trimmed.chars() {
7756 if ch == ' ' || ch == '\t' {
7757 if !prev_space {
7758 result.push(' ');
7759 prev_space = true;
7760 }
7761 } else {
7762 result.push(ch);
7763 prev_space = false;
7764 }
7765 }
7766 normalize_common_ocr_text(&result)
7767}
7768
7769fn next_mergeable_paragraph_text(element: Option<&ContentElement>) -> Option<String> {
7770 match element {
7771 Some(ContentElement::Paragraph(p)) => {
7772 let text = clean_paragraph_text(&p.base.value());
7773 let trimmed = text.trim();
7774 if trimmed.is_empty()
7775 || should_render_element_as_heading(element.unwrap(), trimmed, None)
7776 {
7777 None
7778 } else {
7779 Some(trimmed.to_string())
7780 }
7781 }
7782 Some(ContentElement::TextBlock(tb)) => {
7783 let text = clean_paragraph_text(&tb.value());
7784 let trimmed = text.trim();
7785 if trimmed.is_empty()
7786 || should_render_element_as_heading(element.unwrap(), trimmed, None)
7787 {
7788 None
7789 } else {
7790 Some(trimmed.to_string())
7791 }
7792 }
7793 Some(ContentElement::TextLine(tl)) => {
7794 let text = clean_paragraph_text(&tl.value());
7795 let trimmed = text.trim();
7796 if trimmed.is_empty()
7797 || should_render_element_as_heading(element.unwrap(), trimmed, None)
7798 {
7799 None
7800 } else {
7801 Some(trimmed.to_string())
7802 }
7803 }
7804 _ => None,
7805 }
7806}
7807
7808fn should_render_paragraph_as_heading(
7809 doc: &PdfDocument,
7810 idx: usize,
7811 text: &str,
7812 next: Option<&ContentElement>,
7813) -> bool {
7814 if looks_like_top_margin_running_header(doc, idx, text) {
7815 return false;
7816 }
7817 if should_render_element_as_heading(&doc.kids[idx], text, next) {
7818 return true;
7819 }
7820
7821 let body_font_size = compute_body_font_size(doc);
7824 if is_too_small_for_heading(&doc.kids, idx, body_font_size) {
7825 return false;
7826 }
7827
7828 if !doc_has_explicit_headings(doc) {
7830 if should_rescue_as_heading(doc, idx, text) {
7831 return true;
7832 }
7833 if should_rescue_allcaps_heading(doc, idx, text) {
7837 return true;
7838 }
7839 if should_rescue_numbered_heading(doc, idx, text) {
7840 return true;
7841 }
7842 return false;
7843 }
7844 if heading_density(doc) < 0.10 {
7847 if should_rescue_allcaps_heading(doc, idx, text) {
7848 return true;
7849 }
7850 if should_rescue_numbered_heading(doc, idx, text) {
7854 return true;
7855 }
7856 if body_font_size > 0.0 {
7861 if let ContentElement::Paragraph(p) = &doc.kids[idx] {
7862 if let Some(fs) = p.base.font_size {
7863 if fs >= 1.15 * body_font_size
7864 && is_heading_rescue_candidate(doc, idx, text)
7865 && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
7866 {
7867 return true;
7868 }
7869 }
7870 }
7871 }
7872 }
7873 false
7874}
7875
7876fn doc_has_explicit_headings(doc: &PdfDocument) -> bool {
7878 doc.kids.iter().any(|e| {
7879 matches!(
7880 e,
7881 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
7882 )
7883 })
7884}
7885
7886fn compute_body_font_size(doc: &PdfDocument) -> f64 {
7891 let mut font_sizes: Vec<f64> = doc
7892 .kids
7893 .iter()
7894 .filter_map(|e| {
7895 if let ContentElement::Paragraph(p) = e {
7896 let word_count = p.base.value().split_whitespace().count();
7897 if word_count > 10 {
7898 p.base.font_size
7899 } else {
7900 None
7901 }
7902 } else {
7903 None
7904 }
7905 })
7906 .collect();
7907 if font_sizes.is_empty() {
7908 return 0.0;
7909 }
7910 font_sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
7911 font_sizes[font_sizes.len() / 2]
7912}
7913
7914fn is_too_small_for_heading(doc_kids: &[ContentElement], idx: usize, body_font_size: f64) -> bool {
7919 if body_font_size <= 0.0 {
7920 return false;
7921 }
7922 if let ContentElement::Paragraph(p) = &doc_kids[idx] {
7923 if let Some(fs) = p.base.font_size {
7924 return fs < 0.95 * body_font_size;
7925 }
7926 }
7927 false
7928}
7929
7930fn heading_density(doc: &PdfDocument) -> f64 {
7932 let total = doc.kids.len();
7933 if total == 0 {
7934 return 0.0;
7935 }
7936 let heading_count = doc
7937 .kids
7938 .iter()
7939 .filter(|e| {
7940 matches!(
7941 e,
7942 ContentElement::Heading(_) | ContentElement::NumberHeading(_)
7943 )
7944 })
7945 .count();
7946 heading_count as f64 / total as f64
7947}
7948
7949fn should_rescue_as_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
7952 is_heading_rescue_candidate(doc, idx, text)
7953 && has_substantive_follow_up(doc, idx, text.split_whitespace().count(), 4)
7954}
7955
7956fn is_heading_rescue_candidate(doc: &PdfDocument, idx: usize, text: &str) -> bool {
7960 let trimmed = text.trim();
7961 if trimmed.is_empty() {
7962 return false;
7963 }
7964
7965 let has_alpha = trimmed.chars().any(char::is_alphabetic);
7966
7967 if !has_alpha || trimmed.ends_with(['.', '!', '?', ';', ',']) {
7969 return false;
7970 }
7971
7972 if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
7974 return false;
7975 }
7976
7977 if trimmed.starts_with('(') && trimmed.ends_with(')') {
7979 return false;
7980 }
7981
7982 if starts_with_caption_prefix(trimmed)
7984 || looks_like_chart_label_heading(&doc.kids[idx], trimmed)
7985 {
7986 return false;
7987 }
7988
7989 let word_count = trimmed.split_whitespace().count();
7991 if word_count > 6 || trimmed.len() > 60 {
7992 return false;
7993 }
7994
7995 if trimmed
7997 .chars()
7998 .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
7999 {
8000 return false;
8001 }
8002
8003 if let Some(first_alpha) = trimmed.chars().find(|c| c.is_alphabetic()) {
8005 if first_alpha.is_lowercase() {
8006 return false;
8007 }
8008 }
8009
8010 true
8011}
8012
8013fn has_substantive_follow_up(
8017 doc: &PdfDocument,
8018 idx: usize,
8019 word_count: usize,
8020 max_lookahead: usize,
8021) -> bool {
8022 for offset in 1..=max_lookahead {
8023 let lookahead_idx = idx + offset;
8024 if lookahead_idx >= doc.kids.len() {
8025 break;
8026 }
8027 let look_elem = &doc.kids[lookahead_idx];
8028 match look_elem {
8029 ContentElement::Paragraph(p) => {
8030 let next_text = p.base.value();
8031 let nw = next_text.split_whitespace().count();
8032 if nw >= word_count * 3 || nw > 15 {
8033 return true;
8034 }
8035 }
8036 ContentElement::TextBlock(tb) => {
8037 let next_text = tb.value();
8038 let nw = next_text.split_whitespace().count();
8039 if nw >= word_count * 3 || nw > 15 {
8040 return true;
8041 }
8042 }
8043 ContentElement::TextLine(tl) => {
8044 let next_text = tl.value();
8045 let nw = next_text.split_whitespace().count();
8046 if nw >= word_count * 3 || nw > 15 {
8047 return true;
8048 }
8049 }
8050 ContentElement::List(_)
8051 | ContentElement::Table(_)
8052 | ContentElement::TableBorder(_)
8053 | ContentElement::Image(_)
8054 | ContentElement::Figure(_) => {
8055 return true;
8056 }
8057 _ => continue,
8058 }
8059 }
8060
8061 false
8062}
8063
8064fn should_rescue_numbered_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8067 let trimmed = text.trim();
8068 if trimmed.is_empty() || trimmed.len() > 100 {
8069 return false;
8070 }
8071
8072 if !looks_like_numbered_section(trimmed) {
8075 return false;
8076 }
8077
8078 if trimmed.ends_with(['!', '?', ';', ',']) {
8082 return false;
8083 }
8084 if trimmed.ends_with('.') && !looks_like_keyword_numbered_section(trimmed) {
8085 return false;
8086 }
8087 if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8089 return false;
8090 }
8091
8092 for offset in 1..=3 {
8094 let lookahead_idx = idx + offset;
8095 if lookahead_idx >= doc.kids.len() {
8096 break;
8097 }
8098 match &doc.kids[lookahead_idx] {
8099 ContentElement::Paragraph(p) => {
8100 let nw = p.base.value().split_whitespace().count();
8101 if nw > 10 {
8102 return true;
8103 }
8104 }
8105 ContentElement::TextBlock(tb) => {
8106 let nw = tb.value().split_whitespace().count();
8107 if nw > 10 {
8108 return true;
8109 }
8110 }
8111 ContentElement::TextLine(tl) => {
8112 let nw = tl.value().split_whitespace().count();
8113 if nw > 10 {
8114 return true;
8115 }
8116 }
8117 ContentElement::List(_)
8118 | ContentElement::Table(_)
8119 | ContentElement::TableBorder(_)
8120 | ContentElement::Image(_)
8121 | ContentElement::Figure(_) => {
8122 return true;
8123 }
8124 _ => continue,
8125 }
8126 }
8127
8128 false
8129}
8130
8131fn looks_like_numbered_section(text: &str) -> bool {
8134 let bytes = text.as_bytes();
8135 if bytes.is_empty() {
8136 return false;
8137 }
8138
8139 let mut idx = 0;
8141 if bytes[0].is_ascii_digit() {
8142 while idx < bytes.len() && bytes[idx].is_ascii_digit() {
8143 idx += 1;
8144 }
8145 if idx >= bytes.len() {
8146 return false;
8147 }
8148 while idx < bytes.len() && bytes[idx] == b'.' {
8150 idx += 1;
8151 let start = idx;
8152 while idx < bytes.len() && bytes[idx].is_ascii_digit() {
8153 idx += 1;
8154 }
8155 if idx == start {
8156 break;
8158 }
8159 }
8160 if idx >= bytes.len() {
8162 return false;
8163 }
8164 if bytes[idx] == b' ' || bytes[idx] == b'\t' {
8166 idx += 1;
8167 if idx < bytes.len() && bytes[idx] == b'-' {
8169 idx += 1;
8170 if idx < bytes.len() && bytes[idx] == b' ' {
8171 idx += 1;
8172 }
8173 }
8174 } else if bytes[idx] == b'-' {
8175 idx += 1;
8176 if idx < bytes.len() && bytes[idx] == b' ' {
8177 idx += 1;
8178 }
8179 } else {
8180 return false;
8181 }
8182 let rest = &text[idx..].trim();
8184 if rest.is_empty() {
8185 return false;
8186 }
8187 if let Some(c) = rest.chars().find(|c| c.is_alphabetic()) {
8189 return c.is_uppercase();
8190 }
8191 return false;
8192 }
8193
8194 if looks_like_keyword_numbered_section(text) {
8196 return true;
8197 }
8198
8199 false
8200}
8201
/// Keywords that introduce a numbered section when followed by a number,
/// e.g. `Chapter 3` or `Part IV`. Matched ASCII case-insensitively.
const SECTION_KEYWORDS: &[&str] = &[
    "activity",
    "appendix",
    "case",
    "chapter",
    "exercise",
    "experiment",
    "lab",
    "lesson",
    "module",
    "part",
    "phase",
    "problem",
    "question",
    "section",
    "stage",
    "step",
    "task",
    "topic",
    "unit",
];

/// Recognizes `<keyword> <number>` section labels such as `Chapter 3`,
/// `Step #2` or `Part IV: Results`.
///
/// The first space-delimited word must be one of [`SECTION_KEYWORDS`]. After
/// an optional `#`, the following text must start with an ASCII digit or with
/// a run of the Roman-numeral letters `I`, `V`, `X`, `L` that is not
/// immediately continued by another letter. The last condition keeps ordinary
/// words such as `Law` in "Case Law" or `Vectors` in "Unit Vectors" from
/// being read as numerals.
fn looks_like_keyword_numbered_section(text: &str) -> bool {
    let trimmed = text.trim();
    let Some((keyword, rest)) = trimmed.split_once(' ') else {
        return false;
    };
    if !SECTION_KEYWORDS
        .iter()
        .any(|k| keyword.eq_ignore_ascii_case(k))
    {
        return false;
    }

    let rest = rest.trim_start();
    let designator = rest.strip_prefix('#').unwrap_or(rest);

    let mut chars = designator.chars();
    match chars.next() {
        Some(c) if c.is_ascii_digit() => true,
        Some('I' | 'V' | 'X' | 'L') => {
            // Skip the rest of the numeral; the first character after it must
            // not be a letter, otherwise this is a word, not a number.
            !chars
                .find(|c| !matches!(c, 'I' | 'V' | 'X' | 'L'))
                .is_some_and(char::is_alphabetic)
        }
        _ => false,
    }
}
8253
8254fn should_rescue_allcaps_heading(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8257 let trimmed = text.trim();
8258 if trimmed.is_empty() {
8259 return false;
8260 }
8261
8262 let word_count = trimmed.split_whitespace().count();
8263
8264 if word_count > 8 || trimmed.len() > 80 {
8266 return false;
8267 }
8268
8269 let alpha_chars: Vec<char> = trimmed.chars().filter(|c| c.is_alphabetic()).collect();
8271 if alpha_chars.len() < 2 || !alpha_chars.iter().all(|c| c.is_uppercase()) {
8272 return false;
8273 }
8274
8275 if trimmed.ends_with(['.', ';', ',']) {
8277 return false;
8278 }
8279
8280 if should_demote_math_heading(trimmed) || should_demote_percentage_heading(trimmed) {
8282 return false;
8283 }
8284
8285 if starts_with_caption_prefix(trimmed) {
8287 return false;
8288 }
8289
8290 if trimmed
8292 .chars()
8293 .all(|c| c.is_ascii_digit() || c == '.' || c == ' ')
8294 {
8295 return false;
8296 }
8297
8298 for offset in 1..=4 {
8301 let lookahead_idx = idx + offset;
8302 if lookahead_idx >= doc.kids.len() {
8303 break;
8304 }
8305 let look_elem = &doc.kids[lookahead_idx];
8306 match look_elem {
8307 ContentElement::Paragraph(p) => {
8308 let nw = p.base.value().split_whitespace().count();
8309 if nw > 6 {
8310 return true;
8311 }
8312 }
8313 ContentElement::TextBlock(tb) => {
8314 let nw = tb.value().split_whitespace().count();
8315 if nw > 6 {
8316 return true;
8317 }
8318 }
8319 ContentElement::TextLine(tl) => {
8320 let nw = tl.value().split_whitespace().count();
8321 if nw > 6 {
8322 return true;
8323 }
8324 }
8325 ContentElement::List(_)
8326 | ContentElement::Table(_)
8327 | ContentElement::TableBorder(_)
8328 | ContentElement::Image(_)
8329 | ContentElement::Figure(_) => {
8330 return true;
8331 }
8332 _ => continue,
8333 }
8334 }
8335
8336 false
8337}
8338
8339fn should_render_element_as_heading(
8340 element: &ContentElement,
8341 text: &str,
8342 next: Option<&ContentElement>,
8343) -> bool {
8344 let trimmed = text.trim();
8345 if trimmed.is_empty() {
8346 return false;
8347 }
8348
8349 let lower = trimmed.to_ascii_lowercase();
8350 if matches!(lower.as_str(), "contents" | "table of contents")
8351 && trimmed.starts_with(|c: char| c.is_uppercase())
8352 {
8353 return true;
8354 }
8355
8356 let word_count = trimmed.split_whitespace().count();
8357 let has_alpha = trimmed.chars().any(char::is_alphabetic);
8358 let title_like = has_alpha
8359 && word_count <= 4
8360 && trimmed.len() <= 40
8361 && !trimmed.ends_with(['.', '!', '?', ';', ':']);
8362
8363 let is_attribution = {
8367 let lower = trimmed.to_ascii_lowercase();
8368 lower.starts_with("source:")
8369 || lower.starts_with("credit:")
8370 || lower.starts_with("photo by ")
8371 || lower.starts_with("photo credit")
8372 || lower.starts_with("image by ")
8373 || lower.starts_with("image credit")
8374 };
8375
8376 title_like
8377 && matches!(next, Some(ContentElement::List(_)))
8378 && !looks_like_chart_label_heading(element, trimmed)
8379 && !is_attribution
8380}
8381
8382fn looks_like_top_margin_running_header(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8383 let trimmed = text.trim();
8384 if trimmed.is_empty() || trimmed.split_whitespace().count() > 6 {
8385 return false;
8386 }
8387
8388 let element = &doc.kids[idx];
8389 let bbox = element.bbox();
8390 if bbox.height() > 24.0 {
8391 return false;
8392 }
8393
8394 let Some(page) = element.page_number() else {
8395 return false;
8396 };
8397
8398 let mut page_tops = std::collections::HashMap::<u32, f64>::new();
8400 for candidate in &doc.kids {
8401 if let Some(p) = candidate.page_number() {
8402 let top = page_tops.entry(p).or_insert(f64::MIN);
8403 *top = top.max(candidate.bbox().top_y);
8404 }
8405 }
8406
8407 let page_top = page_tops.get(&page).copied().unwrap_or(0.0);
8408 if bbox.top_y < page_top - 24.0 {
8409 return false;
8410 }
8411
8412 let trimmed_lower = trimmed.to_lowercase();
8416 for other_elem in &doc.kids {
8417 let Some(other_page) = other_elem.page_number() else {
8418 continue;
8419 };
8420 if other_page == page {
8421 continue;
8422 }
8423 let other_bbox = other_elem.bbox();
8424 if other_bbox.height() > 24.0 {
8425 continue;
8426 }
8427 let other_top = page_tops.get(&other_page).copied().unwrap_or(0.0);
8428 if other_bbox.top_y < other_top - 24.0 {
8429 continue;
8430 }
8431 let other_text = match other_elem {
8432 ContentElement::Paragraph(p) => p.base.value(),
8433 ContentElement::TextBlock(tb) => tb.value(),
8434 ContentElement::TextLine(tl) => tl.value(),
8435 ContentElement::Heading(h) => h.base.base.value(),
8436 _ => continue,
8437 };
8438 if other_text.trim().to_lowercase() == trimmed_lower {
8439 return true;
8440 }
8441 }
8442
8443 false
8444}
8445
8446fn looks_like_chart_label_heading(element: &ContentElement, text: &str) -> bool {
8447 let trimmed = text.trim();
8448 let upper_words = trimmed
8449 .split_whitespace()
8450 .filter(|word| word.chars().any(char::is_alphabetic))
8451 .all(|word| {
8452 word.chars()
8453 .filter(|ch| ch.is_alphabetic())
8454 .all(|ch| ch.is_uppercase())
8455 });
8456
8457 (trimmed.contains('%') || upper_words) && element.bbox().height() <= 40.0
8458}
8459
8460fn should_demote_heading_to_paragraph(text: &str, next: &str) -> bool {
8461 let next_trimmed = next.trim();
8462 if !next_trimmed.chars().next().is_some_and(char::is_lowercase) {
8463 return false;
8464 }
8465
8466 let normalized = normalize_heading_text(text);
8467 if matches!(
8468 normalized.as_str(),
8469 "contents" | "tableofcontents" | "introduction" | "conclusion"
8470 ) {
8471 return false;
8472 }
8473
8474 let words: Vec<&str> = text.split_whitespace().collect();
8475 if words.len() < 3 {
8476 return false;
8477 }
8478
8479 words
8480 .last()
8481 .is_some_and(|word| is_sentence_fragment_tail(word))
8482}
8483
/// Returns `true` when `word`, stripped of surrounding non-alphanumeric
/// characters and compared ASCII case-insensitively, is an article,
/// preposition or conjunction that normally cannot end a heading.
fn is_sentence_fragment_tail(word: &str) -> bool {
    const TAIL_WORDS: [&str; 17] = [
        "a", "an", "and", "as", "at", "by", "for", "from", "in", "into", "of", "on", "or", "that",
        "the", "to", "with",
    ];

    let core = word.trim_matches(|c: char| !c.is_alphanumeric());
    TAIL_WORDS
        .iter()
        .any(|tail| core.eq_ignore_ascii_case(tail))
}
8507
/// Returns `true` when `text` looks like a short label introducing a list:
/// it ends with `:`, is at most 80 bytes and 8 words, contains a letter, and
/// does not start with an ASCII digit or a bullet/dash glyph.
fn is_list_section_heading(text: &str) -> bool {
    const BULLET_GLYPHS: &str = "•‣◦●○◆◇▪▫–—-";

    let trimmed = text.trim();
    let Some(first) = trimmed.chars().next() else {
        return false;
    };
    if first.is_ascii_digit() || BULLET_GLYPHS.contains(first) {
        return false;
    }

    trimmed.ends_with(':')
        && trimmed.len() <= 80
        && trimmed.split_whitespace().count() <= 8
        && trimmed.chars().any(char::is_alphabetic)
}
8517
8518fn should_merge_paragraph_text(prev: &str, next: &str) -> bool {
8519 let next_trimmed = next.trim();
8520 if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
8521 return false;
8522 }
8523
8524 if starts_with_enumerated_marker(next_trimmed) {
8525 return false;
8526 }
8527
8528 if prev.ends_with('-')
8529 && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
8530 && next_trimmed.chars().next().is_some_and(char::is_lowercase)
8531 {
8532 return true;
8533 }
8534
8535 if next_trimmed.chars().next().is_some_and(char::is_lowercase) {
8536 return true;
8537 }
8538
8539 let lower = next_trimmed.to_ascii_lowercase();
8540 if lower.starts_with("http://")
8541 || lower.starts_with("https://")
8542 || lower.starts_with("arxiv")
8543 || lower.starts_with("doi:")
8544 {
8545 return true;
8546 }
8547
8548 if matches!(
8549 next_trimmed.split_whitespace().next(),
8550 Some("In" | "Proceedings" | "Advances" | "Learning")
8551 ) {
8552 return true;
8553 }
8554
8555 !prev.ends_with(['.', '!', '?', ':'])
8556}
8557
8558fn should_merge_adjacent_semantic_paragraphs(prev: &str, next: &str) -> bool {
8559 let next_trimmed = next.trim();
8560 if next_trimmed.is_empty() {
8561 return false;
8562 }
8563
8564 if starts_with_enumerated_marker(next_trimmed) {
8565 return false;
8566 }
8567
8568 if prev.ends_with('-')
8569 && prev.chars().rev().nth(1).is_some_and(|c| c.is_alphabetic())
8570 && next_trimmed.chars().next().is_some_and(char::is_lowercase)
8571 {
8572 return true;
8573 }
8574
8575 next_trimmed.chars().next().is_some_and(char::is_lowercase)
8576}
8577
/// Returns `true` when the first whitespace-delimited token of `text` is an
/// enumeration marker such as `1.`, `(a)`, `B:` or `iv)`.
///
/// Leading `(`/`[` are ignored. The token must end with `.`, `)` or `:`, and
/// the part before those closers must be all ASCII digits, a single ASCII
/// letter, or up to eight Roman-numeral letters (`ivxlcdm`, any ASCII case).
fn starts_with_enumerated_marker(text: &str) -> bool {
    let Some(token) = text.split_whitespace().next() else {
        return false;
    };
    let token = token.trim_start_matches(['(', '[']);
    let marker = token.trim_end_matches(['.', ')', ':']);

    // Nothing trimmed means the token has no closer; an empty marker means
    // the token was made of closers only.
    if marker.len() == token.len() || marker.is_empty() {
        return false;
    }

    let numeric = marker.bytes().all(|b| b.is_ascii_digit());
    let single_letter = marker.len() == 1 && marker.as_bytes()[0].is_ascii_alphabetic();
    let roman = marker.len() <= 8
        && marker
            .chars()
            .all(|c| "ivxlcdm".contains(c.to_ascii_lowercase()));

    numeric || single_letter || roman
}
8603
8604fn should_skip_leading_figure_carryover(doc: &PdfDocument, idx: usize, text: &str) -> bool {
8605 let trimmed = text.trim();
8606 if !trimmed.starts_with("Figure ") || trimmed.split_whitespace().count() < 4 {
8607 return false;
8608 }
8609
8610 let element = &doc.kids[idx];
8611 let Some(page) = element.page_number() else {
8612 return false;
8613 };
8614
8615 let mut page_top = f64::MIN;
8616 for candidate in &doc.kids {
8617 if candidate.page_number() == Some(page)
8618 && matches!(
8619 candidate,
8620 ContentElement::Paragraph(_)
8621 | ContentElement::TextBlock(_)
8622 | ContentElement::TextLine(_)
8623 | ContentElement::Heading(_)
8624 | ContentElement::NumberHeading(_)
8625 | ContentElement::Caption(_)
8626 )
8627 {
8628 page_top = page_top.max(candidate.bbox().top_y);
8629 }
8630 }
8631 if !page_top.is_finite() || element.bbox().top_y < page_top - 72.0 {
8632 return false;
8633 }
8634
8635 for prior_idx in 0..idx {
8636 let prior = &doc.kids[prior_idx];
8637 let prior_text = extract_element_text(prior);
8638 let prior_trimmed = prior_text.trim();
8639 if prior_trimmed.is_empty()
8640 || is_standalone_page_number(prior_trimmed)
8641 || looks_like_footer_banner(prior_trimmed)
8642 {
8643 continue;
8644 }
8645 match prior {
8646 ContentElement::Paragraph(_)
8647 | ContentElement::TextBlock(_)
8648 | ContentElement::TextLine(_) => {
8649 if !starts_with_caption_prefix(prior_trimmed)
8650 && !looks_like_top_margin_running_header(doc, prior_idx, prior_trimmed)
8651 {
8652 return false;
8653 }
8654 }
8655 ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
8656 if !should_skip_heading_text(prior_trimmed) {
8657 return false;
8658 }
8659 }
8660 _ => return false,
8661 }
8662 }
8663
8664 for lookahead_idx in idx + 1..doc.kids.len().min(idx + 8) {
8665 let next = &doc.kids[lookahead_idx];
8666 if next.page_number() != Some(page) {
8667 break;
8668 }
8669 let next_text = extract_element_text(next);
8670 let next_trimmed = next_text.trim();
8671 if next_trimmed.is_empty() || is_standalone_page_number(next_trimmed) {
8672 continue;
8673 }
8674
8675 let is_numbered_heading = match next {
8676 ContentElement::Heading(_) | ContentElement::NumberHeading(_) => {
8677 looks_like_numbered_section(next_trimmed)
8678 || looks_like_keyword_numbered_section(next_trimmed)
8679 }
8680 ContentElement::Paragraph(_)
8681 | ContentElement::TextBlock(_)
8682 | ContentElement::TextLine(_) => {
8683 should_render_paragraph_as_heading(
8684 doc,
8685 lookahead_idx,
8686 next_trimmed,
8687 doc.kids.get(lookahead_idx + 1),
8688 ) && (looks_like_numbered_section(next_trimmed)
8689 || looks_like_keyword_numbered_section(next_trimmed))
8690 }
8691 _ => false,
8692 };
8693
8694 if is_numbered_heading {
8695 return true;
8696 }
8697
8698 if !starts_with_caption_prefix(next_trimmed) && next_trimmed.split_whitespace().count() >= 5
8699 {
8700 return false;
8701 }
8702 }
8703
8704 false
8705}
8706
/// Appends `next` (trimmed) to `target`, repairing a hyphenated word break.
///
/// - A blank `next` leaves `target` untouched (no trailing space is added).
/// - An empty `target` simply becomes the trimmed `next` (no leading space).
/// - When `target` ends with a letter followed by `-` and `next` starts with
///   a lowercase character, the hyphen is dropped and the halves are joined.
/// - Otherwise the texts are joined with exactly one space, unless `target`
///   already ends with one.
fn merge_paragraph_text(target: &mut String, next: &str) {
    let next_trimmed = next.trim();
    if next_trimmed.is_empty() {
        return;
    }
    if target.is_empty() {
        target.push_str(next_trimmed);
        return;
    }

    let mut tail = target.chars().rev();
    let hyphenated_break = tail.next() == Some('-')
        && tail.next().is_some_and(char::is_alphabetic)
        && next_trimmed.chars().next().is_some_and(char::is_lowercase);

    if hyphenated_break {
        target.pop();
    } else if !target.ends_with(' ') {
        target.push(' ');
    }
    target.push_str(next_trimmed);
}
8726
/// Returns `true` when `text`, trimmed, is one to four ASCII digits.
fn is_standalone_page_number(text: &str) -> bool {
    let digits = text.trim().as_bytes();
    (1..=4).contains(&digits.len()) && digits.iter().all(u8::is_ascii_digit)
}
8731
8732fn looks_like_margin_page_number(doc: &PdfDocument, element: &ContentElement, text: &str) -> bool {
8733 if !is_standalone_page_number(text) {
8734 return false;
8735 }
8736
8737 let bbox = element.bbox();
8738 if bbox.height() > 24.0 {
8739 return false;
8740 }
8741
8742 let Some(page) = element.page_number() else {
8743 return false;
8744 };
8745
8746 let mut page_top = f64::MIN;
8747 let mut page_bottom = f64::MAX;
8748 for candidate in &doc.kids {
8749 if candidate.page_number() == Some(page) {
8750 let candidate_bbox = candidate.bbox();
8751 page_top = page_top.max(candidate_bbox.top_y);
8752 page_bottom = page_bottom.min(candidate_bbox.bottom_y);
8753 }
8754 }
8755
8756 if !page_top.is_finite() || !page_bottom.is_finite() {
8757 return false;
8758 }
8759
8760 bbox.top_y >= page_top - 24.0 || bbox.bottom_y <= page_bottom + 24.0
8761}
8762
8763fn looks_like_bottom_margin_heading(doc: &PdfDocument, idx: usize) -> bool {
8768 let element = &doc.kids[idx];
8769 let bbox = element.bbox();
8770 if bbox.height() > 30.0 {
8771 return false;
8772 }
8773
8774 let Some(page) = element.page_number() else {
8775 return false;
8776 };
8777
8778 let mut page_bottom = f64::MAX;
8779 for candidate in &doc.kids {
8780 if candidate.page_number() == Some(page) {
8781 page_bottom = page_bottom.min(candidate.bbox().bottom_y);
8782 }
8783 }
8784
8785 if !page_bottom.is_finite() {
8786 return false;
8787 }
8788
8789 bbox.bottom_y <= page_bottom + 24.0
8791}
8792
8793fn should_demote_period_heading(text: &str) -> bool {
8797 let trimmed = text.trim();
8798 if !trimmed.ends_with('.') {
8799 return false;
8800 }
8801 if looks_like_numbered_section(trimmed) || looks_like_keyword_numbered_section(trimmed) {
8804 return false;
8805 }
8806 let without_dot = trimmed.trim_end_matches('.');
8810 let word_count = without_dot.split_whitespace().count();
8811 if word_count <= 2 {
8814 return true;
8815 }
8816 false
8817}
8818
/// Returns `true` when the last non-whitespace character of `text` is a comma.
fn should_demote_comma_heading(text: &str) -> bool {
    matches!(text.trim().chars().next_back(), Some(','))
}
8824
/// Returns `true` when `text` contains any of a fixed set of characters
/// (fractions, math operators, and the letters `þ`/`ð`) that mark it as
/// formula content rather than a heading.
fn should_demote_math_heading(text: &str) -> bool {
    const MATH_GLYPHS: [char; 15] = [
        '¼', '½', '¾', '≪', '≫', 'þ', 'ð', '∑', '∫', '∂', '∏', '√', '∞', '≈', '÷',
    ];

    text.chars().any(|c| MATH_GLYPHS.contains(&c))
}
8848
/// Returns `true` when `text` contains a `%` character anywhere.
fn should_demote_percentage_heading(text: &str) -> bool {
    text.chars().any(|ch| ch == '%')
}
8854
/// Returns `true` when `text`, trimmed, starts like a bibliography year:
/// four ASCII digits and a `.`, followed either by nothing (`"2019."`) or by
/// a space (`"2019. Smith ..."`).
///
/// The bare `"YYYY."` form is accepted. The original code spelled out that
/// case (`len == 5`) but an earlier minimum-length guard of 6 made it
/// unreachable.
fn should_demote_bibliography_heading(text: &str) -> bool {
    let bytes = text.trim().as_bytes();
    if bytes.len() < 5 {
        return false;
    }

    let (year, rest) = bytes.split_at(4);
    year.iter().all(u8::is_ascii_digit)
        && rest[0] == b'.'
        && matches!(rest.get(1), None | Some(b' '))
}
8867
/// Removes a trailing page number from a table-of-contents style line.
///
/// The text is trimmed. If the part after its last space is one to four ASCII
/// digits and the part before it has at least three words, the (trimmed) part
/// before is returned; otherwise the trimmed text is returned unchanged.
fn strip_trailing_page_number(text: &str) -> &str {
    let trimmed = text.trim();
    match trimmed.rsplit_once(' ') {
        Some((head, tail))
            if (1..=4).contains(&tail.len())
                && tail.bytes().all(|b| b.is_ascii_digit())
                && head.split_whitespace().count() >= 3 =>
        {
            head.trim()
        }
        _ => trimmed,
    }
}
8886
/// Finds the byte offset at which a second section number starts inside a
/// heading that was merged with the next one, e.g. the `2.1` in
/// `"Methods 2.1 Data"` or the `A.1` in `"Appendix A.1 Proofs"`.
///
/// Only offsets from 3 onwards that directly follow a space are considered.
/// A match is either a run of ASCII digits immediately followed by `.` and a
/// digit, or an ASCII uppercase letter immediately followed by `.` and a
/// digit. Returns `None` when no such position exists.
///
/// The dot must directly follow the digit run. Accepting the first dot found
/// anywhere later in the text (as before) split `"Revenue 2019 v1.2"` at
/// `2019`.
fn find_merged_subsection_split(text: &str) -> Option<usize> {
    let bytes = text.as_bytes();
    (3..bytes.len()).find(|&i| {
        if bytes[i - 1] != b' ' {
            return false;
        }
        let tail = &bytes[i..];
        match tail[0] {
            b'0'..=b'9' => {
                let digits = tail.iter().take_while(|b| b.is_ascii_digit()).count();
                tail.get(digits) == Some(&b'.')
                    && tail.get(digits + 1).is_some_and(u8::is_ascii_digit)
            }
            b'A'..=b'Z' => {
                tail.get(1) == Some(&b'.') && tail.get(2).is_some_and(u8::is_ascii_digit)
            }
            _ => false,
        }
    })
}
8922
8923fn should_skip_heading_text(text: &str) -> bool {
8924 let trimmed = text.trim();
8925 if trimmed.is_empty() || is_standalone_page_number(trimmed) {
8926 return true;
8927 }
8928
8929 let lower = trimmed.to_ascii_lowercase();
8930 if (lower.starts_with("chapter ") || lower.chars().next().is_some_and(|c| c.is_ascii_digit()))
8931 && trimmed.contains('|')
8932 {
8933 return true;
8934 }
8935
8936 let alpha_count = trimmed.chars().filter(|c| c.is_alphabetic()).count();
8937 let alnum_count = trimmed.chars().filter(|c| c.is_alphanumeric()).count();
8938 alpha_count == 0 || (alnum_count > 0 && alpha_count * 3 < alnum_count && !trimmed.contains(':'))
8939}
8940
/// Re-joins words that were split by a stray space, e.g. `qual ity` ->
/// `quality`.
///
/// Words are processed left to right; each word is glued onto the previous
/// (possibly already repaired) word when, after stripping surrounding
/// non-letters, both parts are purely alphabetic, at least one part has at
/// most 4 bytes, together they have at least 6 bytes, the right part does not
/// start with an uppercase letter, and neither part is a common stopword.
///
/// Text with fewer than two words is returned unchanged, whitespace included;
/// otherwise the words are joined with single spaces.
///
/// Runs in a single pass without per-iteration clones or `Vec::remove`.
fn repair_fragmented_words(text: &str) -> String {
    const STOPWORDS: &[&str] = &[
        "a", "an", "and", "are", "as", "at", "be", "by", "can", "for", "from", "if", "in", "into",
        "is", "it", "may", "must", "not", "of", "on", "or", "per", "that", "the", "to", "with",
    ];

    /// Whether `right` is the detached tail of the word `left`.
    fn is_fragment_pair(left: &str, right: &str) -> bool {
        let left_core = left.trim_matches(|c: char| !c.is_alphabetic());
        let right_core = right.trim_matches(|c: char| !c.is_alphabetic());
        let is_stopword =
            |core: &str| STOPWORDS.iter().any(|stop| core.eq_ignore_ascii_case(stop));

        !left_core.is_empty()
            && !right_core.is_empty()
            && left_core.chars().all(char::is_alphabetic)
            && right_core.chars().all(char::is_alphabetic)
            && (left_core.len() <= 4 || right_core.len() <= 4)
            && left_core.len() + right_core.len() >= 6
            && !right_core.chars().next().is_some_and(char::is_uppercase)
            && !is_stopword(left_core)
            && !is_stopword(right_core)
    }

    let mut repaired: Vec<String> = Vec::new();
    let mut total_words = 0usize;
    for word in text.split_whitespace() {
        total_words += 1;
        if let Some(previous) = repaired.last_mut() {
            if is_fragment_pair(previous.as_str(), word) {
                previous.push_str(word);
                continue;
            }
        }
        repaired.push(word.to_string());
    }

    if total_words < 2 {
        return text.to_string();
    }
    repaired.join(" ")
}
8981
8982fn list_item_text_from_contents(contents: &[ContentElement]) -> String {
8984 let mut text = String::new();
8985 for elem in contents {
8986 let part = match elem {
8987 ContentElement::Paragraph(p) => p.base.value(),
8988 ContentElement::TextBlock(tb) => tb.value(),
8989 ContentElement::TextLine(tl) => tl.value(),
8990 ContentElement::TextChunk(tc) => tc.value.clone(),
8991 _ => String::new(),
8992 };
8993 if !text.is_empty() && !part.is_empty() {
8994 text.push(' ');
8995 }
8996 text.push_str(&part);
8997 }
8998 text
8999}
9000
/// Returns `true` when `row` has at least one blank cell strictly between two
/// non-blank cells. Leading and trailing blank cells do not count as gaps.
fn has_internal_header_gap(row: &[String]) -> bool {
    let is_filled = |cell: &String| !cell.trim().is_empty();

    let Some(first_filled) = row.iter().position(is_filled) else {
        return false;
    };
    let last_filled = row.iter().rposition(is_filled).unwrap_or(first_filled);

    row[first_filled..=last_filled]
        .iter()
        .any(|cell| cell.trim().is_empty())
}
9018
/// Fills the blank cells of a grouped header row from its nearest labelled
/// cell, guided by the sub-header row beneath it.
///
/// Non-blank cells of `parent` are anchors. For every column where `parent`
/// is blank and `child` is non-blank, the result takes the trimmed text of
/// the anchor closest by column index; on a tie the anchor further to the
/// right wins. If `parent` has no anchors it is returned unchanged.
///
/// The result always has `parent.len()` cells. Columns that exist only in
/// `child` are ignored rather than indexed, which used to panic when the
/// child row was longer than the parent row.
fn expand_grouped_header_row(parent: &[String], child: &[String]) -> Vec<String> {
    let anchor_cols: Vec<usize> = parent
        .iter()
        .enumerate()
        .filter_map(|(idx, cell)| (!cell.trim().is_empty()).then_some(idx))
        .collect();

    let mut expanded = parent.to_vec();
    if anchor_cols.is_empty() {
        return expanded;
    }

    // `zip` stops at the shorter row, so `col_idx` is always valid for both.
    for (col_idx, (slot, child_cell)) in expanded.iter_mut().zip(child).enumerate() {
        if !slot.trim().is_empty() || child_cell.trim().is_empty() {
            continue;
        }

        // Smallest distance first; `Reverse` makes the larger index win ties.
        let nearest = anchor_cols
            .iter()
            .copied()
            .min_by_key(|&anchor| (anchor.abs_diff(col_idx), std::cmp::Reverse(anchor)));
        if let Some(anchor) = nearest {
            *slot = parent[anchor].trim().to_string();
        }
    }

    expanded
}
9049
9050fn preserve_grouped_header_rows(rows: &mut [Vec<String>]) -> bool {
9051 if rows.len() < 2 || rows[0].is_empty() || rows[1].is_empty() {
9052 return false;
9053 }
9054 if rows[0].first().is_none_or(|cell| cell.trim().is_empty()) {
9055 return false;
9056 }
9057 if rows[1].first().is_some_and(|cell| !cell.trim().is_empty()) {
9058 return false;
9059 }
9060
9061 let first_filled = rows[0]
9062 .iter()
9063 .filter(|cell| !cell.trim().is_empty())
9064 .count();
9065 let second_filled = rows[1]
9066 .iter()
9067 .filter(|cell| !cell.trim().is_empty())
9068 .count();
9069 if first_filled < 2 || second_filled <= first_filled || !has_internal_header_gap(&rows[0]) {
9070 return false;
9071 }
9072
9073 rows[0] = expand_grouped_header_row(&rows[0], &rows[1]);
9074 true
9075}
9076
9077fn merge_continuation_rows(rows: &mut Vec<Vec<String>>) {
9088 if rows.len() < 2 {
9089 return;
9090 }
9091 if preserve_grouped_header_rows(rows) {
9092 return;
9093 }
9094 if rows[0].first().is_none_or(|c| c.trim().is_empty()) {
9096 return;
9097 }
9098
9099 let mut merge_count = 0usize;
9100 for (i, row_i) in rows.iter().enumerate().skip(1) {
9101 let first_empty = row_i.first().is_none_or(|c| c.trim().is_empty());
9102 if !first_empty {
9103 break; }
9105 let all_short = row_i
9107 .iter()
9108 .all(|c| c.trim().is_empty() || c.trim().len() <= 30);
9109 if !all_short {
9110 break;
9111 }
9112 merge_count = i;
9113 }
9114
9115 if merge_count == 0 {
9118 return;
9119 }
9120
9121 for i in 1..=merge_count {
9123 let (head, tail) = rows.split_at_mut(i);
9124 let ncols = head[0].len().min(tail[0].len());
9125 for (target, src) in head[0]
9126 .iter_mut()
9127 .take(ncols)
9128 .zip(tail[0].iter().take(ncols))
9129 {
9130 let fragment = src.trim().to_string();
9131 if !fragment.is_empty() {
9132 let target_str = target.trim().to_string();
9133 *target = if target_str.is_empty() {
9134 fragment
9135 } else {
9136 format!("{} {}", target_str, fragment)
9137 };
9138 }
9139 }
9140 }
9141
9142 rows.drain(1..=merge_count);
9144}
9145
9146fn trim_leading_table_carryover_rows(rows: &mut Vec<Vec<String>>) {
9147 while first_body_row_looks_like_carryover(rows) {
9148 rows.remove(1);
9149 }
9150}
9151
9152fn first_body_row_looks_like_carryover(rows: &[Vec<String>]) -> bool {
9153 if rows.len() < 3 {
9154 return false;
9155 }
9156
9157 let key_col_count = infer_leading_key_column_count(&rows[1..]);
9158 if key_col_count == 0 {
9159 return false;
9160 }
9161
9162 let candidate = &rows[1];
9163 if candidate
9164 .iter()
9165 .take(key_col_count)
9166 .any(|cell| !cell.trim().is_empty())
9167 {
9168 return false;
9169 }
9170
9171 let non_empty_cols = candidate
9172 .iter()
9173 .enumerate()
9174 .filter(|(_, cell)| !cell.trim().is_empty())
9175 .map(|(idx, _)| idx)
9176 .collect::<Vec<_>>();
9177 if non_empty_cols.len() != 1 {
9178 return false;
9179 }
9180
9181 let only_col = non_empty_cols[0];
9182 if only_col < key_col_count {
9183 return false;
9184 }
9185
9186 if candidate[only_col].split_whitespace().count() < 4 {
9187 return false;
9188 }
9189
9190 rows[2]
9191 .iter()
9192 .take(key_col_count)
9193 .all(|cell| !cell.trim().is_empty())
9194}
9195
/// Counts how many leading columns of `rows` look like short "key" columns.
///
/// Columns are examined left to right and counting stops at the first column that is
/// blank in every row, is filled in fewer than 60% of the rows, or whose filled cells
/// have a median word count above three. Returns 0 when fewer than two rows are given.
fn infer_leading_key_column_count(rows: &[Vec<String>]) -> usize {
    if rows.len() < 2 {
        return 0;
    }

    let widest = rows.iter().map(Vec::len).max().unwrap_or(0);
    (0..widest)
        .take_while(|&col_idx| {
            // Word counts of the non-blank cells in this column; rows too short to
            // reach the column count as blank.
            let mut words_per_cell: Vec<usize> = rows
                .iter()
                .filter_map(|row| row.get(col_idx))
                .map(|cell| cell.split_whitespace().count())
                .filter(|&words| words > 0)
                .collect();
            if words_per_cell.is_empty() {
                return false;
            }

            words_per_cell.sort_unstable();
            let median_words = words_per_cell[words_per_cell.len() / 2];
            let occupancy_ratio = words_per_cell.len() as f64 / rows.len() as f64;
            occupancy_ratio >= 0.6 && median_words <= 3
        })
        .count()
}
9233
9234fn render_table(out: &mut String, table: &crate::models::semantic::SemanticTable) {
9236 render_table_border(out, &table.table_border);
9238}
9239
/// A contiguous run of top-level document elements that has been reconstructed into a
/// single pre-rendered table.
#[derive(Clone, Debug)]
struct GeometricTableRegion {
    /// Index into `doc.kids` of the first element covered by the region (inclusive).
    start_idx: usize,
    /// Index into `doc.kids` of the last element covered by the region (inclusive).
    end_idx: usize,
    /// Rendered output for the whole region.
    rendered: String,
}
9246
/// One text line of an element: the line's bounding box and a copy of its text chunks.
#[derive(Clone)]
struct ChunkLine {
    /// Bounding box of the line.
    bbox: BoundingBox,
    /// The text chunks that make up the line.
    chunks: Vec<TextChunk>,
}
9252
/// A piece of a text line that has been assigned to one column slot.
#[derive(Clone)]
struct SlotFragment {
    /// Index of the slot (within the slot ranges used for the split) this fragment
    /// belongs to.
    slot_idx: usize,
    /// Union of the bounding boxes of the chunks grouped into this fragment.
    bbox: BoundingBox,
    /// Concatenated, normalized text of the grouped chunks.
    text: String,
}
9259
9260fn detect_geometric_table_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
9261 let mut regions = Vec::new();
9262 let mut occupied_until = 0usize;
9263
9264 for (idx, element) in doc.kids.iter().enumerate() {
9265 if idx < occupied_until {
9266 continue;
9267 }
9268
9269 let Some(table) = table_border_from_element(element) else {
9270 continue;
9271 };
9272 let Some(region) = build_geometric_table_region(doc, idx, table) else {
9273 continue;
9274 };
9275 occupied_until = region.end_idx.saturating_add(1);
9276 regions.push(region);
9277 }
9278
9279 let mut occupied = regions
9280 .iter()
9281 .flat_map(|region| region.start_idx..=region.end_idx)
9282 .collect::<HashSet<_>>();
9283 for region in detect_footnote_citation_regions(doc) {
9284 if (region.start_idx..=region.end_idx).any(|idx| occupied.contains(&idx)) {
9285 continue;
9286 }
9287 occupied.extend(region.start_idx..=region.end_idx);
9288 regions.push(region);
9289 }
9290
9291 regions.sort_by_key(|region| region.start_idx);
9292 regions
9293}
9294
9295fn detect_footnote_citation_regions(doc: &PdfDocument) -> Vec<GeometricTableRegion> {
9296 let body_font_size = compute_running_body_font_size(doc);
9297 if body_font_size <= 0.0 {
9298 return Vec::new();
9299 }
9300
9301 let mut regions = Vec::new();
9302 let mut idx = 0usize;
9303 while idx < doc.kids.len() {
9304 let Some(region) = build_footnote_citation_region(doc, idx, body_font_size) else {
9305 idx += 1;
9306 continue;
9307 };
9308 idx = region.end_idx.saturating_add(1);
9309 regions.push(region);
9310 }
9311
9312 regions
9313}
9314
9315fn compute_running_body_font_size(doc: &PdfDocument) -> f64 {
9316 doc.kids
9317 .iter()
9318 .filter_map(|element| {
9319 let ContentElement::Paragraph(paragraph) = element else {
9320 return None;
9321 };
9322 let text = paragraph.base.value();
9323 (text.split_whitespace().count() > 10).then_some(paragraph.base.font_size?)
9324 })
9325 .fold(0.0_f64, f64::max)
9326}
9327
/// Tries to build a footnote/citation table region that begins at `doc.kids[start_idx]`.
///
/// Two starting shapes are accepted:
/// * the start element is small-font text that begins with a footnote marker; in that
///   case `leading_footnote_attachment` may pull the region start further back; or
/// * the start element ends with a short footnote lead (see
///   `split_trailing_footnote_lead`) and is directly followed by small-font text on the
///   same page and in the same column.
///
/// Following small-font elements on the same page and in the same column are then
/// absorbed. The collected fragments are parsed into `(marker, citation)` rows; the
/// region is rejected unless there are at least three rows, every marker is numeric, and
/// at most one adjacent pair of markers fails to increase by exactly one.
///
/// * `start_idx` - index into `doc.kids` of the element to start from.
/// * `body_font_size` - reference body font size; "small" is derived from it.
///
/// Returns `None` when any requirement fails. On success the region's `rendered` text is
/// the optional lead-in prose, a blank line, and an HTML table headed
/// "Footnote" / "Citation".
fn build_footnote_citation_region(
    doc: &PdfDocument,
    start_idx: usize,
    body_font_size: f64,
) -> Option<GeometricTableRegion> {
    let element = doc.kids.get(start_idx)?;
    if !is_geometric_text_candidate(element) {
        return None;
    }

    let start_text = extract_element_text(element);
    let trimmed_start = start_text.trim();
    if trimmed_start.is_empty() {
        return None;
    }

    // "Small" means at most the stricter of 92% of the body size and body size - 0.8,
    // floored at zero.
    let small_font_threshold = (body_font_size * 0.92).min(body_font_size - 0.8).max(0.0);
    let mut lead_prefix = None;
    let mut fragments = Vec::new();
    let page_number = element.page_number()?;
    let mut column_bbox = element.bbox().clone();
    let mut region_start_idx = start_idx;
    let mut end_idx = start_idx;

    if element_font_size(element).is_some_and(|font_size| font_size <= small_font_threshold)
        && starts_with_footnote_marker(trimmed_start)
    {
        // Shape 1: the start element is itself footnote text. Optionally attach earlier
        // elements that hold the lead-in prose and the first footnote fragments.
        if let Some((attach_idx, prefix, leading_fragments)) = leading_footnote_attachment(
            doc,
            start_idx,
            page_number,
            &column_bbox,
            small_font_threshold,
        ) {
            lead_prefix = Some(prefix);
            fragments.extend(leading_fragments);
            region_start_idx = attach_idx;
        }
        fragments.push(footnote_fragment_text(element));
    } else {
        // Shape 2: the start element is prose ending in a footnote lead, and the next
        // element must be small-font text on the same page and in the same column.
        let (prefix, first_tail) = split_trailing_footnote_lead(trimmed_start)?;
        let next = doc.kids.get(start_idx + 1)?;
        if !is_geometric_text_candidate(next)
            || next.page_number() != Some(page_number)
            || !element_font_size(next).is_some_and(|font_size| font_size <= small_font_threshold)
        {
            return None;
        }
        if !same_column_region(&column_bbox, next.bbox()) {
            return None;
        }
        lead_prefix = Some(prefix);
        fragments.push(first_tail);
    }

    // Absorb the following elements while they remain small-font text on the same page
    // and in the same column; stop at the first element that breaks the pattern.
    let mut consecutive_small = 0usize;
    for idx in start_idx + 1..doc.kids.len() {
        let candidate = &doc.kids[idx];
        if !is_geometric_text_candidate(candidate) || candidate.page_number() != Some(page_number) {
            break;
        }

        let candidate_text = extract_element_text(candidate);
        let trimmed = candidate_text.trim();
        if trimmed.is_empty() || starts_with_caption_prefix(trimmed) {
            break;
        }

        let Some(font_size) = element_font_size(candidate) else {
            break;
        };
        if font_size > small_font_threshold {
            break;
        }
        if !same_column_region(&column_bbox, candidate.bbox()) {
            break;
        }

        // Grow the column box so later candidates are compared against everything
        // absorbed so far.
        column_bbox = column_bbox.union(candidate.bbox());
        fragments.push(footnote_fragment_text(candidate));
        consecutive_small += 1;
        end_idx = idx;
    }

    // A lead-in without any absorbed footnote element is not a footnote block.
    if consecutive_small == 0 && lead_prefix.is_some() {
        return None;
    }

    let rows = parse_footnote_citation_rows(&fragments);
    if rows.len() < 3 {
        return None;
    }

    // Every marker must parse as a number.
    let numeric_markers = rows
        .iter()
        .filter_map(|(marker, _)| marker.parse::<u32>().ok())
        .collect::<Vec<_>>();
    if numeric_markers.len() != rows.len() {
        return None;
    }
    // Count adjacent marker pairs that increase by exactly one; at most one pair may
    // break the sequence.
    let sequential_steps = numeric_markers
        .windows(2)
        .filter(|pair| pair[1] == pair[0] + 1)
        .count();
    if sequential_steps + 1 < rows.len().saturating_sub(1) {
        return None;
    }

    let mut rendered_rows = vec![vec!["Footnote".to_string(), "Citation".to_string()]];
    rendered_rows.extend(
        rows.into_iter()
            .map(|(marker, citation)| vec![marker, citation]),
    );

    let mut rendered = String::new();
    if let Some(prefix) = lead_prefix {
        // The lead-in prose is emitted ahead of the table, separated by a blank line.
        rendered.push_str(&escape_md_line_start(prefix.trim()));
        rendered.push_str("\n\n");
    }
    rendered.push_str(&render_html_table(&rendered_rows));

    Some(GeometricTableRegion {
        start_idx: region_start_idx,
        end_idx,
        rendered,
    })
}
9455
9456fn leading_footnote_attachment(
9457 doc: &PdfDocument,
9458 start_idx: usize,
9459 page_number: u32,
9460 column_bbox: &BoundingBox,
9461 small_font_threshold: f64,
9462) -> Option<(usize, String, Vec<String>)> {
9463 let mut idx = start_idx.checked_sub(1)?;
9464 let mut leading_fragments = Vec::new();
9465 let mut scanned = 0usize;
9466
9467 loop {
9468 let candidate = doc.kids.get(idx)?;
9469 scanned += 1;
9470 if scanned > 6 || candidate.page_number() != Some(page_number) {
9471 return None;
9472 }
9473
9474 if !is_geometric_text_candidate(candidate) {
9475 if idx == 0 {
9476 return None;
9477 }
9478 idx -= 1;
9479 continue;
9480 }
9481
9482 let text = extract_element_text(candidate);
9483 let trimmed = text.trim();
9484 if trimmed.is_empty() {
9485 if idx == 0 {
9486 return None;
9487 }
9488 idx -= 1;
9489 continue;
9490 }
9491 if !same_column_region(candidate.bbox(), column_bbox) {
9492 return None;
9493 }
9494
9495 if element_font_size(candidate).is_some_and(|font_size| font_size <= small_font_threshold) {
9496 leading_fragments.push(footnote_fragment_text(candidate));
9497 if idx == 0 {
9498 return None;
9499 }
9500 idx -= 1;
9501 continue;
9502 }
9503
9504 let (prefix, first_tail) = split_trailing_footnote_lead(trimmed)?;
9505 leading_fragments.push(first_tail);
9506 leading_fragments.reverse();
9507 return Some((idx, prefix, leading_fragments));
9508 }
9509}
9510
9511fn parse_footnote_citation_rows(fragments: &[String]) -> Vec<(String, String)> {
9512 let mut rows = Vec::new();
9513 let mut current_marker = None::<String>;
9514 let mut current_citation = String::new();
9515
9516 for fragment in fragments {
9517 let markers = find_footnote_marker_positions(fragment);
9518 if markers.is_empty() {
9519 if current_marker.is_some() {
9520 merge_paragraph_text(&mut current_citation, fragment.trim());
9521 }
9522 continue;
9523 }
9524
9525 let mut cursor = 0usize;
9526 for (pos, marker, skip_len) in markers {
9527 let prefix = fragment[cursor..pos].trim();
9528 if current_marker.is_some() && !prefix.is_empty() {
9529 merge_paragraph_text(&mut current_citation, prefix);
9530 }
9531 if let Some(marker_value) = current_marker.take() {
9532 let trimmed = current_citation.trim();
9533 if !trimmed.is_empty() {
9534 rows.push((marker_value, trimmed.to_string()));
9535 }
9536 current_citation.clear();
9537 }
9538 current_marker = Some(marker);
9539 cursor = pos + skip_len;
9540 }
9541
9542 let tail = fragment[cursor..].trim();
9543 if current_marker.is_some() && !tail.is_empty() {
9544 merge_paragraph_text(&mut current_citation, tail);
9545 }
9546 }
9547
9548 if let Some(marker_value) = current_marker {
9549 let trimmed = current_citation.trim();
9550 if !trimmed.is_empty() {
9551 rows.push((marker_value, trimmed.to_string()));
9552 }
9553 }
9554
9555 rebalance_adjacent_footnote_citations(&mut rows);
9556 rows
9557}
9558
9559fn rebalance_adjacent_footnote_citations(rows: &mut [(String, String)]) {
9560 for idx in 0..rows.len().saturating_sub(1) {
9561 if !rows[idx].1.trim_end().ends_with(',') {
9562 continue;
9563 }
9564
9565 let next = rows[idx + 1].1.trim().to_string();
9566 let Some((stub, remainder)) = split_leading_citation_stub(&next) else {
9567 continue;
9568 };
9569 let Some((first_sentence, trailing)) = split_first_sentence(remainder) else {
9570 continue;
9571 };
9572 if first_sentence.split_whitespace().count() < 2 {
9573 continue;
9574 }
9575
9576 merge_paragraph_text(&mut rows[idx].1, first_sentence);
9577 rows[idx + 1].1 = if trailing.is_empty() {
9578 stub.to_string()
9579 } else {
9580 format!("{stub} {trailing}")
9581 };
9582 }
9583}
9584
/// Splits `text` at its first comma into a short stub (comma included) and the remainder.
///
/// Returns `None` when there is no comma, when the comma sits more than eight bytes into
/// the text, or when the trimmed remainder is empty. Both parts are returned trimmed.
fn split_leading_citation_stub(text: &str) -> Option<(&str, &str)> {
    let (head, rest) = text.split_once(',')?;
    if head.len() > 8 {
        return None;
    }
    // `head.len()` is the byte offset of the comma, so the inclusive slice keeps it.
    let stub = text[..=head.len()].trim();
    let remainder = rest.trim();
    if stub.is_empty() || remainder.is_empty() {
        None
    } else {
        Some((stub, remainder))
    }
}
9594
/// Splits `text` after its first `". "` into the first sentence (period included) and
/// the trailing text, both trimmed. Returns `None` when `text` contains no `". "`.
fn split_first_sentence(text: &str) -> Option<(&str, &str)> {
    let (head, rest) = text.split_once(". ")?;
    // `head.len()` is the byte offset of the period; keep it with the first sentence.
    let first = text[..=head.len()].trim();
    if first.is_empty() {
        return None;
    }
    Some((first, rest.trim()))
}
9601
/// Locates footnote markers in `text`.
///
/// A marker is a run of one or two ASCII digits that
/// * starts the text or follows whitespace or one of `. , ; : ) ] " ' ”`,
/// * is followed by at least one whitespace character, and
/// * after that whitespace, is followed by an ASCII uppercase letter, `(`, `[` or `*`.
///
/// Each entry is `(byte offset of the marker, marker digits, byte length from the marker
/// start to the first character of the text that follows the whitespace)`.
fn find_footnote_marker_positions(text: &str) -> Vec<(usize, String, usize)> {
    let glyphs: Vec<(usize, char)> = text.char_indices().collect();
    // Byte offset of the glyph at `i`, or the end of the text when `i` is past the end.
    let offset_of = |i: usize| glyphs.get(i).map_or(text.len(), |&(pos, _)| pos);

    let mut found = Vec::new();
    let mut cursor = 0usize;

    while cursor < glyphs.len() {
        let (start_byte, current) = glyphs[cursor];
        let preceded_by_boundary = match cursor.checked_sub(1) {
            None => true,
            Some(prev) => {
                let before = glyphs[prev].1;
                before.is_whitespace()
                    || matches!(
                        before,
                        '.' | ',' | ';' | ':' | ')' | ']' | '"' | '\'' | '”'
                    )
            }
        };
        if !current.is_ascii_digit() || !preceded_by_boundary {
            cursor += 1;
            continue;
        }

        // Extent of the digit run that starts here.
        let digits_end = cursor
            + glyphs[cursor..]
                .iter()
                .take_while(|(_, ch)| ch.is_ascii_digit())
                .count();
        let digits = &text[start_byte..offset_of(digits_end)];
        let followed_by_space = glyphs
            .get(digits_end)
            .is_some_and(|(_, ch)| ch.is_whitespace());
        if digits.len() > 2 || !followed_by_space {
            cursor += 1;
            continue;
        }

        // First glyph after the whitespace that follows the digits.
        let body_start = digits_end
            + glyphs[digits_end..]
                .iter()
                .take_while(|(_, ch)| ch.is_whitespace())
                .count();
        let opens_citation = glyphs.get(body_start).is_some_and(|(_, ch)| {
            ch.is_ascii_uppercase() || matches!(*ch, '(' | '[' | '*')
        });
        if !opens_citation {
            cursor += 1;
            continue;
        }

        found.push((
            start_byte,
            digits.to_string(),
            offset_of(body_start) - start_byte,
        ));
        cursor = body_start;
    }

    found
}
9662
9663fn split_trailing_footnote_lead(text: &str) -> Option<(String, String)> {
9664 let markers = find_footnote_marker_positions(text);
9665 let (pos, marker, skip_len) = markers.last()?.clone();
9666 let prefix = text[..pos].trim();
9667 let tail = text[pos + skip_len..].trim();
9668 if prefix.split_whitespace().count() < 6 || tail.split_whitespace().count() > 6 {
9669 return None;
9670 }
9671 Some((prefix.to_string(), format!("{marker} {tail}")))
9672}
9673
9674fn starts_with_footnote_marker(text: &str) -> bool {
9675 find_footnote_marker_positions(text)
9676 .first()
9677 .is_some_and(|(pos, _, _)| *pos == 0)
9678}
9679
9680fn same_column_region(left: &BoundingBox, right: &BoundingBox) -> bool {
9681 let overlap = (left.right_x.min(right.right_x) - left.left_x.max(right.left_x)).max(0.0);
9682 let min_width = left.width().min(right.width()).max(1.0);
9683 overlap / min_width >= 0.35 || (left.left_x - right.left_x).abs() <= 28.0
9684}
9685
9686fn footnote_fragment_text(element: &ContentElement) -> String {
9687 let text = extract_element_text(element);
9688 if element_font_name(element)
9689 .as_deref()
9690 .is_some_and(|name| name.to_ascii_lowercase().contains("italic"))
9691 {
9692 format!("*{}*", text.trim())
9693 } else {
9694 text
9695 }
9696}
9697
9698fn element_font_size(element: &ContentElement) -> Option<f64> {
9699 match element {
9700 ContentElement::Paragraph(p) => p.base.font_size,
9701 ContentElement::Heading(h) => h.base.base.font_size,
9702 ContentElement::NumberHeading(nh) => nh.base.base.base.font_size,
9703 ContentElement::TextBlock(tb) => Some(tb.font_size),
9704 ContentElement::TextLine(tl) => Some(tl.font_size),
9705 _ => None,
9706 }
9707}
9708
9709fn element_font_name(element: &ContentElement) -> Option<String> {
9710 match element {
9711 ContentElement::Paragraph(p) => p.base.font_name.clone(),
9712 ContentElement::Heading(h) => h.base.base.font_name.clone(),
9713 ContentElement::NumberHeading(nh) => nh.base.base.base.font_name.clone(),
9714 _ => None,
9715 }
9716}
9717
9718fn table_border_from_element(
9719 element: &ContentElement,
9720) -> Option<&crate::models::table::TableBorder> {
9721 match element {
9722 ContentElement::TableBorder(table) => Some(table),
9723 ContentElement::Table(table) => Some(&table.table_border),
9724 _ => None,
9725 }
9726}
9727
/// Tries to rebuild a complete table around the bordered table at `doc.kids[table_idx]`
/// by attaching header text found just above it (and, for panel-style bodies, text found
/// just below it).
///
/// * `table_idx` - index into `doc.kids` of the table element.
/// * `table` - the border structure of that table.
///
/// Returns `None` when the table has no rows or fewer than three columns, when its
/// column ranges cannot be derived, when no header candidates precede it, when neither
/// an external left stub nor an embedded stub header is supported, when no usable header
/// rows can be reconstructed, or when the body ends up empty.
///
/// On success the region spans from the first header candidate to the last footer
/// candidate (or the table itself when there is none) and `rendered` holds the output of
/// `render_pipe_rows` for the header rows followed by the body rows.
fn build_geometric_table_region(
    doc: &PdfDocument,
    table_idx: usize,
    table: &crate::models::table::TableBorder,
) -> Option<GeometricTableRegion> {
    let mut table_rows = collect_table_border_rows(table);
    if table_rows.is_empty() || table.num_columns < 3 {
        return None;
    }
    merge_continuation_rows(&mut table_rows);

    let column_ranges = table_column_ranges(table)?;
    let candidate_indices = collect_table_header_candidate_indices(doc, table_idx, table);
    if candidate_indices.is_empty() {
        return None;
    }

    // Two layouts are supported: a stub column that lives outside the bordered table
    // (to its left), or a stub label embedded in the table's own first row.
    let needs_external_stub =
        infer_left_stub_requirement(doc, &candidate_indices, &table_rows, &column_ranges);
    let supports_embedded_stub_header =
        supports_embedded_stub_header(&table_rows, &column_ranges, doc, &candidate_indices);
    if !needs_external_stub && !supports_embedded_stub_header {
        return None;
    }
    // With an external stub, an extra slot is placed ahead of the table's columns.
    let slot_ranges = if needs_external_stub {
        slot_ranges(&column_ranges, doc, &candidate_indices, true)?
    } else {
        column_ranges.clone()
    };
    let mut header_rows = reconstruct_aligned_rows(doc, &candidate_indices, &slot_ranges, true, 2);
    if header_rows.is_empty() {
        return None;
    }
    if needs_external_stub {
        normalize_leading_stub_header(&mut header_rows);
    } else {
        promote_embedded_stub_header(&mut header_rows, &table_rows);
    }

    // Require at least one header row that fills all but (at most) one slot, and never
    // fewer than two slots.
    let slot_count = slot_ranges.len();
    let dense_header_rows = header_rows
        .iter()
        .filter(|row| {
            row.iter().filter(|cell| !cell.trim().is_empty()).count()
                >= slot_count.saturating_sub(1).max(2)
        })
        .count();
    if dense_header_rows == 0 {
        return None;
    }

    let mut combined_rows = Vec::new();
    combined_rows.extend(header_rows);

    let following_indices = collect_table_footer_candidate_indices(doc, table_idx, table);
    let body_rows = if needs_external_stub && should_merge_panel_body_rows(&table_rows) {
        // Panel layout: the table rows and the rows reconstructed from the text below
        // the table are merged into a single body row.
        let trailing_rows =
            reconstruct_aligned_rows(doc, &following_indices, &slot_ranges, false, 1);
        vec![merge_panel_body_row(
            &table_rows,
            &trailing_rows,
            slot_count,
        )]
    } else if needs_external_stub {
        // Shift every body row right by one blank cell so it lines up with the extra
        // stub slot in the header.
        table_rows
            .iter()
            .map(|row| {
                let mut shifted = vec![String::new()];
                shifted.extend(row.iter().cloned());
                shifted
            })
            .collect()
    } else {
        table_rows
    };

    if body_rows.is_empty() {
        return None;
    }
    combined_rows.extend(body_rows);

    let rendered = render_pipe_rows(&combined_rows);
    // NOTE(review): `end_idx` covers the footer candidates even in the branches above
    // that do not fold their text into the body rows — confirm that callers which
    // replace `start_idx..=end_idx` with `rendered` are meant to drop those elements.
    Some(GeometricTableRegion {
        start_idx: candidate_indices[0],
        end_idx: following_indices.last().copied().unwrap_or(table_idx),
        rendered,
    })
}
9816
9817fn table_column_ranges(table: &crate::models::table::TableBorder) -> Option<Vec<(f64, f64)>> {
9818 if table.num_columns == 0 {
9819 return None;
9820 }
9821
9822 let mut ranges = vec![(f64::INFINITY, f64::NEG_INFINITY); table.num_columns];
9823 for row in &table.rows {
9824 for cell in &row.cells {
9825 if cell.col_number >= table.num_columns {
9826 continue;
9827 }
9828 let range = &mut ranges[cell.col_number];
9829 range.0 = range.0.min(cell.bbox.left_x);
9830 range.1 = range.1.max(cell.bbox.right_x);
9831 }
9832 }
9833
9834 if ranges
9835 .iter()
9836 .any(|(left, right)| !left.is_finite() || !right.is_finite() || right <= left)
9837 {
9838 return None;
9839 }
9840
9841 Some(ranges)
9842}
9843
9844fn collect_table_header_candidate_indices(
9845 doc: &PdfDocument,
9846 table_idx: usize,
9847 table: &crate::models::table::TableBorder,
9848) -> Vec<usize> {
9849 let mut indices = Vec::new();
9850 let table_page = table.bbox.page_number;
9851 let table_top = table.bbox.top_y;
9852 let mut cursor = table_idx;
9853
9854 while let Some(prev_idx) = cursor.checked_sub(1) {
9855 let element = &doc.kids[prev_idx];
9856 if element.page_number() != table_page {
9857 break;
9858 }
9859 if !is_geometric_text_candidate(element) {
9860 break;
9861 }
9862
9863 let bbox = element.bbox();
9864 let vertical_gap = bbox.bottom_y - table_top;
9865 if !(-6.0..=260.0).contains(&vertical_gap) {
9866 break;
9867 }
9868
9869 indices.push(prev_idx);
9870 cursor = prev_idx;
9871 if indices.len() >= 10 {
9872 break;
9873 }
9874 }
9875
9876 indices.reverse();
9877 indices
9878}
9879
9880fn collect_table_footer_candidate_indices(
9881 doc: &PdfDocument,
9882 table_idx: usize,
9883 table: &crate::models::table::TableBorder,
9884) -> Vec<usize> {
9885 let mut indices = Vec::new();
9886 let table_page = table.bbox.page_number;
9887 let table_bottom = table.bbox.bottom_y;
9888
9889 for idx in table_idx + 1..doc.kids.len() {
9890 let element = &doc.kids[idx];
9891 if element.page_number() != table_page {
9892 break;
9893 }
9894 if !is_geometric_text_candidate(element) {
9895 break;
9896 }
9897 if looks_like_margin_page_number(doc, element, &extract_element_text(element)) {
9898 break;
9899 }
9900
9901 let bbox = element.bbox();
9902 let gap = table_bottom - bbox.top_y;
9903 if !(-6.0..=28.0).contains(&gap) {
9904 break;
9905 }
9906 indices.push(idx);
9907 if indices.len() >= 4 {
9908 break;
9909 }
9910 }
9911
9912 indices
9913}
9914
9915fn is_geometric_text_candidate(element: &ContentElement) -> bool {
9916 matches!(
9917 element,
9918 ContentElement::Paragraph(_)
9919 | ContentElement::Heading(_)
9920 | ContentElement::NumberHeading(_)
9921 | ContentElement::TextBlock(_)
9922 | ContentElement::TextLine(_)
9923 )
9924}
9925
9926fn infer_left_stub_requirement(
9927 doc: &PdfDocument,
9928 candidate_indices: &[usize],
9929 table_rows: &[Vec<String>],
9930 column_ranges: &[(f64, f64)],
9931) -> bool {
9932 if column_ranges.is_empty() {
9933 return false;
9934 }
9935
9936 let first_width = (column_ranges[0].1 - column_ranges[0].0).max(1.0);
9937 let has_left_label = candidate_indices.iter().any(|idx| {
9938 let bbox = doc.kids[*idx].bbox();
9939 bbox.right_x <= column_ranges[0].0 + first_width * 0.12
9940 && bbox.width() <= first_width * 0.45
9941 });
9942 if !has_left_label {
9943 return false;
9944 }
9945
9946 let mut first_col_word_counts: Vec<usize> = table_rows
9947 .iter()
9948 .filter_map(|row| row.first())
9949 .map(|cell| cell.split_whitespace().count())
9950 .collect();
9951 if first_col_word_counts.is_empty() {
9952 return false;
9953 }
9954 first_col_word_counts.sort_unstable();
9955 let median = first_col_word_counts[first_col_word_counts.len() / 2];
9956 median >= 5
9957}
9958
9959fn supports_embedded_stub_header(
9960 table_rows: &[Vec<String>],
9961 column_ranges: &[(f64, f64)],
9962 doc: &PdfDocument,
9963 candidate_indices: &[usize],
9964) -> bool {
9965 if table_rows.len() < 2 || column_ranges.len() < 3 {
9966 return false;
9967 }
9968
9969 let first_row = &table_rows[0];
9970 if first_row.len() != column_ranges.len() || first_row[0].trim().is_empty() {
9971 return false;
9972 }
9973 if first_row[0].split_whitespace().count() > 3 || first_row[0].trim().len() > 24 {
9974 return false;
9975 }
9976
9977 let data_fill = first_row
9978 .iter()
9979 .skip(1)
9980 .filter(|cell| !cell.trim().is_empty())
9981 .count();
9982 if data_fill + 1 < column_ranges.len() {
9983 return false;
9984 }
9985
9986 let labeled_rows = table_rows
9987 .iter()
9988 .skip(1)
9989 .filter(|row| row.first().is_some_and(|cell| !cell.trim().is_empty()))
9990 .count();
9991 if labeled_rows == 0 {
9992 return false;
9993 }
9994
9995 let slot_ranges = column_ranges.to_vec();
9996 let header_rows = reconstruct_aligned_rows(doc, candidate_indices, &slot_ranges, true, 2);
9997 header_rows.iter().any(|row| {
9998 row.first().is_none_or(|cell| cell.trim().is_empty())
9999 && row
10000 .iter()
10001 .skip(1)
10002 .filter(|cell| !cell.trim().is_empty())
10003 .count()
10004 >= column_ranges.len().saturating_sub(1)
10005 })
10006}
10007
10008fn slot_ranges(
10009 column_ranges: &[(f64, f64)],
10010 doc: &PdfDocument,
10011 candidate_indices: &[usize],
10012 needs_stub: bool,
10013) -> Option<Vec<(f64, f64)>> {
10014 let mut slots = Vec::new();
10015 if needs_stub {
10016 let first_left = column_ranges.first()?.0;
10017 let left_stub_start = candidate_indices
10018 .iter()
10019 .map(|idx| doc.kids[*idx].bbox().left_x)
10020 .fold(first_left, f64::min);
10021 let stub_right = first_left - 1.0;
10022 if stub_right <= left_stub_start {
10023 return None;
10024 }
10025 slots.push((left_stub_start, stub_right));
10026 }
10027 slots.extend(column_ranges.iter().copied());
10028 Some(slots)
10029}
10030
10031fn reconstruct_aligned_rows(
10032 doc: &PdfDocument,
10033 candidate_indices: &[usize],
10034 slot_ranges: &[(f64, f64)],
10035 drop_wide_singletons: bool,
10036 min_filled_slots: usize,
10037) -> Vec<Vec<String>> {
10038 if candidate_indices.is_empty() || slot_ranges.is_empty() {
10039 return Vec::new();
10040 }
10041
10042 let mut row_bands: Vec<(BoundingBox, Vec<String>)> = Vec::new();
10043
10044 for idx in candidate_indices {
10045 for line in extract_chunk_lines(&doc.kids[*idx]) {
10046 let fragments = split_line_into_slot_fragments(&line, slot_ranges);
10047 if fragments.is_empty() {
10048 continue;
10049 }
10050
10051 if drop_wide_singletons && fragments.len() == 1 {
10052 let only = &fragments[0];
10053 let span_width = only.bbox.width();
10054 let table_width =
10055 slot_ranges.last().map(|(_, right)| *right).unwrap_or(0.0) - slot_ranges[0].0;
10056 if span_width >= table_width * 0.55 {
10057 continue;
10058 }
10059 }
10060
10061 let line_center = line.bbox.center_y();
10062 let tolerance = line
10063 .chunks
10064 .iter()
10065 .map(|chunk| chunk.font_size)
10066 .fold(8.0, f64::max)
10067 * 0.8;
10068
10069 let mut target_row = None;
10070 for (row_idx, (bbox, _)) in row_bands.iter().enumerate() {
10071 if (bbox.center_y() - line_center).abs() <= tolerance {
10072 target_row = Some(row_idx);
10073 break;
10074 }
10075 }
10076
10077 if let Some(row_idx) = target_row {
10078 let (bbox, cells) = &mut row_bands[row_idx];
10079 *bbox = bbox.union(&line.bbox);
10080 for fragment in fragments {
10081 append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
10082 }
10083 } else {
10084 let mut cells = vec![String::new(); slot_ranges.len()];
10085 for fragment in fragments {
10086 append_cell_text(&mut cells[fragment.slot_idx], &fragment.text);
10087 }
10088 row_bands.push((line.bbox.clone(), cells));
10089 }
10090 }
10091 }
10092
10093 row_bands.sort_by(|left, right| {
10094 right
10095 .0
10096 .top_y
10097 .partial_cmp(&left.0.top_y)
10098 .unwrap_or(std::cmp::Ordering::Equal)
10099 });
10100
10101 row_bands
10102 .into_iter()
10103 .map(|(_, cells)| cells)
10104 .filter(|cells| {
10105 let filled = cells.iter().filter(|cell| !cell.trim().is_empty()).count();
10106 filled >= min_filled_slots
10107 })
10108 .collect()
10109}
10110
10111fn extract_chunk_lines(element: &ContentElement) -> Vec<ChunkLine> {
10112 match element {
10113 ContentElement::Paragraph(p) => chunk_lines_from_semantic_node(&p.base),
10114 ContentElement::Heading(h) => chunk_lines_from_semantic_node(&h.base.base),
10115 ContentElement::NumberHeading(nh) => chunk_lines_from_semantic_node(&nh.base.base.base),
10116 ContentElement::TextBlock(tb) => tb
10117 .text_lines
10118 .iter()
10119 .map(|line| ChunkLine {
10120 bbox: line.bbox.clone(),
10121 chunks: line.text_chunks.clone(),
10122 })
10123 .collect(),
10124 ContentElement::TextLine(tl) => vec![ChunkLine {
10125 bbox: tl.bbox.clone(),
10126 chunks: tl.text_chunks.clone(),
10127 }],
10128 _ => Vec::new(),
10129 }
10130}
10131
10132fn chunk_lines_from_semantic_node(node: &SemanticTextNode) -> Vec<ChunkLine> {
10133 let mut lines = Vec::new();
10134 for column in &node.columns {
10135 for block in &column.text_blocks {
10136 for line in &block.text_lines {
10137 lines.push(ChunkLine {
10138 bbox: line.bbox.clone(),
10139 chunks: line.text_chunks.clone(),
10140 });
10141 }
10142 }
10143 }
10144 lines
10145}
10146
10147fn split_line_into_slot_fragments(
10148 line: &ChunkLine,
10149 slot_ranges: &[(f64, f64)],
10150) -> Vec<SlotFragment> {
10151 let mut groups: Vec<(usize, Vec<TextChunk>, BoundingBox)> = Vec::new();
10152
10153 for chunk in line
10154 .chunks
10155 .iter()
10156 .filter(|chunk| !chunk.value.trim().is_empty())
10157 .cloned()
10158 {
10159 let slot_idx = assign_chunk_to_slot(&chunk.bbox, slot_ranges);
10160 if let Some((prev_slot, prev_chunks, prev_bbox)) = groups.last_mut() {
10161 let gap = chunk.bbox.left_x - prev_bbox.right_x;
10162 if *prev_slot == slot_idx && gap <= chunk.font_size.max(6.0) * 2.4 {
10163 *prev_bbox = prev_bbox.union(&chunk.bbox);
10164 prev_chunks.push(chunk);
10165 continue;
10166 }
10167 }
10168 groups.push((slot_idx, vec![chunk.clone()], chunk.bbox.clone()));
10169 }
10170
10171 groups
10172 .into_iter()
10173 .filter_map(|(slot_idx, chunks, bbox)| {
10174 let text = normalize_common_ocr_text(
10175 &crate::models::text::TextLine::concatenate_chunks(&chunks),
10176 );
10177 if text.trim().is_empty() {
10178 None
10179 } else {
10180 Some(SlotFragment {
10181 slot_idx,
10182 bbox,
10183 text,
10184 })
10185 }
10186 })
10187 .collect()
10188}
10189
10190fn assign_chunk_to_slot(bbox: &BoundingBox, slot_ranges: &[(f64, f64)]) -> usize {
10191 let mut best_idx = 0usize;
10192 let mut best_overlap = f64::NEG_INFINITY;
10193 let center_x = bbox.center_x();
10194
10195 for (idx, (left, right)) in slot_ranges.iter().enumerate() {
10196 let overlap = (bbox.right_x.min(*right) - bbox.left_x.max(*left)).max(0.0);
10197 let score = if overlap > 0.0 {
10198 overlap / bbox.width().max(1.0)
10199 } else {
10200 -((center_x - ((*left + *right) / 2.0)).abs())
10201 };
10202 if score > best_overlap {
10203 best_overlap = score;
10204 best_idx = idx;
10205 }
10206 }
10207
10208 best_idx
10209}
10210
/// Appends the trimmed `fragment` to `cell`, inserting a single space first when the
/// cell already has content. A blank fragment leaves the cell untouched.
fn append_cell_text(cell: &mut String, fragment: &str) {
    match (cell.is_empty(), fragment.trim()) {
        (_, "") => {}
        (true, piece) => cell.push_str(piece),
        (false, piece) => {
            cell.push(' ');
            cell.push_str(piece);
        }
    }
}
10221
/// Moves a stub label from the second row up into an empty first-row stub cell.
///
/// The move only happens when the first cell of row 0 is blank, the first cell
/// of row 1 is not, and both rows have at least two non-blank cells after the
/// first column. The label is stored trimmed and the second row's first cell
/// is cleared. In every other case `rows` is left unchanged.
fn normalize_leading_stub_header(rows: &mut [Vec<String>]) {
    /// Counts the non-blank cells that follow the first column.
    fn filled_after_stub(row: &[String]) -> usize {
        row.iter()
            .skip(1)
            .filter(|cell| !cell.trim().is_empty())
            .count()
    }

    let [head, next, ..] = rows else {
        return;
    };
    let (Some(head_stub), Some(next_stub)) = (head.first(), next.first()) else {
        return;
    };

    let label = next_stub.trim();
    if !head_stub.trim().is_empty() || label.is_empty() {
        return;
    }
    if filled_after_stub(head.as_slice()) < 2 || filled_after_stub(next.as_slice()) < 2 {
        return;
    }

    let lifted = label.to_string();
    head[0] = lifted;
    next[0].clear();
}
10248
/// Copies a short stub label from the first body row into an empty header stub.
///
/// The first cell of the first header row is filled only when it is blank,
/// the first cell of the first body row (trimmed) is non-empty, has at most
/// three whitespace-separated words and at most 24 bytes, and every cell
/// after the first column is non-blank in both rows. The body row itself is
/// never modified.
fn promote_embedded_stub_header(header_rows: &mut [Vec<String>], table_rows: &[Vec<String>]) {
    /// True when every cell after the first column carries text.
    fn rest_is_filled(row: &[String]) -> bool {
        row.iter().skip(1).all(|cell| !cell.trim().is_empty())
    }

    let (Some(header), Some(body)) = (header_rows.first_mut(), table_rows.first()) else {
        return;
    };
    let (Some(header_stub), Some(body_stub)) = (header.first(), body.first()) else {
        return;
    };
    if !header_stub.trim().is_empty() {
        return;
    }

    let label = body_stub.trim();
    let label_is_short =
        !label.is_empty() && label.split_whitespace().count() <= 3 && label.len() <= 24;

    if label_is_short && rest_is_filled(header.as_slice()) && rest_is_filled(body.as_slice()) {
        header[0] = label.to_string();
    }
}
10286
/// Returns `true` when there are at least three rows and every row is
/// non-empty with no blank (whitespace-only) cell.
fn should_merge_panel_body_rows(rows: &[Vec<String>]) -> bool {
    if rows.len() < 3 {
        return false;
    }
    for row in rows {
        if row.is_empty() {
            return false;
        }
        if row.iter().any(|cell| cell.trim().is_empty()) {
            return false;
        }
    }
    true
}
10293
10294fn merge_panel_body_row(
10295 table_rows: &[Vec<String>],
10296 trailing_rows: &[Vec<String>],
10297 slot_count: usize,
10298) -> Vec<String> {
10299 let mut merged = vec![String::new(); slot_count];
10300 for row in table_rows {
10301 for (col_idx, cell) in row.iter().enumerate() {
10302 if col_idx + 1 >= slot_count {
10303 break;
10304 }
10305 append_cell_text(&mut merged[col_idx + 1], cell);
10306 }
10307 }
10308 for row in trailing_rows {
10309 for (col_idx, cell) in row.iter().enumerate() {
10310 if col_idx >= slot_count {
10311 break;
10312 }
10313 append_cell_text(&mut merged[col_idx], cell);
10314 }
10315 }
10316 merged
10317}
10318
/// Renders `rows` as a Markdown pipe table followed by a blank line.
///
/// The first row is treated as the header and is followed by a `---`
/// separator row. Every row is padded with empty cells up to the widest row,
/// and cell text is trimmed. Returns an empty string when there are no rows
/// or no columns.
fn render_pipe_rows(rows: &[Vec<String>]) -> String {
    let width = rows.iter().map(Vec::len).max().unwrap_or(0);
    if width == 0 {
        return String::new();
    }

    let divider = format!("|{}\n", " --- |".repeat(width));

    let mut table = String::new();
    for (idx, row) in rows.iter().enumerate() {
        table.push('|');
        for col in 0..width {
            table.push(' ');
            table.push_str(row.get(col).map_or("", |cell| cell.trim()));
            table.push_str(" |");
        }
        table.push('\n');

        if idx == 0 {
            table.push_str(&divider);
        }
    }
    table.push('\n');
    table
}
10349
10350fn render_html_table(rows: &[Vec<String>]) -> String {
10351 if rows.is_empty() {
10352 return String::new();
10353 }
10354
10355 let num_cols = rows.iter().map(Vec::len).max().unwrap_or(0);
10356 if num_cols == 0 {
10357 return String::new();
10358 }
10359
10360 let mut out = String::from("<table>\n");
10361 for (row_idx, row) in rows.iter().enumerate() {
10362 out.push_str("<tr>");
10363 for col_idx in 0..num_cols {
10364 let cell = escape_html_text(row.get(col_idx).map(String::as_str).unwrap_or("").trim());
10365 if row_idx == 0 {
10366 out.push_str("<th>");
10367 out.push_str(&cell);
10368 out.push_str("</th>");
10369 } else {
10370 out.push_str("<td>");
10371 out.push_str(&cell);
10372 out.push_str("</td>");
10373 }
10374 }
10375 out.push_str("</tr>\n");
10376 }
10377 out.push_str("</table>\n\n");
10378 out
10379}
10380
/// Escapes the characters that are significant in HTML text and attribute
/// values: `&`, `<`, `>`, `"` and `'`.
///
/// The string is rewritten in a single pass, so an ampersand that belongs to
/// an entity produced here is never escaped a second time. Input that already
/// contains entities is escaped again (`&amp;` becomes `&amp;amp;`).
fn escape_html_text(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            // Numeric form: `&apos;` is not defined in HTML 4.
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
10388
/// Extracts the ASCII digits of `text` and returns them when there are one or
/// two of them; any other digit count yields `None`.
fn normalized_numeric_marker(text: &str) -> Option<String> {
    let mut digits = String::new();
    for ch in text.chars() {
        if ch.is_ascii_digit() {
            digits.push(ch);
        }
    }

    match digits.len() {
        1 | 2 => Some(digits),
        _ => None,
    }
}
10396
10397fn render_infographic_card_rows(rows: &[Vec<String>]) -> Option<String> {
10398 if rows.is_empty() || !rows.iter().all(|row| row.len() == 2) {
10399 return None;
10400 }
10401
10402 let marker = normalized_numeric_marker(rows[0][0].trim())?;
10403 if rows[0][1].split_whitespace().count() < 4 {
10404 return None;
10405 }
10406 if rows
10407 .iter()
10408 .skip(1)
10409 .any(|row| normalized_numeric_marker(row[0].trim()).is_some())
10410 {
10411 return None;
10412 }
10413 if rows
10414 .iter()
10415 .skip(1)
10416 .any(|row| !row[0].trim().is_empty() && row[0].trim().len() > 2)
10417 {
10418 return None;
10419 }
10420
10421 let body = rows
10422 .iter()
10423 .filter_map(|row| row.get(1))
10424 .map(|cell| cell.trim())
10425 .filter(|cell| !cell.is_empty())
10426 .collect::<Vec<_>>()
10427 .join(" ");
10428 if body.split_whitespace().count() < 8 {
10429 return None;
10430 }
10431
10432 Some(format!("{marker}. {body}\n\n"))
10433}
10434
10435fn extract_element_text(element: &ContentElement) -> String {
10436 match element {
10437 ContentElement::Paragraph(p) => clean_paragraph_text(&p.base.value()),
10438 ContentElement::Heading(h) => clean_paragraph_text(&h.base.base.value()),
10439 ContentElement::NumberHeading(nh) => clean_paragraph_text(&nh.base.base.base.value()),
10440 ContentElement::TextBlock(tb) => clean_paragraph_text(&tb.value()),
10441 ContentElement::TextLine(tl) => clean_paragraph_text(&tl.value()),
10442 _ => String::new(),
10443 }
10444}
10445
10446fn collect_table_border_rows(table: &crate::models::table::TableBorder) -> Vec<Vec<String>> {
10448 let num_cols = table.num_columns.max(1);
10449 let mut rendered_rows: Vec<Vec<String>> = Vec::new();
10450 for row in &table.rows {
10451 let cell_texts: Vec<String> = (0..num_cols)
10452 .map(|col| {
10453 row.cells
10454 .iter()
10455 .find(|c| c.col_number == col)
10456 .map(cell_text_content)
10457 .unwrap_or_default()
10458 })
10459 .collect();
10460 if !cell_texts.iter().all(|t| t.trim().is_empty()) {
10461 rendered_rows.push(cell_texts);
10462 }
10463 }
10464 rendered_rows
10465}
10466
10467fn render_table_border(out: &mut String, table: &crate::models::table::TableBorder) {
10473 if table.rows.is_empty() {
10474 return;
10475 }
10476
10477 let mut rendered_rows = collect_table_border_rows(table);
10479
10480 if rendered_rows.is_empty() {
10481 return;
10482 }
10483
10484 if let Some(rendered) = render_infographic_card_rows(&rendered_rows) {
10485 out.push_str(&rendered);
10486 return;
10487 }
10488
10489 merge_continuation_rows(&mut rendered_rows);
10491 trim_leading_table_carryover_rows(&mut rendered_rows);
10492
10493 if is_toc_table(&rendered_rows) {
10495 render_toc_rows(out, &rendered_rows);
10496 return;
10497 }
10498
10499 out.push_str(&render_pipe_rows(&rendered_rows));
10500}
10501
/// Heuristically decides whether `text` looks like a page number.
///
/// After trimming, the text qualifies when it is either up to five ASCII
/// digits, or up to ten bytes made only of the roman-numeral letters
/// `i v x l c d m` in either case. Blank text never qualifies.
fn is_page_number_like(text: &str) -> bool {
    let token = text.trim();
    if token.is_empty() {
        return false;
    }

    let arabic = token.len() <= 5 && token.bytes().all(|byte| byte.is_ascii_digit());
    let roman = token.len() <= 10
        && token.chars().all(|ch| {
            matches!(
                ch.to_ascii_lowercase(),
                'i' | 'v' | 'x' | 'l' | 'c' | 'd' | 'm'
            )
        });

    arabic || roman
}
10519
10520fn is_toc_table(rows: &[Vec<String>]) -> bool {
10523 if rows.is_empty() {
10524 return false;
10525 }
10526 if rows.len() < 2 {
10528 return false;
10529 }
10530 if !rows.iter().all(|r| r.len() == 2) {
10532 return false;
10533 }
10534
10535 let non_empty_right = rows.iter().filter(|r| !r[1].trim().is_empty()).count();
10536 if non_empty_right < 2 {
10537 return false;
10538 }
10539
10540 let page_like = rows.iter().filter(|r| is_page_number_like(&r[1])).count();
10541 page_like >= 2 && page_like * 10 >= non_empty_right * 9 && page_like * 2 >= rows.len()
10542}
10543
/// Writes table-of-contents rows to `out`, one `title page` line per row,
/// followed by a blank line.
///
/// Both cells are trimmed. Rows where both are blank are skipped; when only
/// one is present it is written on its own without a separating space. Each
/// row must have at least two cells.
fn render_toc_rows(out: &mut String, rows: &[Vec<String>]) {
    for row in rows {
        let (title, page) = (row[0].trim(), row[1].trim());
        match (title.is_empty(), page.is_empty()) {
            (true, true) => continue,
            (false, false) => {
                out.push_str(title);
                out.push(' ');
                out.push_str(page);
            }
            _ => {
                out.push_str(title);
                out.push_str(page);
            }
        }
        out.push('\n');
    }
    out.push('\n');
}
10564
10565fn cell_text_content(cell: &crate::models::table::TableBorderCell) -> String {
10567 if !cell.content.is_empty() {
10571 let chunks: Vec<_> = cell.content.iter().map(|t| t.base.clone()).collect();
10572 return normalize_common_ocr_text(&crate::models::text::TextLine::concatenate_chunks(
10573 &chunks,
10574 ));
10575 }
10576 let mut text = String::new();
10578 for elem in &cell.contents {
10579 match elem {
10580 ContentElement::Paragraph(p) => text.push_str(&p.base.value()),
10581 ContentElement::TextBlock(tb) => text.push_str(&tb.value()),
10582 ContentElement::TextLine(tl) => text.push_str(&tl.value()),
10583 ContentElement::TextChunk(tc) => text.push_str(&tc.value),
10584 _ => {}
10585 }
10586 }
10587 normalize_common_ocr_text(&repair_fragmented_words(&text))
10588}
10589
/// Merges consecutive Markdown pipe tables into one table where the gap
/// between them is negligible.
///
/// The text is scanned for table blocks: a pipe row directly followed by a
/// separator row, plus the pipe rows after it. A block joins the group of the
/// block before it when the lines between them are all blank, are at most two
/// short `#` heading lines, or are a single short text fragment, and the
/// block does not start with a header that differs from the previous block's
/// header. For a merged block the blank gap lines and its own separator row
/// are dropped, non-blank gap lines become a pipe row whose first cell holds
/// the text, and all rows of a group are padded to the widest column count
/// recorded for that group.
///
/// The input is returned unchanged when it has fewer than four lines or fewer
/// than two table blocks. Otherwise the output is rebuilt line by line, each
/// line terminated by `\n`.
fn merge_adjacent_pipe_tables(markdown: &str) -> String {
    let lines: Vec<&str> = markdown.lines().collect();
    if lines.len() < 4 {
        return markdown.to_string();
    }

    // Number of cells in a line that starts and ends with `|`; 0 otherwise.
    fn count_pipe_cols(line: &str) -> usize {
        let t = line.trim();
        if !t.starts_with('|') || !t.ends_with('|') {
            return 0;
        }
        // Splitting on `|` yields one empty piece before the first pipe and
        // one after the last; those two are not cells.
        t.split('|').count().saturating_sub(2)
    }

    // True for a pipe row whose cells are all non-empty runs of `-` and `:`.
    fn is_separator(line: &str) -> bool {
        let t = line.trim();
        if !t.starts_with('|') || !t.ends_with('|') {
            return false;
        }
        let cells: Vec<&str> = t.split('|').collect();
        if cells.len() < 3 {
            return false;
        }
        cells[1..cells.len() - 1].iter().all(|c| {
            let s = c.trim();
            !s.is_empty() && s.chars().all(|ch| ch == '-' || ch == ':')
        })
    }

    // True when the trimmed line starts and ends with `|` and is longer than
    // two bytes.
    fn is_pipe_row(line: &str) -> bool {
        let t = line.trim();
        t.starts_with('|') && t.ends_with('|') && t.len() > 2
    }

    // Trimmed cell texts of a pipe row; empty for anything else.
    fn pipe_cells(line: &str) -> Vec<String> {
        let t = line.trim();
        if !is_pipe_row(t) {
            return Vec::new();
        }
        let parts = t.split('|').collect::<Vec<_>>();
        parts[1..parts.len() - 1]
            .iter()
            .map(|cell| cell.trim().to_string())
            .collect()
    }

    // Lower-cased alphanumeric characters of a cell, used to compare headers.
    fn normalize_header_cell(cell: &str) -> String {
        cell.chars()
            .filter(|ch| ch.is_alphanumeric())
            .flat_map(|ch| ch.to_lowercase())
            .collect()
    }

    // A row looks like a header when it has at least two cells, at least two
    // of them non-blank, and every non-blank cell contains a letter, has at
    // most four words and is at most 28 bytes long.
    fn looks_like_header_row(line: &str) -> bool {
        let cells = pipe_cells(line);
        if cells.len() < 2 {
            return false;
        }

        let non_empty = cells
            .iter()
            .filter(|cell| !cell.trim().is_empty())
            .collect::<Vec<_>>();
        if non_empty.len() < 2 {
            return false;
        }

        let headerish = non_empty.iter().all(|cell| {
            let trimmed = cell.trim();
            let word_count = trimmed.split_whitespace().count();
            let has_alpha = trimmed.chars().any(|ch| ch.is_alphabetic());
            has_alpha && word_count <= 4 && trimmed.len() <= 28
        });
        headerish
    }

    // Fraction of column positions (over the shorter row) where both
    // normalized cells are non-empty and equal.
    fn header_overlap_ratio(left: &str, right: &str) -> f64 {
        let left_cells = pipe_cells(left)
            .into_iter()
            .map(|cell| normalize_header_cell(&cell))
            .collect::<Vec<_>>();
        let right_cells = pipe_cells(right)
            .into_iter()
            .map(|cell| normalize_header_cell(&cell))
            .collect::<Vec<_>>();
        let width = left_cells.len().min(right_cells.len());
        if width == 0 {
            return 0.0;
        }

        let matches = (0..width)
            .filter(|idx| {
                !left_cells[*idx].is_empty()
                    && !right_cells[*idx].is_empty()
                    && left_cells[*idx] == right_cells[*idx]
            })
            .count();
        matches as f64 / width as f64
    }

    // True when both rows have the same number of cells (at least two), every
    // position where both normalized cells are non-empty matches, and there
    // are at least two such positions.
    fn header_schema_matches(left: &str, right: &str) -> bool {
        let left_cells = pipe_cells(left)
            .into_iter()
            .map(|cell| normalize_header_cell(&cell))
            .collect::<Vec<_>>();
        let right_cells = pipe_cells(right)
            .into_iter()
            .map(|cell| normalize_header_cell(&cell))
            .collect::<Vec<_>>();
        if left_cells.len() != right_cells.len() || left_cells.len() < 2 {
            return false;
        }

        let mut aligned_non_empty = 0usize;
        for (left, right) in left_cells.iter().zip(right_cells.iter()) {
            if left.is_empty() || right.is_empty() {
                continue;
            }
            aligned_non_empty += 1;
            if left != right {
                return false;
            }
        }

        aligned_non_empty >= 2
    }

    // Appends empty cells to a pipe row until it has `target_cols` cells;
    // rows that are already wide enough are returned trimmed but otherwise
    // unchanged.
    fn pad_pipe_row(line: &str, target_cols: usize) -> String {
        let t = line.trim();
        let current_cols = count_pipe_cols(t);
        if current_cols >= target_cols {
            return t.to_string();
        }
        let mut result = t.to_string();
        for _ in current_cols..target_cols {
            result.push_str(" |");
        }
        result
    }

    // One detected table; all positions are indices into `lines`.
    struct Block {
        // First (header) row of the table.
        start: usize,
        // Separator row, always `start + 1`.
        sep: usize,
        // Last row of the table (inclusive); equals `sep` when there is no body.
        end: usize,
        // Column count of the first row.
        cols: usize,
    }

    // Pass 1: locate table blocks.
    let mut blocks: Vec<Block> = Vec::new();
    let mut i = 0;
    while i < lines.len() {
        if i + 1 < lines.len() && is_pipe_row(lines[i]) && is_separator(lines[i + 1]) {
            let cols = count_pipe_cols(lines[i]);
            let sep = i + 1;
            let mut end = sep;
            let mut j = sep + 1;
            // Body rows run until the first non-pipe line or the next separator.
            while j < lines.len() && is_pipe_row(lines[j]) && !is_separator(lines[j]) {
                end = j;
                j += 1;
            }
            blocks.push(Block {
                start: i,
                sep,
                end,
                cols,
            });
            i = end + 1;
        } else {
            i += 1;
        }
    }

    if blocks.len() < 2 {
        return markdown.to_string();
    }

    // Pass 2: decide which blocks merge into the group of the block before them.
    // `merge_leader[bi]` is `None` when block `bi` starts its own group, else
    // the index of the first block of the group it joins.
    let mut merge_leader: Vec<Option<usize>> = vec![None; blocks.len()];
    // Widest column count seen per group, stored at the leader's index.
    let mut group_cols: Vec<usize> = blocks.iter().map(|b| b.cols).collect();
    for bi in 1..blocks.len() {
        let prev = &blocks[bi - 1];
        let curr = &blocks[bi];
        let gap_range = prev.end + 1..curr.start;
        let gap_all_blank = gap_range.clone().all(|li| lines[li].trim().is_empty());
        let leader_idx = merge_leader[bi - 1].unwrap_or(bi - 1);
        let effective_prev_cols = group_cols[leader_idx];
        // Gap consists of one or two `#` lines shorter than 100 bytes; only
        // considered when both sides have at least two columns.
        let gap_heading_only = if !gap_all_blank && effective_prev_cols >= 2 && curr.cols >= 2 {
            let non_blank: Vec<usize> = gap_range
                .clone()
                .filter(|li| !lines[*li].trim().is_empty())
                .collect();
            !non_blank.is_empty()
                && non_blank.len() <= 2
                && non_blank.iter().all(|li| {
                    let t = lines[*li].trim();
                    t.starts_with('#') && t.len() < 100
                })
        } else {
            false
        };
        // Gap is exactly one line under 30 bytes that is not a heading, does
        // not start with `-` or `*`, and contains neither `:` nor "TABLE".
        let gap_short_fragment =
            if !gap_all_blank && !gap_heading_only && effective_prev_cols >= 2 && curr.cols >= 2 {
                let non_blank: Vec<usize> = gap_range
                    .clone()
                    .filter(|li| !lines[*li].trim().is_empty())
                    .collect();
                non_blank.len() == 1 && {
                    let t = lines[non_blank[0]].trim();
                    t.len() < 30
                        && !t.starts_with('#')
                        && !t.starts_with('-')
                        && !t.starts_with('*')
                        && !t.contains(':')
                        && !t.contains("TABLE")
                }
            } else {
                false
            };
        let prev_has_header = looks_like_header_row(lines[prev.start]);
        // The current block's first row only counts as a header when the
        // block has at least two rows after its separator.
        let curr_has_header = curr.end >= curr.sep + 2 && looks_like_header_row(lines[curr.start]);
        let curr_has_distinct_header = prev_has_header
            && curr_has_header
            && !header_schema_matches(lines[prev.start], lines[curr.start])
            && (curr.cols != prev.cols
                || header_overlap_ratio(lines[prev.start], lines[curr.start]) < 1.0);

        if (gap_all_blank || gap_heading_only || gap_short_fragment)
            && prev.cols > 0
            && curr.cols > 0
            && !curr_has_distinct_header
        {
            merge_leader[bi] = Some(leader_idx);
            if curr.cols > group_cols[leader_idx] {
                group_cols[leader_idx] = curr.cols;
            }
        }
    }

    // Every block is padded to its group's widest column count.
    let mut pad_target: Vec<usize> = vec![0; blocks.len()];
    for bi in 0..blocks.len() {
        let leader = merge_leader[bi].unwrap_or(bi);
        pad_target[bi] = group_cols[leader];
    }

    // Pass 3: mark per-line actions for merged blocks. Blank gap lines and
    // the merged block's separator are skipped; non-blank gap lines are
    // turned into pipe rows.
    let mut skip = vec![false; lines.len()];
    let mut convert_to_pipe_row = vec![false; lines.len()];
    for (bi, leader) in merge_leader.iter().enumerate() {
        if leader.is_none() {
            continue;
        }
        // `merge_leader[0]` is never set, so `bi >= 1` here.
        let prev_end = blocks[bi - 1].end;
        let curr = &blocks[bi];
        for li in (prev_end + 1)..curr.start {
            if lines[li].trim().is_empty() {
                skip[li] = true;
            } else {
                convert_to_pipe_row[li] = true;
            }
        }
        skip[curr.sep] = true;
    }

    // Map each line to the block it belongs to; converted gap lines are
    // attributed to the preceding block.
    let mut line_to_block: Vec<Option<usize>> = vec![None; lines.len()];
    for (bi, block) in blocks.iter().enumerate() {
        line_to_block[block.start..=block.end].fill(Some(bi));
    }
    for (bi, leader) in merge_leader.iter().enumerate() {
        if leader.is_none() {
            continue;
        }
        let prev_end = blocks[bi - 1].end;
        let curr = &blocks[bi];
        for li in (prev_end + 1)..curr.start {
            if convert_to_pipe_row[li] {
                line_to_block[li] = Some(bi - 1);
            }
        }
    }

    // Pass 4: emit the rewritten document.
    let mut result = String::new();
    for (li, line) in lines.iter().enumerate() {
        if skip[li] {
            continue;
        }
        if convert_to_pipe_row[li] {
            // Strip leading `#` markers so a heading becomes plain cell text.
            let text = line.trim().trim_start_matches('#').trim();
            if let Some(bi) = line_to_block[li] {
                let target = pad_target[bi];
                if target > 0 && !text.is_empty() {
                    result.push_str(&format!("| {} ", text));
                    for _ in 1..target {
                        result.push_str("| ");
                    }
                    result.push_str("|\n");
                    continue;
                }
            }
            // Nothing usable to convert: keep the line as it was.
            result.push_str(line);
            result.push('\n');
            continue;
        }
        if let Some(bi) = line_to_block[li] {
            let target = pad_target[bi];
            if target > 0 && is_pipe_row(line) && !is_separator(line) {
                result.push_str(&pad_pipe_row(line, target));
                result.push('\n');
            } else if target > 0 && is_separator(line) {
                // Regenerate the separator at the group's width.
                result.push('|');
                for _ in 0..target {
                    result.push_str(" --- |");
                }
                result.push('\n');
            } else {
                result.push_str(line);
                result.push('\n');
            }
        } else {
            result.push_str(line);
            result.push('\n');
        }
    }

    result
}
10943
10944#[cfg(test)]
10945mod tests {
10946 use super::*;
10947 use crate::models::bbox::BoundingBox;
10948 use crate::models::chunks::TextChunk;
10949 use crate::models::content::ContentElement;
10950 use crate::models::enums::{PdfLayer, TextFormat, TextType};
10951 use crate::models::list::{ListBody, ListItem, ListLabel, PDFList};
10952 use crate::models::semantic::{SemanticHeading, SemanticParagraph, SemanticTextNode};
10953 use crate::models::table::{
10954 TableBorder, TableBorderCell, TableBorderRow, TableToken, TableTokenType,
10955 };
10956 use crate::models::text::{TextBlock, TextColumn, TextLine};
10957
10958 #[test]
10959 fn test_empty_doc() {
10960 let doc = PdfDocument::new("test.pdf".to_string());
10961 let md = to_markdown(&doc).unwrap();
10962 assert!(md.contains("No content extracted"));
10963 }
10964
10965 #[test]
10966 fn test_with_title() {
10967 let mut doc = PdfDocument::new("test.pdf".to_string());
10968 doc.title = Some("My Title".to_string());
10969 let md = to_markdown(&doc).unwrap();
10970 assert!(md.starts_with("# My Title\n"));
10971 }
10972
10973 #[test]
10974 fn test_empty_title_not_rendered() {
10975 let mut doc = PdfDocument::new("test.pdf".to_string());
10976 doc.title = Some(" ".to_string());
10977 let md = to_markdown(&doc).unwrap();
10978 assert!(
10979 !md.contains("# "),
10980 "Empty/whitespace title should not produce a heading"
10981 );
10982 }
10983
10984 #[test]
10985 fn test_repair_fragmented_words() {
10986 assert_eq!(
10987 repair_fragmented_words("Jurisdic tion Fore ign Req uire me nts"),
10988 "Jurisdiction Foreign Requirements"
10989 );
10990 }
10991
10992 #[test]
10993 fn test_normalize_common_ocr_text_repairs_units() {
10994 assert_eq!(
10995 normalize_common_ocr_text("10 ߤL at 37 C and -20 oC"),
10996 "10 μL at 37°C and -20°C"
10997 );
10998 }
10999
11000 #[cfg(not(target_arch = "wasm32"))]
11001 #[test]
11002 fn test_build_layout_anchor_rows_reconstructs_four_column_matrix() {
11003 let lines = vec![
11004 "Key Functions by Main Service Flow".to_string(),
11005 "".to_string(),
11006 " Service Stage Function Name Explanation Expected Benefit".to_string(),
11007 "".to_string(),
11008 " 1. Project creation Project creation and Select document type to automatically run project creation, Pipeline configuration with The intuitive UI environment allows the the person in charge to quickly proceed with".to_string(),
11009 "".to_string(),
11010 " management recommended Modelset and Endpoint deployment the entire process from project creation to deployment, improving work efficiency".to_string(),
11011 "".to_string(),
11012 " Conveniently manage raw data to be used for OCR Pack and actual date from live".to_string(),
11013 " 2. Data labeling and Data storage management Provides convenient functions for uploading raw data, viewer, and data management".to_string(),
11014 " (search using image metadata, sorting, filtering, hashtags settings on image data) service".to_string(),
11015 " fine-tuning".to_string(),
11016 " Image data bookmark for Qualitative Evaluation".to_string(),
11017 "".to_string(),
11018 " Create and manage Labeling Creating a Labeling Space to manage raw data annotation, managing labeling resources Labeling work can be outsourced within the pack. Labeled data is continuously".to_string(),
11019 " (Ontology, Characters to be Recognized), data set dump, data set version management supplied from which data sets can be created with ease. The Auto Labeling function".to_string(),
11020 " Space".to_string(),
11021 " 3 increases both efficiency and convenience.".to_string(),
11022 " Various basic models for each selected 5".to_string(),
11023 " document, information comparison between".to_string(),
11024 " Model training Providing a foundation for customers to implement, manage, and upgrade their own".to_string(),
11025 " models, basic model training, training pause function, re-training, cancel function, and OCR model specialized to the customers’ needs".to_string(),
11026 " configuration support for Characters to be Recognized and Ontology that is frequently".to_string(),
11027 " modified while developing specialized models".to_string(),
11028 ];
11029
11030 let header = find_layout_header_candidate(&lines).unwrap();
11031 let rows =
11032 build_layout_anchor_rows(&lines, &extract_layout_entries(&lines, &header)).unwrap();
11033
11034 assert_eq!(
11035 header.headers,
11036 vec![
11037 "Service Stage".to_string(),
11038 "Function Name".to_string(),
11039 "Explanation".to_string(),
11040 "Expected Benefit".to_string()
11041 ]
11042 );
11043 assert_eq!(rows.len(), 4);
11044 assert_eq!(rows[0][0], "1. Project creation");
11045 assert_eq!(rows[0][1], "Project creation and management");
11046 assert!(rows[1][0].contains("fine-tuning"));
11047 assert_eq!(rows[2][1], "Create and manage Labeling Space");
11048 assert_eq!(rows[3][1], "Model training");
11049 assert!(rows[3][2].contains("Various basic models for each selected document"));
11050 }
11051
11052 #[cfg(not(target_arch = "wasm32"))]
11053 #[test]
11054 fn test_build_layout_panel_stub_rows_reconstructs_left_stub_table() {
11055 let lines = vec![
11056 "AI Pack".to_string(),
11057 "Upstage offers 3 AI packs that process unstructured information and data".to_string(),
11058 "".to_string(),
11059 " OCR Recommendation Product semantic search".to_string(),
11060 "".to_string(),
11061 " A solution that recognizes characters in an A solution that recommends the best products and A solution that enables semantic search, analyzes and".to_string(),
11062 " image and extracts necessary information contents organizes key information in unstructured text data".to_string(),
11063 " Pack".to_string(),
11064 " into a standardized form (DB)".to_string(),
11065 "".to_string(),
11066 " Applicable to all fields that require text extraction Applicable to all fields that use any form of Applicable to all fields that deal with various types of".to_string(),
11067 " from standardized documents, such as receipts, recommendation including alternative products, unstructured data containing text information that".to_string(),
11068 "Application bills, credit cards, ID cards, certificates, and medical products and contents that are likely to be require semantic search and conversion into a DB".to_string(),
11069 " receipts purchased next".to_string(),
11070 "".to_string(),
11071 " Achieved 1st place in the OCR World Competition Team with specialists and technologies that Creation of the first natural language evaluation".to_string(),
11072 " The team includes specialists who have received Kaggle’s Gold Medal recommendation system in Korean (KLUE)".to_string(),
11073 " presented 14 papers in the world’s most (Education platform) World’s No.1 in Kaggle text embedding competition in".to_string(),
11074 " Highlight".to_string(),
11075 " renowned AI conferences Proven superior performance of more than 170% E-commerce subject (Shopee)".to_string(),
11076 " compared to other global top-tier recommendation".to_string(),
11077 " models".to_string(),
11078 ];
11079
11080 let header = find_layout_panel_header_candidate(&lines).unwrap();
11081 let rows = build_layout_panel_stub_rows(&lines, &header).unwrap();
11082
11083 assert_eq!(
11084 header.headers,
11085 vec![
11086 "OCR".to_string(),
11087 "Recommendation".to_string(),
11088 "Product semantic search".to_string()
11089 ]
11090 );
11091 assert_eq!(rows.len(), 3);
11092 assert_eq!(rows[0][0], "Pack");
11093 assert!(rows[0][1].contains("image and extracts necessary information"));
11094 assert_eq!(rows[1][0], "Application");
11095 assert!(rows[1][3].contains("require semantic search and conversion into a DB"));
11096 assert_eq!(rows[2][0], "Highlight");
11097 assert!(rows[2][2].contains("top-tier recommendation models"));
11098 }
11099
11100 #[cfg(not(target_arch = "wasm32"))]
11101 #[test]
11102 fn test_extract_layout_toc_entries_merges_wrapped_entry() {
11103 let lines = vec![
11104 "Table of Contents".to_string(),
11105 "".to_string(),
11106 "Executive Summary 4".to_string(),
11107 "Legal Framework 6".to_string(),
11108 "Election Administration 11".to_string(),
11109 "Civil Society Engagement 15".to_string(),
11110 "Political Parties, Candidates Registration and Election 18".to_string(),
11111 "Campaign".to_string(),
11112 "Media Freedom and Access to Information 25".to_string(),
11113 "Voter Education and Awareness 29".to_string(),
11114 "Participation of Marginalized Sectors 31".to_string(),
11115 "Recommendations 39".to_string(),
11116 ];
11117
11118 let (title, entries) = extract_layout_toc_entries(&lines).unwrap();
11119 assert_eq!(title, "Table of Contents");
11120 assert_eq!(entries.len(), 9);
11121 assert_eq!(entries[0].title, "Executive Summary");
11122 assert_eq!(entries[0].page, "4");
11123 assert_eq!(
11124 entries[4].title,
11125 "Political Parties, Candidates Registration and Election Campaign"
11126 );
11127 assert_eq!(entries[4].page, "18");
11128 }
11129
11130 #[cfg(not(target_arch = "wasm32"))]
11131 fn make_bbox_layout_line(words: &[(&str, f64, f64)], bottom: f64, top: f64) -> BBoxLayoutLine {
11132 make_bbox_layout_line_in_block(0, words, bottom, top)
11133 }
11134
11135 #[cfg(not(target_arch = "wasm32"))]
11136 fn make_bbox_layout_line_in_block(
11137 block_id: usize,
11138 words: &[(&str, f64, f64)],
11139 bottom: f64,
11140 top: f64,
11141 ) -> BBoxLayoutLine {
11142 BBoxLayoutLine {
11143 block_id,
11144 bbox: BoundingBox::new(
11145 Some(1),
11146 words.first().map(|(_, left, _)| *left).unwrap_or(72.0),
11147 bottom,
11148 words.last().map(|(_, _, right)| *right).unwrap_or(320.0),
11149 top,
11150 ),
11151 words: words
11152 .iter()
11153 .map(|(text, left, right)| BBoxLayoutWord {
11154 bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
11155 text: (*text).to_string(),
11156 })
11157 .collect(),
11158 }
11159 }
11160
11161 #[cfg(not(target_arch = "wasm32"))]
11162 #[test]
11163 fn test_detect_layout_open_plate_recovers_two_column_species_rows() {
11164 let lines = vec![
11165 make_bbox_layout_line(
11166 &[
11167 ("Fish", 60.0, 76.0),
11168 ("species", 78.0, 107.0),
11169 ("on", 109.0, 119.0),
11170 ("IUCN", 121.0, 142.0),
11171 ("Red", 144.0, 159.0),
11172 ("List", 161.0, 176.0),
11173 ],
11174 649.0,
11175 660.0,
11176 ),
11177 make_bbox_layout_line(
11178 &[("Potosi", 60.0, 84.0), ("Pupfish", 86.0, 114.0)],
11179 632.0,
11180 643.0,
11181 ),
11182 make_bbox_layout_line(
11183 &[("Cyprinodon", 132.0, 176.0), ("alvarezi", 178.0, 207.0)],
11184 632.0,
11185 643.0,
11186 ),
11187 make_bbox_layout_line(
11188 &[
11189 ("La", 60.0, 69.0),
11190 ("Palma", 71.0, 94.0),
11191 ("Pupfish", 96.0, 124.0),
11192 ("Cyprinodon", 132.0, 176.0),
11193 ("longidorsalis", 178.0, 224.0),
11194 ],
11195 616.0,
11196 627.0,
11197 ),
11198 make_bbox_layout_line(
11199 &[("Butterfly", 60.0, 94.0), ("Splitfin", 96.0, 123.0)],
11200 600.0,
11201 611.0,
11202 ),
11203 make_bbox_layout_line(
11204 &[("Ameca", 132.0, 156.0), ("splendens", 158.0, 194.0)],
11205 600.0,
11206 611.0,
11207 ),
11208 make_bbox_layout_line(
11209 &[("Golden", 60.0, 88.0), ("Skiffia", 90.0, 113.0)],
11210 584.0,
11211 595.0,
11212 ),
11213 make_bbox_layout_line(
11214 &[("Skiffia", 132.0, 155.0), ("francesae", 158.0, 193.0)],
11215 584.0,
11216 595.0,
11217 ),
11218 make_bbox_layout_line(
11219 &[
11220 ("Table", 56.0, 74.0),
11221 ("6.1:", 76.0, 87.0),
11222 ("Four", 89.0, 105.0),
11223 ("fish", 107.0, 119.0),
11224 ("species", 121.0, 145.0),
11225 ("on", 147.0, 155.0),
11226 ("IUCN", 157.0, 176.0),
11227 ("Red", 178.0, 190.0),
11228 ("List", 192.0, 205.0),
11229 ("held", 279.0, 293.0),
11230 ("in", 295.0, 302.0),
11231 ("public", 304.0, 325.0),
11232 ("aquariums.", 327.0, 365.0),
11233 ],
11234 556.0,
11235 566.0,
11236 ),
11237 ];
11238
11239 let plate = detect_layout_open_plate(576.0, &lines).unwrap();
11240 assert_eq!(plate.heading, "Fish species on IUCN Red List");
11241 assert_eq!(
11242 plate.header_row,
11243 vec![
11244 "Fish species on IUCN Red List".to_string(),
11245 "Scientific name".to_string()
11246 ]
11247 );
11248 assert_eq!(plate.rows.len(), 4);
11249 assert_eq!(
11250 plate.rows[1],
11251 vec![
11252 "La Palma Pupfish".to_string(),
11253 "Cyprinodon longidorsalis".to_string()
11254 ]
11255 );
11256 assert!(plate
11257 .caption
11258 .starts_with("Table 6.1: Four fish species on IUCN Red List"));
11259 }
11260
11261 #[cfg(not(target_arch = "wasm32"))]
11262 #[test]
11263 fn test_extract_layout_narrative_bridge_recovers_left_prose_and_defers_captions() {
11264 let plate = OpenPlateCandidate {
11265 heading: "Fish species on IUCN Red List".to_string(),
11266 header_row: vec![
11267 "Fish species on IUCN Red List".to_string(),
11268 "Scientific name".to_string(),
11269 ],
11270 rows: vec![],
11271 caption: "Table 6.1".to_string(),
11272 cutoff_top_y: 560.0,
11273 };
11274 let lines = vec![
11275 make_bbox_layout_line(
11276 &[
11277 ("Public", 56.0, 83.0),
11278 ("aquariums,", 88.0, 135.0),
11279 ("because", 140.0, 174.0),
11280 ],
11281 509.0,
11282 521.0,
11283 ),
11284 make_bbox_layout_line(
11285 &[
11286 ("of", 180.0, 188.0),
11287 ("their", 194.0, 214.0),
11288 ("in-", 220.0, 233.0),
11289 ],
11290 509.0,
11291 521.0,
11292 ),
11293 make_bbox_layout_line(
11294 &[
11295 ("house", 56.0, 82.0),
11296 ("expertise,", 84.0, 125.0),
11297 ("can", 128.0, 143.0),
11298 ],
11299 495.0,
11300 507.0,
11301 ),
11302 make_bbox_layout_line(
11303 &[("act", 146.0, 159.0), ("quickly", 161.0, 191.0)],
11304 495.0,
11305 507.0,
11306 ),
11307 make_bbox_layout_line_in_block(
11308 1,
11309 &[
11310 ("Figure", 242.0, 265.0),
11311 ("6.3:", 267.0, 280.0),
11312 ("Photo", 282.0, 303.0),
11313 ],
11314 355.0,
11315 366.0,
11316 ),
11317 make_bbox_layout_line_in_block(
11318 1,
11319 &[
11320 ("of", 305.0, 312.0),
11321 ("the", 314.0, 325.0),
11322 ("species.", 327.0, 360.0),
11323 ],
11324 355.0,
11325 366.0,
11326 ),
11327 make_bbox_layout_line(
11328 &[
11329 ("The", 56.0, 73.0),
11330 ("breeding", 77.0, 114.0),
11331 ("colonies", 118.0, 153.0),
11332 ],
11333 330.0,
11334 342.0,
11335 ),
11336 make_bbox_layout_line(
11337 &[
11338 ("of", 157.0, 165.0),
11339 ("the", 169.0, 183.0),
11340 ("Butterfly", 187.0, 224.0),
11341 ("Splitfin", 228.0, 258.0),
11342 ("at", 314.0, 323.0),
11343 ("the", 327.0, 341.0),
11344 ("London", 345.0, 377.0),
11345 ("Zoo", 381.0, 397.0),
11346 ("and", 401.0, 416.0),
11347 ("elsewhere", 420.0, 463.0),
11348 ("serve", 467.0, 489.0),
11349 ("as", 493.0, 502.0),
11350 ("ark", 506.0, 519.0),
11351 ],
11352 330.0,
11353 342.0,
11354 ),
11355 make_bbox_layout_line(
11356 &[
11357 ("Figure", 56.0, 79.0),
11358 ("6.4:", 81.0, 94.0),
11359 ("Lake", 96.0, 116.0),
11360 ("Sturgeon", 118.0, 158.0),
11361 ],
11362 104.0,
11363 116.0,
11364 ),
11365 ];
11366
11367 let bridge = extract_layout_narrative_bridge(576.0, &lines, &plate).unwrap();
11368 assert!(bridge
11369 .bridge_paragraph
11370 .as_deref()
11371 .is_some_and(|text| text.contains("Public aquariums") && text.contains("expertise")));
11372 assert_eq!(bridge.deferred_captions.len(), 2);
11373 assert!(bridge.deferred_captions[0].contains("Figure 6.3:"));
11374 assert!(bridge.deferred_captions[0].contains("species."));
11375 }
11376
11377 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11378 #[test]
11379 fn test_detect_layout_ocr_benchmark_dashboard_on_real_pdf() {
11380 let path =
11381 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000199.pdf");
11382 let (page_width, lines) = read_pdftotext_bbox_layout_lines(&path).unwrap();
11383 let dashboard = detect_layout_ocr_benchmark_dashboard(page_width, &lines).unwrap();
11384
11385 assert_eq!(
11386 dashboard.title,
11387 "Base Model Performance Evaluation of Upstage OCR Pack"
11388 );
11389 assert_eq!(dashboard.left_columns.len(), 2);
11390 assert_eq!(
11391 dashboard.left_columns[0],
11392 "Scene (Photographed document image)"
11393 );
11394 assert_eq!(
11395 dashboard.left_rows[0],
11396 vec![
11397 "Company A²".to_string(),
11398 "70.23".to_string(),
11399 "80.41".to_string()
11400 ]
11401 );
11402 assert_eq!(
11403 dashboard.right_rows[0],
11404 vec![
11405 "OCR-Recall³".to_string(),
11406 "73.2".to_string(),
11407 "94.2".to_string(),
11408 "94.1".to_string()
11409 ]
11410 );
11411 assert_eq!(dashboard.right_rows[3][0], "Parsing-F¹");
11412 assert_eq!(dashboard.right_rows[3][1], "68.0");
11413 assert_eq!(dashboard.right_rows[3][2], "82.65");
11414 assert_eq!(dashboard.right_rows[3][3], "82.65");
11415 assert!(!dashboard.definition_notes.is_empty());
11416 assert!(!dashboard.source_notes.is_empty());
11417 }
11418
    /// Checks that `split_layout_line_spans` returns exactly three spans for a line
    /// containing multi-byte curly quotes, and that the second element of each span
    /// tuple carries the expected text.
    // NOTE(review): the three-span expectation presumably relies on the whitespace
    // runs inside the literal below — confirm the literal's spacing before editing it.
    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_split_layout_line_spans_handles_unicode_boundaries() {
        let line = "Title “Podcast #EP32: SDGs dan Anak Muda” 2024";
        let spans = split_layout_line_spans(line);
        assert_eq!(spans.len(), 3);
        assert_eq!(spans[0].1, "Title");
        // The middle span must keep the quoted phrase, including its closing curly quote.
        assert!(spans[1].1.contains("Podcast #EP32: SDGs dan Anak Muda"));
        assert!(spans[1].1.ends_with('”'));
        assert!(spans[2].1.ends_with("24"));
    }
11430
11431 #[cfg(not(target_arch = "wasm32"))]
11432 #[test]
11433 fn test_render_layout_single_caption_chart_document_on_real_pdf() {
11434 let path =
11435 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000037.pdf");
11436 let doc = PdfDocument {
11437 title: None,
11438 source_path: Some(path.to_string_lossy().to_string()),
11439 number_of_pages: 1,
11440 kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11441 .unwrap()
11442 .kids,
11443 ..PdfDocument::new("01030000000037.pdf".to_string())
11444 };
11445 let rendered = render_layout_single_caption_chart_document(&doc).unwrap();
11446 assert!(rendered.contains("# 3. Impact on Business Operations"));
11447 assert!(rendered.contains("## 3.1. Status of Business Operations"));
11448 assert!(rendered.contains("As shown in Figure 3.1.1, the number of MSMEs"));
11449 assert!(
11450 rendered.contains("Figure 3.1.1: Status of operations during each survey phase (%)")
11451 );
11452 assert!(
11453 rendered.contains("lockdown period. In the handicraft/textile sector, 30% of MSMEs")
11454 );
11455 assert!(!rendered.contains("| Lockdown Period |"));
11456 }
11457
11458 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11459 #[test]
11460 fn test_to_markdown_captioned_media_document_on_real_pdf_72() {
11461 let path =
11462 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000072.pdf");
11463 let doc = PdfDocument {
11464 title: None,
11465 source_path: Some(path.to_string_lossy().to_string()),
11466 number_of_pages: 1,
11467 kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11468 .unwrap()
11469 .kids,
11470 ..PdfDocument::new("01030000000072.pdf".to_string())
11471 };
11472 let md = to_markdown(&doc).unwrap();
11473 assert!(md.contains("## Diagram 5"), "{md}");
11474 assert!(
11475 md.contains("**Distribution of Komnas HAM’s YouTube Content (2019-2020)**"),
11476 "{md}"
11477 );
11478 assert!(
11479 md.contains(
11480 "As of 1 December 2021, the Komnas HAM’s YouTube channel has 2,290 subscribers"
11481 ),
11482 "{md}"
11483 );
11484 assert!(md.contains("**Figure 4**"), "{md}");
11485 assert!(
11486 md.contains("*Komnas HAM’s YouTube channel as of 1 December 2021*"),
11487 "{md}"
11488 );
11489 }
11490
11491 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11492 #[test]
11493 fn test_to_markdown_captioned_media_document_on_real_pdf_73() {
11494 let path =
11495 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000073.pdf");
11496 let doc = PdfDocument {
11497 title: None,
11498 source_path: Some(path.to_string_lossy().to_string()),
11499 number_of_pages: 1,
11500 kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11501 .unwrap()
11502 .kids,
11503 ..PdfDocument::new("01030000000073.pdf".to_string())
11504 };
11505 let md = to_markdown(&doc).unwrap();
11506 assert!(
11507 md.starts_with("# In this content, DPN Argentina provides a brief explanation"),
11508 "{md}"
11509 );
11510 assert!(
11511 md.contains("Examples of such greetings are as follows:"),
11512 "{md}"
11513 );
11514 assert!(md.contains("*Image*"), "{md}");
11515 assert!(md.contains("**Figure 6**"), "{md}");
11516 assert!(md.contains("**DPN Argentina**"), "{md}");
11517 assert!(
11518 md.contains("**Content: World Health Day Celebration (7 April 2021).**^98"),
11519 "{md}"
11520 );
11521 assert!(md.contains("**Footnote:**"), "{md}");
11522 assert!(
11523 md.contains("https://twitter.com/DPNArgentina/status/1379765916259483648."),
11524 "{md}"
11525 );
11526 }
11527
11528 #[cfg(not(target_arch = "wasm32"))]
11529 #[test]
11530 fn test_render_layout_captioned_media_document_does_not_fire_on_real_pdf_14() {
11531 let path =
11532 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
11533 let doc = PdfDocument {
11534 title: None,
11535 source_path: Some(path.to_string_lossy().to_string()),
11536 number_of_pages: 1,
11537 kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11538 .unwrap()
11539 .kids,
11540 ..PdfDocument::new("01030000000014.pdf".to_string())
11541 };
11542 assert!(render_layout_captioned_media_document(&doc).is_none());
11543 }
11544
11545 #[cfg(not(target_arch = "wasm32"))]
11546 #[test]
11547 fn test_to_markdown_real_pdf_14_preserves_body_paragraphs() {
11548 let path =
11549 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000014.pdf");
11550 let doc = PdfDocument {
11551 title: None,
11552 source_path: Some(path.to_string_lossy().to_string()),
11553 number_of_pages: 1,
11554 kids: crate::convert(&path, &crate::api::config::ProcessingConfig::default())
11555 .unwrap()
11556 .kids,
11557 ..PdfDocument::new("01030000000014.pdf".to_string())
11558 };
11559 let md = to_markdown(&doc).unwrap();
11560 assert!(
11561 md.contains("These images also show that different areas are used by men and by women"),
11562 "{md}"
11563 );
11564 }
11565
11566 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11567 #[test]
11568 fn test_render_layout_recommendation_infographic_on_real_pdf() {
11569 let path =
11570 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000183.pdf");
11571 let doc = PdfDocument {
11572 title: None,
11573 source_path: Some(path.to_string_lossy().to_string()),
11574 number_of_pages: 1,
11575 kids: Vec::new(),
11576 ..PdfDocument::new("01030000000183.pdf".to_string())
11577 };
11578 let rendered = render_layout_recommendation_infographic_document(&doc).unwrap();
11579 assert!(rendered.contains("# Recommendation Pack: Track Record"));
11580 assert!(rendered.contains("## Comparison with Beauty Commerce Recommendation Models"));
11581 assert!(rendered.contains("| Graph-RecSys | 0.4048 |"));
11582 assert!(rendered.contains("| Current Service Recommendation Algorithm | 0.159 |"));
11583 assert!(rendered.contains("## Education Content Platform PoC Case"));
11584 assert!(rendered.contains("| DKT Model | 0.882 |"));
11585 assert!(rendered.contains("Compared to regular model"));
11586 }
11587
11588 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11589 #[test]
11590 fn test_render_layout_stacked_bar_report_on_real_pdf() {
11591 let path =
11592 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000038.pdf");
11593 let doc = PdfDocument {
11594 title: None,
11595 source_path: Some(path.to_string_lossy().to_string()),
11596 number_of_pages: 1,
11597 kids: Vec::new(),
11598 ..PdfDocument::new("01030000000038.pdf".to_string())
11599 };
11600 let rendered = render_layout_stacked_bar_report_document(&doc);
11601 if rendered.is_none() {
11602 let (page_width, lines) = read_pdftotext_bbox_layout_lines(&path).unwrap();
11603 let blocks = collect_bbox_layout_blocks(&lines);
11604 let figures = collect_layout_figure_captions(&blocks);
11605 let narrative = detect_layout_stacked_bar_narrative(&blocks);
11606 eprintln!("page_width={page_width} figures={}", figures.len());
11607 if let Some(first) = figures.first() {
11608 eprintln!("figure1={}", bbox_layout_block_text(first));
11609 }
11610 if let Some(second) = figures.get(1) {
11611 eprintln!("figure2={}", bbox_layout_block_text(second));
11612 }
11613 eprintln!("narrative={}", narrative.is_some());
11614 if let Some(narrative) = &narrative {
11615 eprintln!("heading={}", narrative.heading);
11616 eprintln!("paragraphs={}", narrative.paragraphs.len());
11617 eprintln!("footnote={:?}", narrative.footnote);
11618 }
11619 for block in &blocks {
11620 let text = bbox_layout_block_text(block);
11621 if text.contains("July")
11622 || text.contains("October")
11623 || text.contains("January")
11624 || text.contains("Will ")
11625 || text.contains("Don’t")
11626 || text.starts_with("6.2.")
11627 || text.starts_with("5.")
11628 {
11629 eprintln!(
11630 "block top={:.1} bottom={:.1} left={:.1} right={:.1} text={}",
11631 block.bbox.top_y,
11632 block.bbox.bottom_y,
11633 block.bbox.left_x,
11634 block.bbox.right_x,
11635 text
11636 );
11637 }
11638 }
11639 if figures.len() >= 2 {
11640 let first = detect_layout_three_month_stacked_figure(
11641 &blocks,
11642 &lines,
11643 page_width,
11644 figures[0].clone(),
11645 figures[1].bbox.top_y,
11646 );
11647 eprintln!("figure_one_ok={}", first.is_some());
11648 if let Some(narrative) = &narrative {
11649 let second = detect_layout_sector_bar_figure(
11650 &blocks,
11651 &lines,
11652 page_width,
11653 figures[1].clone(),
11654 narrative.top_y,
11655 );
11656 eprintln!("figure_two_ok={}", second.is_some());
11657 }
11658 }
11659 }
11660 let rendered = rendered.unwrap();
11661 assert!(rendered.contains("# Figure 6.1.1:"));
11662 assert!(rendered.contains("| Will not terminate employment | 51 | 81 | 73 |"));
11663 assert!(rendered.contains("# 6.2. Expectations for Re-Hiring Employees"));
11664 }
11665
11666 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11667 #[test]
11668 fn test_render_layout_multi_figure_chart_document_on_real_pdf() {
11669 let path =
11670 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000076.pdf");
11671 let doc = PdfDocument {
11672 title: None,
11673 source_path: Some(path.to_string_lossy().to_string()),
11674 number_of_pages: 1,
11675 kids: Vec::new(),
11676 ..PdfDocument::new("01030000000076.pdf".to_string())
11677 };
11678 let rendered = render_layout_multi_figure_chart_document(&doc).unwrap();
11679 assert!(rendered.contains("# Figures from the Document"));
11680 assert!(
11681 rendered.contains("## Figure 1.7. Non-citizen population in Malaysia (in thousands)")
11682 );
11683 assert!(rendered.contains("| 2016 | 3,230 |"));
11684 assert!(rendered.contains("| 2021 | 2,693 |"));
11685 assert!(
11686 rendered.contains("## Figure 1.8. Singapore foreign workforce stock (in thousands)")
11687 );
11688 assert!(rendered.contains("| 2016 (Dec) | 1,393 |"));
11689 assert!(rendered.contains("| 2021 (Dec) | 1,200 |"));
11690 assert!(rendered.contains(
11691 "Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate."
11692 ));
11693 }
11694
11695 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11696 #[test]
11697 fn test_render_layout_open_plate_document_on_real_pdf() {
11698 let path =
11699 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
11700 let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11701 let rendered = render_layout_open_plate_document(&doc).unwrap();
11702 assert!(rendered.contains("# Fish species on IUCN Red List"));
11703 assert!(rendered.contains("| Potosi Pupfish | Cyprinodon alvarezi |"));
11704 assert!(rendered.contains("| Golden Skiffia | Skiffia francesae |"));
11705 assert!(rendered.contains("*Table 6.1: Four fish species on IUCN Red List"));
11706 assert!(rendered.contains("---"));
11707 assert!(rendered.contains("Public aquariums, because of their inhouse expertise"));
11708 }
11709
11710 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11711 #[test]
11712 fn test_to_markdown_open_plate_document_on_real_pdf() {
11713 let path =
11714 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000132.pdf");
11715 let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11716 let md = to_markdown(&doc).unwrap();
11717
11718 assert!(md.contains("# Fish species on IUCN Red List"), "{md}");
11719 assert!(
11720 md.contains("| Potosi Pupfish | Cyprinodon alvarezi |"),
11721 "{md}"
11722 );
11723 assert!(
11724 md.contains("| Golden Skiffia | Skiffia francesae |"),
11725 "{md}"
11726 );
11727 assert!(
11728 md.contains("*Table 6.1: Four fish species on IUCN Red List"),
11729 "{md}"
11730 );
11731 assert!(
11732 md.contains("The breeding colonies of the Butterfly Splitfin"),
11733 "{md}"
11734 );
11735 }
11736
11737 #[cfg(not(target_arch = "wasm32"))]
11738 #[test]
11739 fn test_to_markdown_does_not_misclassify_open_plate_pdf_36() {
11740 let path =
11741 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000036.pdf");
11742 let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11743 let md = to_markdown(&doc).unwrap();
11744
11745 assert!(md.contains("# 2. General Profile of MSMEs"), "{md}");
11746 assert!(
11747 md.contains("In July 2020, the survey established a general profile"),
11748 "{md}"
11749 );
11750 assert!(
11751 md.contains(
11752 "The tourism sub-sectors interviewed included lodging, restaurants and bars"
11753 ),
11754 "{md}"
11755 );
11756 assert!(
11757 !md.starts_with("# Business characteristics. Business size was"),
11758 "{md}"
11759 );
11760 }
11761
11762 #[cfg(not(target_arch = "wasm32"))]
11763 #[test]
11764 fn test_to_markdown_does_not_misclassify_open_plate_pdf_40() {
11765 let path =
11766 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000040.pdf");
11767 let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11768 let md = to_markdown(&doc).unwrap();
11769
11770 assert!(
11771 md.contains(
11772 "Thailand, Philippines and Indonesia in particular, identifying known experts"
11773 ),
11774 "{md}"
11775 );
11776 assert!(
11777 md.contains("Figure 1: Age by gender of respondents"),
11778 "{md}"
11779 );
11780 assert!(md.contains("Gender Analysis of Violent Extremism"), "{md}");
11781 assert!(
11782 !md.starts_with("# Thailand, Philippines and Indonesia in"),
11783 "{md}"
11784 );
11785 }
11786
11787 #[cfg(not(target_arch = "wasm32"))]
11788 #[test]
11789 fn test_to_markdown_does_not_misclassify_open_plate_pdf_64() {
11790 let path =
11791 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000064.pdf");
11792 let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11793 let md = to_markdown(&doc).unwrap();
11794
11795 assert!(md.contains("estuarine influenced areas."), "{md}");
11796 assert!(md.contains("| MANILA | 2454 | 6,125 |"), "{md}");
11797 assert!(
11798 md.contains("The port of Manila has been documented"),
11799 "{md}"
11800 );
11801 assert!(!md.starts_with("# CAGAYAN DE ORO"), "{md}");
11802 }
11803
11804 #[cfg(not(target_arch = "wasm32"))]
11805 #[test]
11806 fn test_detect_footnote_citation_regions_on_real_pdf() {
11807 let path =
11808 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
11809 let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11810 let regions = detect_footnote_citation_regions(&doc);
11811 assert!(!regions.is_empty(), "{regions:?}");
11812 assert!(
11813 regions.iter().any(|region| {
11814 region.rendered.contains("<table>")
11815 && region.rendered.contains("<td>25</td>")
11816 && region.rendered.contains("<td>29</td>")
11817 }),
11818 "{regions:#?}"
11819 );
11820 assert!(
11821 regions.iter().any(|region| {
11822 region.rendered.contains("<table>")
11823 && region.rendered.contains("<td>30</td>")
11824 && region.rendered.contains("<td>33</td>")
11825 }),
11826 "{regions:#?}"
11827 );
11828 }
11829
11830 #[cfg(not(target_arch = "wasm32"))]
11831 #[test]
11832 fn test_to_markdown_renders_footnote_citation_tables_on_real_pdf() {
11833 let path =
11834 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000008.pdf");
11835 let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11836 let md = to_markdown(&doc).unwrap();
11837
11838 assert!(md.contains("<table>"), "{md}");
11839 assert!(md.contains("<th>Footnote</th><th>Citation</th>"), "{md}");
11840 assert!(md.contains("<td>25</td><td>Wiliam Beckford"), "{md}");
11841 assert!(
11842 md.contains("<td>29</td><td>Pope, The Rape of the Lock, 69.</td>"),
11843 "{md}"
11844 );
11845 assert!(
11846 md.contains("<td>30</td><td>Beawes, Lex Mercatoria Rediviva, 791.</td>"),
11847 "{md}"
11848 );
11849 assert!(
11850 md.contains("<td>32</td><td>Beawes, Lex Mercatoria Rediviva, 792.</td>"),
11851 "{md}"
11852 );
11853 assert!(
11854 md.contains("<td>33</td><td>M.M., Pharmacopoia Reformata:"),
11855 "{md}"
11856 );
11857 }
11858
11859 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11860 #[test]
11861 fn test_to_markdown_projection_sheet_document_on_real_pdf() {
11862 let path =
11863 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000128.pdf");
11864 let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11865 let md = to_markdown(&doc).unwrap();
11866
11867 assert!(md.contains("# Table and Figure from the Document"), "{md}");
11868 assert!(md.contains("| A | B | C | D | E |"), "{md}");
11869 assert!(
11870 md.contains("| 10 | 8 | 19.73214458 | 17.99 | 21.47 |"),
11871 "{md}"
11872 );
11873 assert!(
11874 md.contains("**Figure 13.3. Graph of Projection Estimates**"),
11875 "{md}"
11876 );
11877 assert!(md.contains("[Open Template in Microsoft Excel](#)"), "{md}");
11878 assert!(
11879 md.contains("*298 | Ch. 13. Homogeneous Investment Types*"),
11880 "{md}"
11881 );
11882 }
11883
11884 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11885 #[test]
11886 fn test_to_markdown_appendix_tables_document_on_real_pdf() {
11887 let path =
11888 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000082.pdf");
11889 let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11890 let md = to_markdown(&doc).unwrap();
11891
11892 assert!(md.contains("# Appendices"), "{md}");
11893 assert!(
11894 md.contains("## TABLE 28: BREAKDOWN OF IMPRISONMENT CLAUSES IN STATE LAWS"),
11895 "{md}"
11896 );
11897 assert!(md.contains("| Imprisonment terms | Number of clauses | Percentage of all states | Percentage of total |"), "{md}");
11898 assert!(
11899 md.contains("| Less than 3 months | 4,448 | 21.3% | 17.0% |"),
11900 "{md}"
11901 );
11902 assert!(
11903 md.contains("## TABLE 29: STATES WITH MORE THAN 1,000 IMPRISONMENT CLAUSES"),
11904 "{md}"
11905 );
11906 assert!(
11907 md.contains(
11908 "| State | Number of clauses | GSDP (In Rs lakh crore) | GSDP (In $ billion) |"
11909 ),
11910 "{md}"
11911 );
11912 assert!(md.contains("| Gujarat | 1469 | 15.6 | 200.4 |"), "{md}");
11913 assert!(
11914 md.contains("*Sources: TeamLease Regtech, and Reserve Bank of India for GSDPs*"),
11915 "{md}"
11916 );
11917 assert!(md.contains("*Exchange rate: Rs 75 to USD*"), "{md}");
11918 }
11919
11920 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11921 #[test]
11922 fn test_to_markdown_titled_dual_table_document_on_real_pdf() {
11923 let path =
11924 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000084.pdf");
11925 let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11926 let md = to_markdown(&doc).unwrap();
11927
11928 assert!(md.starts_with("# Jailed for Doing Business"), "{md}");
11929 assert!(
11930 md.contains("## TABLE 38: THREE CASE STUDIES ON NBFC COMPLIANCES*"),
11931 "{md}"
11932 );
11933 assert!(
11934 md.contains("| Percentage of imprisonment clauses | 20% | 30% | 37% |"),
11935 "{md}"
11936 );
11937 assert!(
11938 md.contains("## TABLE 39: BREAKDOWN OF IMPRISONMENT CLAUSES IN NBFC CASE STUDIES*"),
11939 "{md}"
11940 );
11941 assert!(
11942 md.contains("| 5 years to 10 years | 19 | 19 | 19 |"),
11943 "{md}"
11944 );
11945 assert!(
11946 md.contains("*These are real data from three NBFCs*"),
11947 "{md}"
11948 );
11949 }
11950
11951 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11952 #[test]
11953 fn test_to_markdown_registration_report_document_on_real_pdf() {
11954 let path =
11955 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000047.pdf");
11956 let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11957 let md = to_markdown(&doc).unwrap();
11958
11959 assert!(
11960 md.starts_with("# ANFREL Pre-Election Assessment Mission Report"),
11961 "{md}"
11962 );
11963 assert!(
11964 md.contains(
11965 "| 14 | Cambodian Indigeneous Peoples Democracy Party | 19 | 194 | 19 | 202 | +8 |"
11966 ),
11967 "{md}"
11968 );
11969 assert!(
11970 md.contains("| | Total | | 84,208 | | 86,092 | +1,884 |"),
11971 "{md}"
11972 );
11973 assert!(!md.contains("| | Democracy Party |"), "{md}");
11974 }
11975
11976 #[cfg(all(not(target_arch = "wasm32"), not(target_os = "windows")))]
11977 #[test]
11978 fn test_to_markdown_dual_table_article_document_on_real_pdf() {
11979 let path =
11980 Path::new(env!("CARGO_MANIFEST_DIR")).join("../../benchmark/pdfs/01030000000190.pdf");
11981 let doc = crate::convert(&path, &crate::api::config::ProcessingConfig::default()).unwrap();
11982 let md = to_markdown(&doc).unwrap();
11983
11984 assert!(
11985 md.starts_with("# Table 6: Performance comparison amongst the merge candidates"),
11986 "{md}"
11987 );
11988 assert!(
11989 md.contains("*Table 6*: Performance comparison amongst the merge candidates."),
11990 "{md}"
11991 );
11992 assert!(md.contains("# Table 7: Ablation studies on the different merge methods used for obtaining the final model"), "{md}");
11993 assert!(!md.contains("*Table 6*: Table 6:"), "{md}");
11994 assert!(!md.contains("| Merge v1"), "{md}");
11995 }
11996
11997 #[test]
11998 fn test_normalize_list_text_strips_redundant_bullets() {
11999 assert_eq!(
12000 normalize_list_text("• Collected via surveys"),
12001 "Collected via surveys"
12002 );
12003 assert!(is_pure_bullet_marker("•"));
12004 }
12005
12006 #[test]
12007 fn test_reference_continuation_detected() {
12008 assert!(should_merge_paragraph_text(
12009 "Scaling laws for transfer.",
12010 "arXiv preprint arXiv:2102.01293."
12011 ));
12012 }
12013
12014 #[test]
12015 fn test_enumerated_markers_are_detected() {
12016 assert!(starts_with_enumerated_marker("iii. Third item"));
12017 assert!(starts_with_enumerated_marker("1) First item"));
12018 assert!(starts_with_enumerated_marker("a. Lettered item"));
12019 assert!(!starts_with_enumerated_marker("Figure 1. Caption"));
12020 assert!(!starts_with_enumerated_marker("Natural dispersal"));
12021 }
12022
12023 fn make_heading(text: &str) -> ContentElement {
12024 let bbox = BoundingBox::new(Some(1), 72.0, 700.0, 300.0, 712.0);
12025 let chunk = TextChunk {
12026 value: text.to_string(),
12027 bbox: bbox.clone(),
12028 font_name: "Lato-Bold".to_string(),
12029 font_size: 12.0,
12030 font_weight: 700.0,
12031 italic_angle: 0.0,
12032 font_color: "#000000".to_string(),
12033 contrast_ratio: 21.0,
12034 symbol_ends: vec![],
12035 text_format: TextFormat::Normal,
12036 text_type: TextType::Regular,
12037 pdf_layer: PdfLayer::Main,
12038 ocg_visible: true,
12039 index: None,
12040 page_number: Some(1),
12041 level: None,
12042 mcid: None,
12043 };
12044 let line = TextLine {
12045 bbox: bbox.clone(),
12046 index: None,
12047 level: None,
12048 font_size: 12.0,
12049 base_line: 702.0,
12050 slant_degree: 0.0,
12051 is_hidden_text: false,
12052 text_chunks: vec![chunk],
12053 is_line_start: true,
12054 is_line_end: true,
12055 is_list_line: false,
12056 connected_line_art_label: None,
12057 };
12058 let block = TextBlock {
12059 bbox: bbox.clone(),
12060 index: None,
12061 level: None,
12062 font_size: 12.0,
12063 base_line: 702.0,
12064 slant_degree: 0.0,
12065 is_hidden_text: false,
12066 text_lines: vec![line],
12067 has_start_line: true,
12068 has_end_line: true,
12069 text_alignment: None,
12070 };
12071 let column = TextColumn {
12072 bbox: bbox.clone(),
12073 index: None,
12074 level: None,
12075 font_size: 12.0,
12076 base_line: 702.0,
12077 slant_degree: 0.0,
12078 is_hidden_text: false,
12079 text_blocks: vec![block],
12080 };
12081 ContentElement::Heading(SemanticHeading {
12082 base: SemanticParagraph {
12083 base: SemanticTextNode {
12084 bbox,
12085 index: None,
12086 level: None,
12087 semantic_type: crate::models::enums::SemanticType::Heading,
12088 correct_semantic_score: None,
12089 columns: vec![column],
12090 font_weight: Some(700.0),
12091 font_size: Some(12.0),
12092 text_color: None,
12093 italic_angle: None,
12094 font_name: Some("Lato-Bold".to_string()),
12095 text_format: None,
12096 max_font_size: Some(12.0),
12097 background_color: None,
12098 is_hidden_text: false,
12099 },
12100 enclosed_top: false,
12101 enclosed_bottom: false,
12102 indentation: 0,
12103 },
12104 heading_level: Some(1),
12105 })
12106 }
12107
12108 fn make_heading_at(left: f64, bottom: f64, right: f64, top: f64, text: &str) -> ContentElement {
12109 let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
12110 let chunk = TextChunk {
12111 value: text.to_string(),
12112 bbox: bbox.clone(),
12113 font_name: "Lato-Bold".to_string(),
12114 font_size: top - bottom,
12115 font_weight: 700.0,
12116 italic_angle: 0.0,
12117 font_color: "#000000".to_string(),
12118 contrast_ratio: 21.0,
12119 symbol_ends: vec![],
12120 text_format: TextFormat::Normal,
12121 text_type: TextType::Regular,
12122 pdf_layer: PdfLayer::Main,
12123 ocg_visible: true,
12124 index: None,
12125 page_number: Some(1),
12126 level: None,
12127 mcid: None,
12128 };
12129 let line = TextLine {
12130 bbox: bbox.clone(),
12131 index: None,
12132 level: None,
12133 font_size: top - bottom,
12134 base_line: bottom + 2.0,
12135 slant_degree: 0.0,
12136 is_hidden_text: false,
12137 text_chunks: vec![chunk],
12138 is_line_start: true,
12139 is_line_end: true,
12140 is_list_line: false,
12141 connected_line_art_label: None,
12142 };
12143 let block = TextBlock {
12144 bbox: bbox.clone(),
12145 index: None,
12146 level: None,
12147 font_size: top - bottom,
12148 base_line: bottom + 2.0,
12149 slant_degree: 0.0,
12150 is_hidden_text: false,
12151 text_lines: vec![line],
12152 has_start_line: true,
12153 has_end_line: true,
12154 text_alignment: None,
12155 };
12156 let column = TextColumn {
12157 bbox: bbox.clone(),
12158 index: None,
12159 level: None,
12160 font_size: top - bottom,
12161 base_line: bottom + 2.0,
12162 slant_degree: 0.0,
12163 is_hidden_text: false,
12164 text_blocks: vec![block],
12165 };
12166 ContentElement::Heading(SemanticHeading {
12167 base: SemanticParagraph {
12168 base: SemanticTextNode {
12169 bbox,
12170 index: None,
12171 level: None,
12172 semantic_type: crate::models::enums::SemanticType::Heading,
12173 correct_semantic_score: None,
12174 columns: vec![column],
12175 font_weight: Some(700.0),
12176 font_size: Some(top - bottom),
12177 text_color: None,
12178 italic_angle: None,
12179 font_name: Some("Lato-Bold".to_string()),
12180 text_format: None,
12181 max_font_size: Some(top - bottom),
12182 background_color: None,
12183 is_hidden_text: false,
12184 },
12185 enclosed_top: false,
12186 enclosed_bottom: false,
12187 indentation: 0,
12188 },
12189 heading_level: None,
12190 })
12191 }
12192
12193 fn make_paragraph(text: &str, bottom: f64, top: f64) -> ContentElement {
12194 make_paragraph_at(72.0, bottom, 300.0, top, text)
12195 }
12196
12197 fn make_paragraph_at(
12198 left: f64,
12199 bottom: f64,
12200 right: f64,
12201 top: f64,
12202 text: &str,
12203 ) -> ContentElement {
12204 let bbox = BoundingBox::new(Some(1), left, bottom, right, top);
12205 let chunk = TextChunk {
12206 value: text.to_string(),
12207 bbox: bbox.clone(),
12208 font_name: "Lato-Regular".to_string(),
12209 font_size: (top - bottom).max(1.0),
12210 font_weight: 400.0,
12211 italic_angle: 0.0,
12212 font_color: "#000000".to_string(),
12213 contrast_ratio: 21.0,
12214 symbol_ends: vec![],
12215 text_format: TextFormat::Normal,
12216 text_type: TextType::Regular,
12217 pdf_layer: PdfLayer::Main,
12218 ocg_visible: true,
12219 index: None,
12220 page_number: Some(1),
12221 level: None,
12222 mcid: None,
12223 };
12224 let line = TextLine {
12225 bbox: bbox.clone(),
12226 index: None,
12227 level: None,
12228 font_size: chunk.font_size,
12229 base_line: bottom + 2.0,
12230 slant_degree: 0.0,
12231 is_hidden_text: false,
12232 text_chunks: vec![chunk],
12233 is_line_start: true,
12234 is_line_end: true,
12235 is_list_line: false,
12236 connected_line_art_label: None,
12237 };
12238 let block = TextBlock {
12239 bbox: bbox.clone(),
12240 index: None,
12241 level: None,
12242 font_size: line.font_size,
12243 base_line: line.base_line,
12244 slant_degree: 0.0,
12245 is_hidden_text: false,
12246 text_lines: vec![line],
12247 has_start_line: true,
12248 has_end_line: true,
12249 text_alignment: None,
12250 };
12251 let column = TextColumn {
12252 bbox: bbox.clone(),
12253 index: None,
12254 level: None,
12255 font_size: block.font_size,
12256 base_line: block.base_line,
12257 slant_degree: 0.0,
12258 is_hidden_text: false,
12259 text_blocks: vec![block],
12260 };
12261 ContentElement::Paragraph(SemanticParagraph {
12262 base: SemanticTextNode {
12263 bbox,
12264 index: None,
12265 level: None,
12266 semantic_type: crate::models::enums::SemanticType::Paragraph,
12267 correct_semantic_score: None,
12268 columns: vec![column],
12269 font_weight: Some(400.0),
12270 font_size: Some(top - bottom),
12271 text_color: None,
12272 italic_angle: None,
12273 font_name: Some("Lato-Regular".to_string()),
12274 text_format: None,
12275 max_font_size: Some(top - bottom),
12276 background_color: None,
12277 is_hidden_text: false,
12278 },
12279 enclosed_top: false,
12280 enclosed_bottom: false,
12281 indentation: 0,
12282 })
12283 }
12284
12285 fn make_fallback_list(items: &[&str]) -> ContentElement {
12286 let mut list_items = Vec::new();
12287 for (idx, text) in items.iter().enumerate() {
12288 let top = 700.0 - idx as f64 * 18.0;
12289 let bottom = top - 12.0;
12290 let bbox = BoundingBox::new(Some(1), 72.0, bottom, 320.0, top);
12291 list_items.push(ListItem {
12292 bbox: bbox.clone(),
12293 index: None,
12294 level: None,
12295 label: ListLabel {
12296 bbox: bbox.clone(),
12297 content: vec![],
12298 semantic_type: None,
12299 },
12300 body: ListBody {
12301 bbox: bbox.clone(),
12302 content: vec![],
12303 semantic_type: None,
12304 },
12305 label_length: 0,
12306 contents: vec![make_paragraph_at(72.0, bottom, 320.0, top, text)],
12307 semantic_type: None,
12308 });
12309 }
12310
12311 ContentElement::List(PDFList {
12312 bbox: BoundingBox::new(
12313 Some(1),
12314 72.0,
12315 700.0 - items.len() as f64 * 18.0,
12316 320.0,
12317 700.0,
12318 ),
12319 index: None,
12320 level: None,
12321 list_items,
12322 numbering_style: Some("bullets".to_string()),
12323 common_prefix: None,
12324 previous_list_id: None,
12325 next_list_id: None,
12326 })
12327 }
12328
12329 fn make_toc_table(rows: &[(&str, &str)]) -> ContentElement {
12330 let mut table_rows = Vec::new();
12331 for (ri, (title, page)) in rows.iter().enumerate() {
12332 let top = 680.0 - ri as f64 * 18.0;
12333 let bottom = top - 12.0;
12334 let left_bbox = BoundingBox::new(Some(1), 72.0, bottom, 280.0, top);
12335 let right_bbox = BoundingBox::new(Some(1), 320.0, bottom, 360.0, top);
12336 table_rows.push(TableBorderRow {
12337 bbox: BoundingBox::new(Some(1), 72.0, bottom, 360.0, top),
12338 index: None,
12339 level: None,
12340 row_number: ri,
12341 cells: vec![
12342 TableBorderCell {
12343 bbox: left_bbox.clone(),
12344 index: None,
12345 level: None,
12346 row_number: ri,
12347 col_number: 0,
12348 row_span: 1,
12349 col_span: 1,
12350 content: vec![TableToken {
12351 base: TextChunk {
12352 value: (*title).to_string(),
12353 bbox: left_bbox,
12354 font_name: "Lato-Regular".to_string(),
12355 font_size: 10.0,
12356 font_weight: 400.0,
12357 italic_angle: 0.0,
12358 font_color: "#000000".to_string(),
12359 contrast_ratio: 21.0,
12360 symbol_ends: vec![],
12361 text_format: TextFormat::Normal,
12362 text_type: TextType::Regular,
12363 pdf_layer: PdfLayer::Main,
12364 ocg_visible: true,
12365 index: None,
12366 page_number: Some(1),
12367 level: None,
12368 mcid: None,
12369 },
12370 token_type: TableTokenType::Text,
12371 }],
12372 contents: vec![],
12373 semantic_type: None,
12374 },
12375 TableBorderCell {
12376 bbox: right_bbox.clone(),
12377 index: None,
12378 level: None,
12379 row_number: ri,
12380 col_number: 1,
12381 row_span: 1,
12382 col_span: 1,
12383 content: vec![TableToken {
12384 base: TextChunk {
12385 value: (*page).to_string(),
12386 bbox: right_bbox,
12387 font_name: "Lato-Regular".to_string(),
12388 font_size: 10.0,
12389 font_weight: 400.0,
12390 italic_angle: 0.0,
12391 font_color: "#000000".to_string(),
12392 contrast_ratio: 21.0,
12393 symbol_ends: vec![],
12394 text_format: TextFormat::Normal,
12395 text_type: TextType::Regular,
12396 pdf_layer: PdfLayer::Main,
12397 ocg_visible: true,
12398 index: None,
12399 page_number: Some(1),
12400 level: None,
12401 mcid: None,
12402 },
12403 token_type: TableTokenType::Text,
12404 }],
12405 contents: vec![],
12406 semantic_type: None,
12407 },
12408 ],
12409 semantic_type: None,
12410 });
12411 }
12412
12413 ContentElement::TableBorder(TableBorder {
12414 bbox: BoundingBox::new(Some(1), 72.0, 620.0, 360.0, 680.0),
12415 index: None,
12416 level: Some("1".to_string()),
12417 x_coordinates: vec![72.0, 320.0, 360.0],
12418 x_widths: vec![0.0, 0.0, 0.0],
12419 y_coordinates: vec![680.0, 662.0, 644.0, 626.0],
12420 y_widths: vec![0.0, 0.0, 0.0, 0.0],
12421 rows: table_rows,
12422 num_rows: rows.len(),
12423 num_columns: 2,
12424 is_bad_table: false,
12425 is_table_transformer: false,
12426 previous_table: None,
12427 next_table: None,
12428 })
12429 }
12430
/// A "CONTENTS" heading followed by a title/page table must render as an H1 plus one
/// bullet per row, with the page number appended after the title.
#[test]
fn test_contents_document_renders_toc_table_rows() {
    let toc_entries = [
        ("Experiment #1: Hydrostatic Pressure", "3"),
        ("Experiment #2: Bernoulli's Theorem Demonstration", "13"),
        ("Experiment #3: Energy Loss in Pipe Fittings", "24"),
        ("Experiment #4: Energy Loss in Pipes", "33"),
        ("Experiment #5: Impact of a Jet", "43"),
        ("Experiment #6: Orifice and Free Jet Flow", "50"),
        ("Experiment #7: Osborne Reynolds' Demonstration", "59"),
        ("References", "101"),
    ];

    let mut document = PdfDocument::new("contents.pdf".to_string());
    document.kids.push(make_heading("CONTENTS"));
    document.kids.push(make_toc_table(&toc_entries));

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.starts_with("# CONTENTS\n\n"));
    assert!(rendered.contains("- Experiment #1: Hydrostatic Pressure 3\n"));
    assert!(rendered.contains("- Experiment #2: Bernoulli's Theorem Demonstration 13\n"));
    assert!(rendered.contains("- Experiment #7: Osborne Reynolds' Demonstration 59\n"));
    assert!(rendered.contains("- References 101\n"));
}
12453
/// Two consecutive paragraphs tagged `SemanticType::TableOfContent` must be emitted on
/// adjacent lines, separated by a single newline.
#[test]
fn test_toc_semantic_paragraphs_render_without_blank_lines() {
    let toc_lines = [
        (
            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
            700.0,
            712.0,
        ),
        ("Section 5.1: The Linear Model 35", 684.0, 696.0),
    ];

    let mut document = PdfDocument::new("toc-semantic.pdf".to_string());
    for (text, bottom, top) in toc_lines {
        let mut entry = make_paragraph(text, bottom, top);
        // Re-tag the plain paragraph fixture as a table-of-contents entry.
        if let ContentElement::Paragraph(paragraph) = &mut entry {
            paragraph.base.semantic_type = SemanticType::TableOfContent;
        }
        document.kids.push(entry);
    }

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains(
        "Part V. Chapter Five - Comparing Associations Between Multiple Variables\nSection 5.1: The Linear Model 35\n"
    ));
}
12477
/// Renders a tightly stacked run of "Part …" / "Section …" paragraphs.
///
/// Note that, despite the test name, the assertions expect "Part" lines as `#` headings and
/// "Section" lines as `##` headings separated by blank lines, with "References 101" kept
/// as plain text.
#[test]
fn test_compact_toc_document_renders_without_blank_lines() {
    let stacked_lines = [
        (
            "Part V. Chapter Five - Comparing Associations Between Multiple Variables",
            700.0,
            712.0,
        ),
        ("Section 5.1: The Linear Model 35", 684.0, 696.0),
        (
            "Part VI. Chapter Six - Comparing Three or More Group Means",
            668.0,
            680.0,
        ),
        (
            "Section 6.1: Between Versus Within Group Analyses 49",
            652.0,
            664.0,
        ),
        (
            "Part VII. Chapter Seven - Moderation and Mediation Analyses",
            636.0,
            648.0,
        ),
        (
            "Section 7.1: Mediation and Moderation Models 64",
            620.0,
            632.0,
        ),
        ("References 101", 604.0, 616.0),
        (
            "Section 8.1: Factor Analysis Definitions 75",
            588.0,
            600.0,
        ),
    ];

    let mut document = PdfDocument::new("compact-toc.pdf".to_string());
    for (text, bottom, top) in stacked_lines {
        document.kids.push(make_paragraph(text, bottom, top));
    }

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains(
        "# Part V. Chapter Five - Comparing Associations Between Multiple Variables\n\n## Section 5.1: The Linear Model"
    ));
    assert!(rendered.contains(
        "# Part VI. Chapter Six - Comparing Three or More Group Means\n\n## Section 6.1: Between Versus Within Group Analyses"
    ));
    assert!(rendered.contains("References 101\n\n## Section 8.1: Factor Analysis Definitions"));
}
12528
/// A single paragraph that fuses a figure caption with the following body sentence must
/// be split with a blank line after the caption's closing credit parenthesis.
#[test]
fn test_merged_caption_and_body_paragraph_renders_as_two_paragraphs() {
    let fused_text = "Figure 1. This image shows the Western hemisphere as viewed from space 35,400 kilometers above Earth. (credit: modification of work by R. Stockli, NASA/ GSFC/ NOAA/ USGS) Our nearest astronomical neighbor is Earth's satellite, commonly called the Moon.";

    let mut document = PdfDocument::new("caption-body.pdf".to_string());
    document.kids.push(make_paragraph(fused_text, 500.0, 540.0));

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains("USGS)\n\nOur nearest astronomical neighbor"));
}
12541
/// A short "Diagram N" label must join the caption tail on the next line, and the body
/// sentence fused after the tail must be split off into its own paragraph.
#[test]
fn test_short_caption_label_merges_with_following_tail_and_body() {
    let label = make_paragraph("Diagram 5", 540.0, 552.0);
    let tail_and_body = make_paragraph(
        "Distribution of Komnas HAM's YouTube Content (2019- 2020) As of 1 December 2021, the channel has 2,290 subscribers and 185,676 total views.",
        520.0,
        532.0,
    );

    let mut document = PdfDocument::new("diagram-caption.pdf".to_string());
    document.kids.push(label);
    document.kids.push(tail_and_body);

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains(
        "Diagram 5\nDistribution of Komnas HAM's YouTube Content (2019- 2020)\n\nAs of 1 December 2021, the channel has 2,290 subscribers"
    ));
}
12557
/// A "Figure N" label, its caption tail and a trailing year-only paragraph must be emitted
/// on three consecutive lines, with no blank line before the year.
#[test]
fn test_short_caption_label_merges_with_tail_and_year() {
    let caption_parts = [
        ("Figure 4", 540.0, 552.0),
        (
            "Komnas HAM's YouTube channel as of 1 December",
            520.0,
            532.0,
        ),
        ("2021", 500.0, 512.0),
    ];

    let mut document = PdfDocument::new("figure-caption.pdf".to_string());
    for (text, bottom, top) in caption_parts {
        document.kids.push(make_paragraph(text, bottom, top));
    }

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains("Figure 4\nKomnas HAM's YouTube channel as of 1 December\n2021"));
    assert!(!rendered.contains("\n\n2021"));
}
12573
/// A numeric-only paragraph in the middle of the page ("100") must survive rendering,
/// while the numeric-only paragraph at the bottom of the page ("36") must not appear as
/// its own line.
#[test]
fn test_mid_page_numeric_labels_are_not_dropped_as_page_numbers() {
    let page_items = [
        ("Figure 1", 760.0, 772.0),
        ("100", 520.0, 528.0),
        ("Body text continues here.", 400.0, 412.0),
        ("36", 20.0, 28.0),
    ];

    let mut document = PdfDocument::new("chart.pdf".to_string());
    for (text, bottom, top) in page_items {
        document.kids.push(make_paragraph(text, bottom, top));
    }

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains("100"));
    assert!(!rendered.lines().any(|line| line.trim() == "36"));
}
12587
/// Two complete-sentence paragraphs must stay separated by a blank line in the output.
#[test]
fn test_semantic_paragraphs_are_not_remerged_in_markdown() {
    let sentences = [
        ("First semantic paragraph ends here.", 520.0, 532.0),
        ("Second semantic paragraph starts here.", 500.0, 512.0),
    ];

    let mut document = PdfDocument::new("paragraphs.pdf".to_string());
    for (text, bottom, top) in sentences {
        document.kids.push(make_paragraph(text, bottom, top));
    }

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains(
        "First semantic paragraph ends here.\n\nSecond semantic paragraph starts here."
    ));
}
12607
/// A paragraph without terminal punctuation followed by a lowercase-initial paragraph
/// must be joined into one sentence with a single space.
#[test]
fn test_lowercase_semantic_paragraph_continuation_is_merged() {
    let opening = make_paragraph(
        "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference",
        520.0,
        532.0,
    );
    let continuation = make_paragraph("of interest.", 500.0, 512.0);

    let mut document = PdfDocument::new("continuation.pdf".to_string());
    document.kids.push(opening);
    document.kids.push(continuation);

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains(
        "You can then compare the difference you actually obtained against this null distribution to generate a p value for your difference of interest."
    ));
}
12623
/// Paragraphs that begin with roman-numeral enumerators ("iii.", "iv.") must remain
/// separate paragraphs divided by a blank line.
#[test]
fn test_semantic_enumerated_paragraphs_are_not_merged() {
    let enumerated = [
        (
            "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
            520.0,
            532.0,
        ),
        (
            "iv. This business model will be operating cost-oriented not capital cost-oriented.",
            500.0,
            512.0,
        ),
    ];

    let mut document = PdfDocument::new("enumerated-paragraphs.pdf".to_string());
    for (text, bottom, top) in enumerated {
        document.kids.push(make_paragraph(text, bottom, top));
    }

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains(
        "iii. Looking at cost items, the cost of raw woods procurement will be highest share.\n\niv. This business model will be operating cost-oriented not capital cost-oriented."
    ));
}
12643
/// On a one-page document, a figure caption that precedes the first numbered heading
/// must be dropped so that the output starts with the heading.
#[test]
fn test_leading_figure_carryover_is_skipped_before_first_numbered_heading() {
    let carried_caption = make_paragraph_at(
        72.0,
        742.0,
        540.0,
        756.0,
        "Figure 6. Mytella strigata biofouling green mussel farms in Bacoor City, Cavite, Manila Bay",
    );
    let section_heading = make_heading_at(72.0, 680.0, 260.0, 696.0, "5. Natural dispersal");
    let section_body = make_paragraph_at(
        72.0,
        640.0,
        540.0,
        654.0,
        "Dispersal by purely natural means is not included as a pathway of biological invasions.",
    );

    let mut document = PdfDocument::new("leading-figure-carryover.pdf".to_string());
    document.number_of_pages = 1;
    for element in [carried_caption, section_heading, section_body] {
        document.kids.push(element);
    }

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.starts_with("# 5. Natural dispersal"));
    assert!(!rendered.contains("Figure 6. Mytella strigata"));
}
12674
/// List items whose text already starts with a bullet glyph must not get a doubled
/// marker, and bullet-only or number-only items must not be emitted as list entries.
#[test]
fn test_list_renderer_strips_duplicate_bullets_and_skips_bullet_only_items() {
    let raw_items = ["• First item", "•", "• Second item", "133"];

    let mut document = PdfDocument::new("bullets.pdf".to_string());
    document.kids.push(make_fallback_list(&raw_items));

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains("- First item"));
    assert!(rendered.contains("- Second item"));
    assert!(!rendered.contains("- • First item"));
    assert!(!rendered.contains("\n- •\n"));
    assert!(!rendered.contains("\n- 133\n"));
}
12692
/// A list item that is a lowercase continuation of the previous item must be folded into
/// that item instead of being rendered as its own bullet.
#[test]
fn test_list_renderer_merges_wrapped_continuation_items() {
    let raw_items = [
        "Use a micropipette to add 2 μL of loading dye",
        "and down a couple of times to mix the loading dye with the digested DNA.",
        "Use a fresh pipet tip for each reaction tube.",
    ];

    let mut document = PdfDocument::new("wrapped-list.pdf".to_string());
    document.kids.push(make_fallback_list(&raw_items));

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains(
        "- Use a micropipette to add 2 μL of loading dye and down a couple of times to mix the loading dye with the digested DNA."
    ));
    assert!(rendered.contains("- Use a fresh pipet tip for each reaction tube."));
    assert!(!rendered.contains("\n- and down"));
}
12709
/// List items that carry their own roman-numeral enumerators must be rendered one per
/// line without an added "- " bullet marker.
#[test]
fn test_list_renderer_keeps_enumerated_items_separate() {
    let raw_items = [
        "iii. Looking at cost items, the cost of raw woods procurement will be highest share.",
        "iv. This business model will be operating cost-oriented not capital cost-oriented.",
        "v. Assumed selling price of wood pellet is $100 per tonne and appropriate.",
    ];

    let mut document = PdfDocument::new("enumerated-list.pdf".to_string());
    document.kids.push(make_fallback_list(&raw_items));

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains("iii. Looking at cost items, the cost of raw woods procurement will be highest share.\niv. This business model will be operating cost-oriented not capital cost-oriented.\nv. Assumed selling price of wood pellet is $100 per tonne and appropriate."));
    assert!(!rendered.contains("- iii."));
}
12723
/// `drop_isolated_noise_lines` must remove stand-alone single-character lines ("1", "o")
/// while keeping the surrounding real text.
#[test]
fn test_postprocess_drops_isolated_single_char_noise_lines() {
    let noisy = "# The Data Journey\n\n1\n\nTo get started.\n\no\n\nNOTE: Keep going.\n";

    let filtered = drop_isolated_noise_lines(noisy);

    assert!(!filtered.contains("\n1\n"));
    assert!(!filtered.contains("\no\n"));
    assert!(filtered.contains("To get started."));
    assert!(filtered.contains("NOTE: Keep going."));
}
12733
12734 fn make_two_column_table(rows: &[(&str, &str)]) -> ContentElement {
12735 let mut table_rows = Vec::new();
12736 for (row_number, (left, right)) in rows.iter().enumerate() {
12737 let top = 656.0 - row_number as f64 * 18.0;
12738 let bottom = top - 16.0;
12739 let mut cells = Vec::new();
12740 for (col_number, (text, left_x, right_x)) in
12741 [(*left, 72.0, 220.0), (*right, 220.0, 420.0)]
12742 .into_iter()
12743 .enumerate()
12744 {
12745 let content = if text.is_empty() {
12746 Vec::new()
12747 } else {
12748 vec![TableToken {
12749 base: TextChunk {
12750 value: text.to_string(),
12751 bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
12752 font_name: "Test".to_string(),
12753 font_size: 11.0,
12754 font_weight: 400.0,
12755 italic_angle: 0.0,
12756 font_color: "[0.0]".to_string(),
12757 contrast_ratio: 21.0,
12758 symbol_ends: Vec::new(),
12759 text_format: TextFormat::Normal,
12760 text_type: TextType::Regular,
12761 pdf_layer: PdfLayer::Main,
12762 ocg_visible: true,
12763 index: None,
12764 page_number: Some(1),
12765 level: None,
12766 mcid: None,
12767 },
12768 token_type: TableTokenType::Text,
12769 }]
12770 };
12771 cells.push(TableBorderCell {
12772 bbox: BoundingBox::new(Some(1), left_x, bottom, right_x, top),
12773 index: None,
12774 level: None,
12775 row_number,
12776 col_number,
12777 row_span: 1,
12778 col_span: 1,
12779 content,
12780 contents: vec![],
12781 semantic_type: None,
12782 });
12783 }
12784
12785 table_rows.push(TableBorderRow {
12786 bbox: BoundingBox::new(Some(1), 72.0, bottom, 420.0, top),
12787 index: None,
12788 level: None,
12789 row_number,
12790 cells,
12791 semantic_type: None,
12792 });
12793 }
12794
12795 ContentElement::TableBorder(TableBorder {
12796 bbox: BoundingBox::new(
12797 Some(1),
12798 72.0,
12799 656.0 - rows.len() as f64 * 18.0 - 16.0,
12800 420.0,
12801 656.0,
12802 ),
12803 index: None,
12804 level: Some("1".to_string()),
12805 x_coordinates: vec![72.0, 220.0, 420.0],
12806 x_widths: vec![0.0; 3],
12807 y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
12808 y_widths: vec![0.0; rows.len() + 1],
12809 rows: table_rows,
12810 num_rows: rows.len(),
12811 num_columns: 2,
12812 is_bad_table: false,
12813 is_table_transformer: false,
12814 previous_table: None,
12815 next_table: None,
12816 })
12817 }
12818
12819 fn make_chunked_paragraph_line(
12820 segments: &[(&str, f64, f64)],
12821 bottom: f64,
12822 top: f64,
12823 ) -> ContentElement {
12824 let bbox = BoundingBox::new(
12825 Some(1),
12826 segments.first().map(|(_, left, _)| *left).unwrap_or(72.0),
12827 bottom,
12828 segments.last().map(|(_, _, right)| *right).unwrap_or(320.0),
12829 top,
12830 );
12831
12832 let chunks = segments
12833 .iter()
12834 .map(|(text, left, right)| TextChunk {
12835 value: (*text).to_string(),
12836 bbox: BoundingBox::new(Some(1), *left, bottom, *right, top),
12837 font_name: "Lato-Regular".to_string(),
12838 font_size: top - bottom,
12839 font_weight: 400.0,
12840 italic_angle: 0.0,
12841 font_color: "#000000".to_string(),
12842 contrast_ratio: 21.0,
12843 symbol_ends: vec![],
12844 text_format: TextFormat::Normal,
12845 text_type: TextType::Regular,
12846 pdf_layer: PdfLayer::Main,
12847 ocg_visible: true,
12848 index: None,
12849 page_number: Some(1),
12850 level: None,
12851 mcid: None,
12852 })
12853 .collect::<Vec<_>>();
12854
12855 let line = TextLine {
12856 bbox: bbox.clone(),
12857 index: None,
12858 level: None,
12859 font_size: top - bottom,
12860 base_line: bottom + 2.0,
12861 slant_degree: 0.0,
12862 is_hidden_text: false,
12863 text_chunks: chunks,
12864 is_line_start: true,
12865 is_line_end: true,
12866 is_list_line: false,
12867 connected_line_art_label: None,
12868 };
12869 let block = TextBlock {
12870 bbox: bbox.clone(),
12871 index: None,
12872 level: None,
12873 font_size: line.font_size,
12874 base_line: line.base_line,
12875 slant_degree: 0.0,
12876 is_hidden_text: false,
12877 text_lines: vec![line],
12878 has_start_line: true,
12879 has_end_line: true,
12880 text_alignment: None,
12881 };
12882 let column = TextColumn {
12883 bbox: bbox.clone(),
12884 index: None,
12885 level: None,
12886 font_size: block.font_size,
12887 base_line: block.base_line,
12888 slant_degree: 0.0,
12889 is_hidden_text: false,
12890 text_blocks: vec![block],
12891 };
12892
12893 ContentElement::Paragraph(SemanticParagraph {
12894 base: SemanticTextNode {
12895 bbox,
12896 index: None,
12897 level: None,
12898 semantic_type: SemanticType::Paragraph,
12899 correct_semantic_score: None,
12900 columns: vec![column],
12901 font_weight: Some(400.0),
12902 font_size: Some(top - bottom),
12903 text_color: None,
12904 italic_angle: None,
12905 font_name: Some("Lato-Regular".to_string()),
12906 text_format: None,
12907 max_font_size: Some(top - bottom),
12908 background_color: None,
12909 is_hidden_text: false,
12910 },
12911 enclosed_top: false,
12912 enclosed_bottom: false,
12913 indentation: 0,
12914 })
12915 }
12916
12917 fn make_n_column_table(rows: &[Vec<&str>], column_bounds: &[(f64, f64)]) -> ContentElement {
12918 let mut table_rows = Vec::new();
12919 for (row_number, row_values) in rows.iter().enumerate() {
12920 let top = 656.0 - row_number as f64 * 18.0;
12921 let bottom = top - 16.0;
12922 let mut cells = Vec::new();
12923 for (col_number, (left_x, right_x)) in column_bounds.iter().enumerate() {
12924 let text = row_values.get(col_number).copied().unwrap_or("");
12925 let content = if text.is_empty() {
12926 Vec::new()
12927 } else {
12928 vec![TableToken {
12929 base: TextChunk {
12930 value: text.to_string(),
12931 bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
12932 font_name: "Test".to_string(),
12933 font_size: 11.0,
12934 font_weight: 400.0,
12935 italic_angle: 0.0,
12936 font_color: "[0.0]".to_string(),
12937 contrast_ratio: 21.0,
12938 symbol_ends: Vec::new(),
12939 text_format: TextFormat::Normal,
12940 text_type: TextType::Regular,
12941 pdf_layer: PdfLayer::Main,
12942 ocg_visible: true,
12943 index: None,
12944 page_number: Some(1),
12945 level: None,
12946 mcid: None,
12947 },
12948 token_type: TableTokenType::Text,
12949 }]
12950 };
12951 cells.push(TableBorderCell {
12952 bbox: BoundingBox::new(Some(1), *left_x, bottom, *right_x, top),
12953 index: None,
12954 level: None,
12955 row_number,
12956 col_number,
12957 row_span: 1,
12958 col_span: 1,
12959 content,
12960 contents: vec![],
12961 semantic_type: None,
12962 });
12963 }
12964
12965 table_rows.push(TableBorderRow {
12966 bbox: BoundingBox::new(
12967 Some(1),
12968 column_bounds.first().map(|(left, _)| *left).unwrap_or(72.0),
12969 bottom,
12970 column_bounds
12971 .last()
12972 .map(|(_, right)| *right)
12973 .unwrap_or(420.0),
12974 top,
12975 ),
12976 index: None,
12977 level: None,
12978 row_number,
12979 cells,
12980 semantic_type: None,
12981 });
12982 }
12983
12984 let left = column_bounds
12985 .first()
12986 .map(|(value, _)| *value)
12987 .unwrap_or(72.0);
12988 let right = column_bounds
12989 .last()
12990 .map(|(_, value)| *value)
12991 .unwrap_or(420.0);
12992 let x_coordinates = std::iter::once(left)
12993 .chain(column_bounds.iter().map(|(_, right)| *right))
12994 .collect::<Vec<_>>();
12995
12996 ContentElement::TableBorder(TableBorder {
12997 bbox: BoundingBox::new(
12998 Some(1),
12999 left,
13000 656.0 - rows.len() as f64 * 18.0 - 16.0,
13001 right,
13002 656.0,
13003 ),
13004 index: None,
13005 level: Some("1".to_string()),
13006 x_coordinates,
13007 x_widths: vec![0.0; column_bounds.len() + 1],
13008 y_coordinates: (0..=rows.len()).map(|i| 656.0 - i as f64 * 18.0).collect(),
13009 y_widths: vec![0.0; rows.len() + 1],
13010 rows: table_rows,
13011 num_rows: rows.len(),
13012 num_columns: column_bounds.len(),
13013 is_bad_table: false,
13014 is_table_transformer: false,
13015 previous_table: None,
13016 next_table: None,
13017 })
13018 }
13019
/// A two-column data table whose right column holds short numbers must still be rendered
/// as a Markdown pipe table.
#[test]
fn test_numeric_two_column_table_is_not_misrendered_as_toc() {
    let table_rows = [
        ("Mineral or colloid type", "CEC of pure colloid"),
        ("", "cmolc/kg"),
        ("kaolinite", "10"),
        ("illite", "30"),
    ];

    let mut document = PdfDocument::new("cec-table.pdf".to_string());
    document.number_of_pages = 1;
    document.kids.push(make_two_column_table(&table_rows));

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains("| --- | --- |"));
    assert!(rendered.contains("| kaolinite | 10 |"));
}
13035
/// A two-column table whose right column is empty below the header must still be rendered
/// as a Markdown pipe table with blank right-hand cells.
#[test]
fn test_blank_right_column_table_is_not_misrendered_as_toc() {
    let table_rows = [
        (
            "Added cation",
            "Relative Size & Settling Rates of Floccules",
        ),
        ("K+", ""),
        ("Na+", ""),
        ("Ca2+", ""),
    ];

    let mut document = PdfDocument::new("flocculation-table.pdf".to_string());
    document.number_of_pages = 1;
    document.kids.push(make_two_column_table(&table_rows));

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains("| Added cation | Relative Size & Settling Rates of Floccules |"));
    assert!(rendered.contains("| K+ | |"));
}
13054
/// A two-row table with a lone "1" in the left column and prose in the right column must
/// be rendered as a single numbered item, not as a pipe table.
#[test]
fn test_infographic_card_table_renders_as_numbered_item() {
    let card_rows = [
        (
            "1",
            "We're all both consumers and creators of creative work.",
        ),
        (
            "",
            "As consumers, we watch movies, listen to music, read books, and more.",
        ),
    ];

    let mut document = PdfDocument::new("infographic-card.pdf".to_string());
    document.number_of_pages = 1;
    document.kids.push(make_two_column_table(&card_rows));

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains(
        "1. We're all both consumers and creators of creative work. As consumers, we watch movies, listen to music, read books, and more."
    ));
    assert!(!rendered.contains("| 1 |"));
}
13076
/// A table with a grouped header row above a sub-header row must keep both rows: group
/// labels are repeated across the columns they span, and they are not concatenated with
/// the sub-header labels.
#[test]
fn test_grouped_header_rows_are_preserved_without_flattening() {
    let table_rows = [
        vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
        vec![
            "",
            "Alpaca-GPT4",
            "OpenOrca",
            "Synth. Math-Instruct",
            "Orca DPO Pairs",
            "Ultrafeedback Cleaned",
            "Synth. Math-Alignment",
        ],
        vec![
            "Total # Samples",
            "52K",
            "2.91M",
            "126K",
            "12.9K",
            "60.8K",
            "126K",
        ],
    ];
    let column_bounds = [
        (72.0, 120.0),
        (120.0, 170.0),
        (170.0, 220.0),
        (220.0, 280.0),
        (280.0, 340.0),
        (340.0, 410.0),
        (410.0, 470.0),
    ];

    let mut document = PdfDocument::new("grouped-header.pdf".to_string());
    document.number_of_pages = 1;
    document
        .kids
        .push(make_n_column_table(&table_rows, &column_bounds));

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains(
        "| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |"
    ));
    assert!(rendered.contains(
        "| | Alpaca-GPT4 | OpenOrca | Synth. Math-Instruct | Orca DPO Pairs | Ultrafeedback Cleaned | Synth. Math-Alignment |"
    ));
    assert!(!rendered.contains("Instruction OpenOrca"));
    assert!(!rendered.contains("Alignment Ultrafeedback"));
}
13124
/// A page that opens with a table "plate" (title lines, table, caption lines) followed by
/// article body text must render the table and its caption but omit the body paragraph.
#[test]
fn test_top_table_plate_renderer_stops_before_article_body() {
    let table_rows = [
        vec!["Properties", "", "Instruction", "", "", "Alignment", ""],
        vec![
            "",
            "Alpaca-GPT4",
            "OpenOrca",
            "Synth. Math-Instruct",
            "Orca DPO Pairs",
            "Ultrafeedback Cleaned",
            "Synth. Math-Alignment",
        ],
        vec![
            "Total # Samples",
            "52K",
            "2.91M",
            "126K",
            "12.9K",
            "60.8K",
            "126K",
        ],
        vec![
            "Maximum # Samples Used",
            "52K",
            "100K",
            "52K",
            "12.9K",
            "60.8K",
            "20.1K",
        ],
        vec!["Open Source", "O", "O", "✗", "O", "O", "✗"],
    ];
    let column_bounds = [
        (78.0, 125.0),
        (125.0, 175.0),
        (175.0, 225.0),
        (225.0, 285.0),
        (285.0, 345.0),
        (345.0, 415.0),
        (415.0, 490.0),
    ];
    // Paragraphs as (left, bottom, right, top, text), in document order around the table.
    let title_lines = [
        (72.0, 724.0, 200.0, 736.0, "SOLAR 10.7B"),
        (72.0, 704.0, 220.0, 716.0, "Training datasets"),
    ];
    let trailing_lines = [
        (
            72.0,
            500.0,
            310.0,
            514.0,
            "Table 1: Training datasets used for the instruction and alignment tuning stages, respectively.",
        ),
        (
            286.0,
            484.0,
            526.0,
            498.0,
            "Open source indicates whether the dataset is open-sourced.",
        ),
        (
            72.0,
            360.0,
            290.0,
            388.0,
            "Comparison to other up-scaling methods. Unlike Komatsuzaki et al. (2022)...",
        ),
    ];

    let mut document = PdfDocument::new("table-plate.pdf".to_string());
    document.number_of_pages = 1;
    for (left, bottom, right, top, text) in title_lines {
        document
            .kids
            .push(make_paragraph_at(left, bottom, right, top, text));
    }
    document
        .kids
        .push(make_n_column_table(&table_rows, &column_bounds));
    for (left, bottom, right, top, text) in trailing_lines {
        document
            .kids
            .push(make_paragraph_at(left, bottom, right, top, text));
    }

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.contains("Table 1: Training datasets used for the instruction"));
    assert!(rendered.contains("| Properties | Instruction | Instruction | Instruction | Alignment | Alignment | Alignment |"));
    assert!(!rendered.contains("Comparison to other up-scaling methods"));
}
13207
/// When a numbered section heading appears late on a one-page document after a run of
/// equation lead-in sentences, the output must start at that heading, join the three
/// body fragments below it into one sentence, and omit both the lead-in sentences and the
/// running footer.
#[test]
fn test_late_section_boundary_renderer_drops_equation_carryover() {
    // Paragraphs as (left, bottom, right, top, text), in document order.
    let before_heading = [
        (
            72.0,
            700.0,
            540.0,
            714.0,
            "The horizontal distance traveled by the jet is equal to:",
        ),
        (
            72.0,
            640.0,
            540.0,
            654.0,
            "The vertical position of the jet may be calculated as:",
        ),
        (72.0, 580.0, 260.0, 594.0, "Rearranging Equation (8) gives:"),
        (
            72.0,
            520.0,
            420.0,
            534.0,
            "Substitution into Equation 7 results in:",
        ),
        (
            72.0,
            460.0,
            280.0,
            474.0,
            "Equations (10) can be rearranged to find Cv:",
        ),
    ];
    let after_heading = [
        (
            72.0,
            326.0,
            380.0,
            340.0,
            "If C_d is assumed to be constant, then a graph of Q plotted against",
        ),
        (
            400.0,
            326.0,
            540.0,
            340.0,
            "(Equation 6) will be linear, and",
        ),
        (72.0, 310.0, 240.0, 324.0, "the slope of this graph will be:"),
        (
            360.0,
            36.0,
            550.0,
            48.0,
            "EXPERIMENT #6: ORIFICE AND FREE JET FLOW 53",
        ),
    ];

    let mut document = PdfDocument::new("late-section.pdf".to_string());
    document.number_of_pages = 1;
    for (left, bottom, right, top, text) in before_heading {
        document
            .kids
            .push(make_paragraph_at(left, bottom, right, top, text));
    }
    document.kids.push(make_heading_at(
        72.0,
        350.0,
        420.0,
        366.0,
        "7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE",
    ));
    for (left, bottom, right, top, text) in after_heading {
        document
            .kids
            .push(make_paragraph_at(left, bottom, right, top, text));
    }

    let rendered = to_markdown(&document).unwrap();
    assert!(rendered.starts_with("# 7.2. DETERMINATION OF THE COEFFICIENT OF DISCHARGE"));
    assert!(rendered.contains(
        "If C_d is assumed to be constant, then a graph of Q plotted against (Equation 6) will be linear, and the slope of this graph will be:"
    ));
    assert!(!rendered.contains("The horizontal distance traveled by the jet"));
    assert!(!rendered.contains("EXPERIMENT #6"));
}
13291
13292 #[test]
13293 fn test_leading_table_carryover_row_is_trimmed_from_general_renderer() {
13294 let mut doc = PdfDocument::new("carryover-table.pdf".to_string());
13295 doc.number_of_pages = 1;
13296 doc.kids.push(make_n_column_table(
13297 &[
13298 vec![
13299 "Jurisdiction",
13300 "GATS XVII Reservation (1994)",
13301 "Foreign Ownership Permitted",
13302 "Restrictions on Foreign Ownership",
13303 "Foreign Ownership Reporting Requirements",
13304 ],
13305 vec![
13306 "",
13307 "",
13308 "",
13309 "right required to acquire desert lands and continue the prior page",
13310 "",
13311 ],
13312 vec!["Finland", "N", "Y", "Prior approval may be required.", ""],
13313 vec!["France", "N", "Y", "None.", ""],
13314 ],
13315 &[
13316 (72.0, 150.0),
13317 (150.0, 235.0),
13318 (235.0, 330.0),
13319 (330.0, 500.0),
13320 (500.0, 560.0),
13321 ],
13322 ));
13323
13324 let md = to_markdown(&doc).unwrap();
13325 assert!(!md.contains("right required to acquire desert lands"));
13326 assert!(md.contains("| Finland | N | Y | Prior approval may be required. | |"));
13327 }
13328
13329 #[test]
13330 fn test_single_table_report_renderer_promotes_title_and_skips_footer() {
13331 let mut doc = PdfDocument::new("single-table-report.pdf".to_string());
13332 doc.number_of_pages = 1;
13333 doc.kids.push(make_paragraph_at(
13334 140.0,
13335 674.0,
13336 474.0,
13337 688.0,
13338 "Restrictions on Land Ownership by Foreigners in Selected Jurisdictions",
13339 ));
13340 doc.kids.push(make_n_column_table(
13341 &[
13342 vec![
13343 "Jurisdiction",
13344 "GATS XVII Reservation (1994)",
13345 "Foreign Ownership Permitted",
13346 "Restrictions on Foreign Ownership",
13347 "Foreign Ownership Reporting Requirements",
13348 ],
13349 vec![
13350 "",
13351 "",
13352 "",
13353 "right required to acquire desert lands and continue the prior page",
13354 "",
13355 ],
13356 vec![
13357 "Finland",
13358 "N",
13359 "Y",
13360 "Prior approval from the Government of Aland may be required.",
13361 "",
13362 ],
13363 vec!["France", "N", "Y", "None.", ""],
13364 ],
13365 &[
13366 (72.0, 150.0),
13367 (150.0, 235.0),
13368 (235.0, 330.0),
13369 (330.0, 500.0),
13370 (500.0, 560.0),
13371 ],
13372 ));
13373 doc.kids.push(make_paragraph_at(
13374 350.0,
13375 36.0,
13376 548.0,
13377 48.0,
13378 "The Law Library of Congress 7",
13379 ));
13380
13381 let md = to_markdown(&doc).unwrap();
13382 assert!(md.starts_with(
13383 "# Restrictions on Land Ownership by Foreigners in Selected Jurisdictions"
13384 ));
13385 assert!(!md.contains("right required to acquire desert lands"));
13386 assert!(!md.contains("The Law Library of Congress 7"));
13387 assert!(md.contains(
13388 "| Finland | N | Y | Prior approval from the Government of Aland may be required. | |"
13389 ));
13390 }
13391
13392 #[test]
13393 fn test_geometric_panel_headers_are_promoted_into_table() {
13394 let mut doc = PdfDocument::new("ai-pack-panel.pdf".to_string());
13395 doc.kids.push(make_chunked_paragraph_line(
13396 &[("OCR", 220.0, 250.0)],
13397 720.0,
13398 732.0,
13399 ));
13400 doc.kids.push(make_chunked_paragraph_line(
13401 &[("Recommendation", 430.0, 540.0)],
13402 720.0,
13403 732.0,
13404 ));
13405 doc.kids.push(make_chunked_paragraph_line(
13406 &[("Product semantic search", 660.0, 860.0)],
13407 720.0,
13408 732.0,
13409 ));
13410 doc.kids.push(make_chunked_paragraph_line(
13411 &[("Pack", 72.0, 110.0)],
13412 684.0,
13413 696.0,
13414 ));
13415 doc.kids.push(make_chunked_paragraph_line(
13416 &[("A solution that recognizes characters", 140.0, 340.0)],
13417 684.0,
13418 696.0,
13419 ));
13420 doc.kids.push(make_chunked_paragraph_line(
13421 &[("A solution that recommends the best products", 390.0, 620.0)],
13422 684.0,
13423 696.0,
13424 ));
13425 doc.kids.push(make_chunked_paragraph_line(
13426 &[("A solution that enables semantic search", 650.0, 900.0)],
13427 684.0,
13428 696.0,
13429 ));
13430 doc.kids.push(make_n_column_table(
13431 &[
13432 vec![
13433 "Achieved 1st place in the OCR World Competition",
13434 "Team with specialists and technologies",
13435 "Creation of the first natural language evaluation",
13436 ],
13437 vec![
13438 "The team includes specialists who have",
13439 "received Kaggle's Gold Medal recommendation",
13440 "system in Korean (KLUE)",
13441 ],
13442 vec![
13443 "presented 14 papers in renowned AI conferences",
13444 "top-tier recommendation",
13445 "Shopee subject",
13446 ],
13447 ],
13448 &[(120.0, 360.0), (360.0, 630.0), (630.0, 910.0)],
13449 ));
13450 doc.kids.push(make_chunked_paragraph_line(
13451 &[("models", 430.0, 490.0)],
13452 552.0,
13453 564.0,
13454 ));
13455
13456 let md = to_markdown(&doc).unwrap();
13457 assert!(md.contains("| Pack | OCR | Recommendation | Product semantic search |"));
13458 assert!(md.contains("| A solution that recognizes characters | A solution that recommends the best products | A solution that enables semantic search |"));
13459 assert!(md.contains(
13460 "received Kaggle's Gold Medal recommendation top-tier recommendation models"
13461 ));
13462 }
13463
13464 #[test]
13465 fn test_embedded_stub_header_is_promoted_from_first_table_column() {
13466 let mut doc = PdfDocument::new("embedded-stub-header.pdf".to_string());
13467 doc.kids.push(make_chunked_paragraph_line(
13468 &[("OCR", 220.0, 250.0)],
13469 720.0,
13470 732.0,
13471 ));
13472 doc.kids.push(make_chunked_paragraph_line(
13473 &[("Recommendation", 430.0, 540.0)],
13474 720.0,
13475 732.0,
13476 ));
13477 doc.kids.push(make_chunked_paragraph_line(
13478 &[("Product semantic search", 660.0, 860.0)],
13479 720.0,
13480 732.0,
13481 ));
13482 doc.kids.push(make_n_column_table(
13483 &[
13484 vec![
13485 "Pack",
13486 "A solution that recognizes characters in an image and extracts necessary information",
13487 "A solution that recommends the best products and contents",
13488 "A solution that enables semantic search and organizes key information",
13489 ],
13490 vec![
13491 "Application",
13492 "Applicable to all fields that require text extraction",
13493 "Applicable to all fields that use any form of recommendation",
13494 "Applicable to all fields that deal with unstructured data",
13495 ],
13496 vec![
13497 "Highlight",
13498 "Achieved 1st place in the OCR World Competition",
13499 "Received Kaggle's Gold Medal recommendation",
13500 "Creation of the first natural language evaluation system in Korean",
13501 ],
13502 ],
13503 &[
13504 (72.0, 120.0),
13505 (120.0, 360.0),
13506 (360.0, 630.0),
13507 (630.0, 910.0),
13508 ],
13509 ));
13510
13511 let md = to_markdown(&doc).unwrap();
13512 assert!(md.contains("| Pack | OCR | Recommendation | Product semantic search |"));
13513 assert!(
13514 md.contains("| Application | Applicable to all fields that require text extraction |")
13515 );
13516 assert!(md.contains("| Highlight | Achieved 1st place in the OCR World Competition |"));
13517 assert!(!md.contains("OCR\n\nRecommendation\n\nProduct semantic search"));
13518 }
13519
13520 #[test]
13521 fn test_geometric_chunk_alignment_splits_header_line_into_columns() {
13522 let line = make_chunked_paragraph_line(
13523 &[
13524 ("Properties", 72.0, 145.0),
13525 ("Instruction", 180.0, 255.0),
13526 ("Alignment", 480.0, 545.0),
13527 ],
13528 720.0,
13529 732.0,
13530 );
13531 let chunk_lines = extract_chunk_lines(&line);
13532 let fragments = split_line_into_slot_fragments(
13533 &chunk_lines[0],
13534 &[
13535 (72.0, 170.0),
13536 (170.0, 280.0),
13537 (280.0, 380.0),
13538 (380.0, 480.0),
13539 (480.0, 600.0),
13540 (600.0, 720.0),
13541 (720.0, 850.0),
13542 ],
13543 );
13544
13545 assert_eq!(fragments.len(), 3);
13546 assert_eq!(fragments[0].slot_idx, 0);
13547 assert_eq!(fragments[0].text, "Properties");
13548 assert_eq!(fragments[1].slot_idx, 1);
13549 assert_eq!(fragments[1].text, "Instruction");
13550 assert_eq!(fragments[2].slot_idx, 4);
13551 assert_eq!(fragments[2].text, "Alignment");
13552 }
13553
13554 #[test]
13555 fn test_merge_tables_across_heading() {
13556 let input = "some text\n\n\
13557 | Area | Competence |\n\
13558 | --- | --- |\n\
13559 | Row1 | Val1 |\n\
13560 | Row2 | Val2 |\n\
13561 \n\
13562 # Heading Between\n\
13563 \n\
13564 | Row3 | Val3 |\n\
13565 | --- | --- |\n\
13566 \n\
13567 more text\n";
13568 let result = merge_adjacent_pipe_tables(input);
13569 assert!(
13571 result.contains("| Heading Between |"),
13572 "Heading should be in pipe row: {}",
13573 result
13574 );
13575 assert!(
13577 !result.contains("# Heading Between"),
13578 "Heading marker should be removed: {}",
13579 result
13580 );
13581 assert!(
13583 result.contains("| Row3 |") || result.contains("Row3"),
13584 "Row3 should exist: {}",
13585 result
13586 );
13587 }
13588
13589 #[test]
13590 fn test_merge_tables_does_not_cross_distinct_headers() {
13591 let input = "| Model | Score |\n\
13592 | --- | --- |\n\
13593 | A | 1 |\n\
13594 \n\
13595 Table 6: Performance comparison amongst the merge candidates.\n\
13596 \n\
13597 | Model | Method | Score |\n\
13598 | --- | --- | --- |\n\
13599 | B | Avg | 2 |\n";
13600 let result = merge_adjacent_pipe_tables(input);
13601
13602 assert!(result.contains("Table 6: Performance comparison amongst the merge candidates."));
13603 assert!(result.contains("| Model | Score |"));
13604 assert!(result.contains("| Model | Method | Score |"));
13605 assert!(
13606 !result.contains("| Table 6: Performance comparison amongst the merge candidates. |")
13607 );
13608 }
13609
13610 #[test]
13611 fn test_normalize_chart_like_markdown_extracts_series_tables() {
13612 let input = "Figure 1.7. Non-citizen population in Malaysia (in thousands) 3,323 3,500 3,288 3,230 3,140 2,907 3,000 2,693 2,500 2,000 1,500 1,000 500 0\n\n\
13613 2016 2017 2018 2019 2020 2021 Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.\n\n\
13614 ASEAN Migration Outlook 19\n";
13615
13616 let normalized = normalize_chart_like_markdown(input);
13617 assert!(
13618 normalized.contains("## Figure 1.7. Non-citizen population in Malaysia (in thousands)")
13619 );
13620 assert!(normalized.contains("| 2016 | 3,323 |"));
13621 assert!(normalized.contains("| 2021 | 2,693 |"));
13622 assert!(normalized.contains(
13623 "*Source: Department of Statistics, Malaysia (2022). Figure for 2021 is an estimate.*"
13624 ));
13625 assert!(!normalized.contains("ASEAN Migration Outlook 19"));
13626 }
13627
13628 #[test]
13629 fn test_normalize_chart_like_markdown_promotes_structural_captions() {
13630 let input = "Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or\n\n\
13631 The Wonderful Lamp.\n\n\
13632 Body paragraph.\n";
13633
13634 let normalized = normalize_chart_like_markdown(input);
13635 assert!(normalized.contains(
13636 "## Figure 5.1 Mr. Bologna Jun-r as Kalim Azack in Aladdin, or The Wonderful Lamp"
13637 ));
13638 assert!(normalized.contains("Body paragraph."));
13639 }
13640
13641 #[test]
13642 fn test_normalize_chart_like_markdown_reconstructs_header_pair_chart_table() {
13643 let input = "Figure 4.8. Domestic Wood Pellets Production\n\n\
13644 | 8 | 800 200 | 126 2014 | 120 2015 | 120 2016 | 127 2017 | 131 2018 | 147 2019 |\n\
13645 | --- | --- | --- | --- | --- | --- | --- | --- |\n\n\
13646 Source: Forestry Agency, Ministry of Agriculture, Forestry and Fishery (MAFF), 2020.\n";
13647
13648 let normalized = normalize_chart_like_markdown(input);
13649 assert!(normalized.contains("# Figure 4.8. Domestic Wood Pellets Production"));
13650 assert!(normalized.contains("| Year | Domestic Wood Pellets Production |"));
13651 assert!(normalized.contains("| 2014 | 126 |"));
13652 assert!(normalized.contains("| 2019 | 147 |"));
13653 assert!(!normalized.contains("| 8 | 800 200 |"));
13654 }
13655
13656 #[test]
13657 fn test_normalize_chart_like_markdown_drops_numeric_axis_artifact_table() {
13658 let input = "| 31 1 0 2 23 2 2 2 0 5 10 15 20 25 30 35 Event Celebration Information Videograph 2019 2020 |\n\
13659 | --- |\n\n\
13660 Distribution of Komnas HAM's YouTube Content (2019-2020)\n";
13661
13662 let normalized = normalize_chart_like_markdown(input);
13663 assert!(!normalized.contains("| --- |"));
13664 assert!(normalized.contains("Distribution of Komnas HAM's YouTube Content (2019-2020)"));
13665 }
13666
13667 #[test]
13668 fn test_normalize_chart_like_markdown_drops_url_fragment_table() {
13669 let input = "## Figure 6 DPN Argentina Content: World Health Day Celebration\n\n\
13670 | na/status/1379765916259483648 |\n\
13671 | --- |\n\n\
13672 98 DPN Argentina, accessed on 5 December 2021.\n";
13673
13674 let normalized = normalize_chart_like_markdown(input);
13675 assert!(!normalized.contains("/status/1379765916259483648 |"));
13676 assert!(normalized.contains("98 DPN Argentina, accessed on 5 December 2021."));
13677 }
13678
13679 #[test]
13680 fn test_normalize_chart_like_markdown_drops_sparse_table_before_caption() {
13681 let input = "What’s unique about the growth of Alligator Gars is their fast growth.\n\n\
13682 | in | cm | | Length | of | Gar | Fish | Age |\n\
13683 | --- | --- | --- | --- | --- | --- | --- | --- |\n\
13684 | 120) | 300 | | | | | | |\n\
13685 | 100+ | 250 | | | | | | |\n\
13686 | 80+ | 200 | | | | | | |\n\
13687 | 20. | 50 | G | | | | | Vi |\n\
13688 | 0 | 0 | | | | | | |\n\
13689 | | 0 | 10 | 30 | | 40 | 50 | 60 |\n\n\
13690 Figure 8.6: Growth in length of Alligator Gar in Texas.\n";
13691
13692 let normalized = normalize_chart_like_markdown(input);
13693 assert!(!normalized.contains("| in | cm |"));
13694 assert!(normalized.contains("Figure 8.6: Growth in length of Alligator Gar in Texas."));
13695 }
13696
13697 #[test]
13698 fn test_normalize_chart_like_markdown_trims_large_top_table_plate() {
13699 let input = "| A | B | C | D | E | F | G | H |\n\
13700 | --- | --- | --- | --- | --- | --- | --- | --- |\n\
13701 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13702 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13703 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13704 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13705 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13706 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13707 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\
13708 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |\n\n\
13709 Table 2: Evaluation results for SOLAR 10.7B and SOLAR 10.7B-Instruct along with other top-performing models in the paper.\n\n\
13710 # 4.2 Main Results\n\n\
13711 The surrounding prose should be dropped.\n";
13712
13713 let normalized = normalize_chart_like_markdown(input);
13714 assert!(normalized.starts_with("| A | B | C | D | E | F | G | H |"));
13715 assert!(!normalized.contains("Table 2:"));
13716 assert!(!normalized.contains("4.2 Main Results"));
13717 assert!(!normalized.contains("surrounding prose"));
13718 }
13719}